In [53]:

import numpy as np
import os
import scipy.spatial
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA as sklearnPCA
import random
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
import warnings

def isfloat(value):
    """Return True when ``float(value)`` succeeds, False otherwise.

    Both ``ValueError`` (unparseable text such as ``"abc"`` or ``""``) and
    ``TypeError`` (unsupported types such as ``None`` or a list) are treated
    as "not a float", so the predicate never raises for those inputs.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True


def convert_stringArrays_to_floatArray(array):
    """Convert the parseable entries of ``array`` to floats.

    Entries that ``float()`` cannot convert (empty strings, arbitrary text,
    ``None``) are skipped, so the result may be shorter than the input.
    The relative order of the kept entries is preserved.

    :param array: iterable of tokens, typically strings from ``str.split``.
    :return: list of floats.
    """
    floatArray = []
    for token in array:
        # Parse once: the attempt itself is the validity check.
        try:
            floatArray.append(float(token))
        except (TypeError, ValueError):
            continue
    return floatArray


def convert_stringArrays_to_intArray(array):
    """Convert the numeric entries of ``array`` to ints.

    Integer-looking tokens are converted exactly with ``int()``. Tokens that
    are only valid as floats (e.g. ``"4.7"``) are truncated toward zero via
    ``int(float(token))``. Anything else, including ``"nan"``, ``"inf"`` and
    ``None``, is skipped.

    :param array: iterable of tokens, typically strings from ``str.split``.
    :return: list of ints, in input order.
    """
    intArray = []
    for token in array:
        try:
            # Exact path first so large integer strings keep full precision.
            intArray.append(int(token))
            continue
        except (TypeError, ValueError):
            pass
        try:
            intArray.append(int(float(token)))
        except (TypeError, ValueError, OverflowError):
            # ValueError: not numeric or NaN; OverflowError: infinity.
            continue
    return intArray


def readFromFile(path):
    """Return the entire text content of the file at ``path`` as one string.

    The file is opened in text read mode and closed before returning.
    Splitting the content into tokens is left to the caller.
    """
    with open(path, 'r') as handle:
        return handle.read()


def createHistFromPkst(pkts, histSize=1515, ignoredSizes=(0, 40, 52),
                       maxOutOfRangeRatio=0.05):
    """Build a normalised packet-size histogram.

    Each packet size in ``pkts`` is truncated to an integer and counted in the
    bin with that index. Sizes listed in ``ignoredSizes`` are dropped entirely
    (presumably payload-free packets; confirm with the capture format).
    Sizes outside ``[0, histSize)``, including negative values and NaN, are
    counted as out of range instead of being binned. Negative sizes used to
    wrap around to the end of the histogram through negative indexing.

    :param pkts: iterable of numeric packet sizes.
    :param histSize: number of bins; valid sizes are ``0 <= size < histSize``.
    :param ignoredSizes: sizes that are skipped without being counted at all.
    :param maxOutOfRangeRatio: the sample is rejected when the share of
        out-of-range packets reaches this value.
    :return: list of ``histSize`` floats, or the string ``"error"`` when the
        sample is rejected (callers compare against that sentinel).
    """
    hist = [0] * histSize
    # The counter starts at 1, as it always has: this keeps the division
    # below defined for an empty sample, at the cost of a histogram that sums
    # to slightly less than 1.
    # NOTE(review): confirm whether the +1 is intentional.
    numberOfPkts = 1
    outOfRange = 0

    for size in pkts:
        if size in ignoredSizes:
            continue
        if 0 <= size < histSize:
            hist[int(size)] += 1
            numberOfPkts += 1
        else:
            outOfRange += 1

    if outOfRange / float(outOfRange + numberOfPkts) >= maxOutOfRangeRatio:
        return "error"

    total = float(numberOfPkts)
    return [count / total for count in hist]


def plot_features(X, y, labels):
    """Scatter-plot a 2-component PCA projection of ``X``, one colour per class.

    ``X`` is min-max scaled with its global minimum and maximum, projected to
    two dimensions with PCA, and the points whose entry in ``y`` equals ``k``
    are drawn with ``labels[k]`` as legend text. The figure is shown with
    both axes fixed to ``[-1, 1]``.

    :param X: feature matrix (assumed samples x features, as PCA requires).
    :param y: class index per sample; compared against ``range(len(labels))``.
    :param labels: legend text, indexed by class index.
    """
    X = np.asarray(X, dtype=float)
    # A NumPy array is required so that ``y == k`` yields a boolean mask.
    y = np.asarray(y)

    low = X.min()
    span = X.max() - low
    # A constant matrix has zero span; map it to zeros instead of dividing by 0.
    X_norm = (X - low) / span if span != 0 else np.zeros_like(X)

    # Fit the projection once (it was previously fitted twice, with one result unused).
    points = sklearnPCA(n_components=2).fit_transform(X_norm)

    colors = ["red", "blue", "yellow", "pink", "lightgreen"]
    for k in range(len(labels)):
        mask = y == k
        # Cycle through the palette so more than five classes do not raise IndexError.
        color = colors[k % len(colors)]
        plt.scatter(points[mask, 0], points[mask, 1], label=labels[k], c=color, edgecolors=color)
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.legend()
    plt.show()

def _predict_labels(clf, samples):
    """Return the predicted label of every sample using one batched ``clf.predict`` call."""
    if len(samples) == 0:
        return []
    # Each sample is flattened to one row, matching the former reshape(1, -1).
    batch = np.array([np.asarray(sample).reshape(-1) for sample in samples])
    return list(clf.predict(batch))


def computerAccuracy(clf, otherTest, bitcoin_test):
    """Return the fraction of correctly classified test samples.

    A sample from ``otherTest`` counts as correct when its predicted label is
    one of 0, 2, 3 or 4; a sample from ``bitcoin_test`` counts as correct when
    its predicted label is 1. The predicted labels of ``bitcoin_test`` are
    also printed, ten per line, as a quick visual check.

    :param clf: fitted classifier exposing ``predict``.
    :param otherTest: sequence of feature vectors of the non-bitcoin classes.
    :param bitcoin_test: sequence of feature vectors of the bitcoin class.
    :return: accuracy in ``[0, 1]``; ``0.0`` when both test sets are empty.
    """
    other_labels = _predict_labels(clf, otherTest)
    bitcoin_labels = _predict_labels(clf, bitcoin_test)

    for start in range(0, len(bitcoin_labels), 10):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(" ".join(str(label) for label in bitcoin_labels[start:start + 10]))

    total = len(other_labels) + len(bitcoin_labels)
    if total == 0:
        return 0.0

    correct = sum(1 for label in other_labels if label in (0, 2, 3, 4))
    correct += sum(1 for label in bitcoin_labels if label == 1)
    return correct / float(total)

def computeRecall(clf, bitcoin_test):
    """Return the share of ``bitcoin_test`` samples predicted as label 1.

    :return: recall in ``[0, 1]``; ``0.0`` when ``bitcoin_test`` is empty.
    """
    labels = _predict_labels(clf, bitcoin_test)
    if not labels:
        return 0.0
    truePos = sum(1 for label in labels if label == 1)
    return truePos / float(len(labels))

def computePrecision(clf, otherTest, bitcoin_test):
    """Return the share of label-1 predictions that come from ``bitcoin_test``.

    True positives are ``bitcoin_test`` samples predicted as 1; false
    positives are ``otherTest`` samples predicted as 1.

    :return: precision in ``[0, 1]``; ``0.0`` when nothing is predicted as 1.
    """
    truePositive = sum(1 for label in _predict_labels(clf, bitcoin_test) if label == 1)
    falsePositive = sum(1 for label in _predict_labels(clf, otherTest) if label == 1)
    predictedPositive = truePositive + falsePositive
    if predictedPositive == 0:
        return 0.0
    return truePositive / float(predictedPositive)


def normalzieVector(X):
    """Scale every vector in ``X`` to unit L2 norm.

    The earlier version passed ``x.reshape(-1, 1)`` to
    ``preprocessing.normalize``, which treats every element as its own
    one-feature sample and maps each non-zero value to +/-1, binarising the
    vector instead of normalising it. Each vector is now divided by its own
    Euclidean norm. All-zero vectors are returned unchanged, which matches
    how ``preprocessing.normalize`` handles zero-norm rows.

    :param X: iterable of array-likes; each one is flattened to 1-D.
    :return: list of 1-D float ``numpy`` arrays, one per input vector.
    """
    X_norm = []
    for x in X:
        row = np.asarray(x, dtype=float).reshape(-1)
        norm = np.linalg.norm(row)
        X_norm.append(row / norm if norm > 0 else row.copy())
    return X_norm

def readCombinedFeature(directory, minFeatures=1500):
    """Load one feature vector per file found in ``directory``.

    Each file is read as space-separated text and its numeric tokens become
    one float vector. Files yielding fewer than ``minFeatures`` values are
    reported by printing their path and then skipped.

    File names are processed in sorted order so that the row order of the
    result is reproducible (``os.listdir`` returns entries in arbitrary
    order). Entries that are not regular files are ignored.

    :param directory: directory containing the feature files.
    :param minFeatures: minimum number of numeric values a file must provide.
    :return: ``numpy`` array with one entry per accepted file.
    """
    dataset = []
    for name in sorted(os.listdir(directory)):
        # os.path.join works with or without a trailing separator on ``directory``.
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        sample = convert_stringArrays_to_floatArray(readFromFile(path).split(" "))
        if len(sample) < minFeatures:
            print(path)
            continue
        dataset.append(np.array(sample))
    return np.array(dataset)


def readPacketHist(path, maxFiles=800):
    """Build packet-size histograms from the files in directory ``path``.

    At most ``maxFiles`` files are read, taken in sorted name order so the
    selected subset is reproducible (``os.listdir`` order is arbitrary).
    Each file is parsed as space-separated packet sizes and converted with
    ``createHistFromPkst``; files rejected by it (the ``"error"`` sentinel)
    are left out. Entries that are not regular files are ignored.

    :param path: directory containing the packet-size files.
    :param maxFiles: upper bound on the number of files read.
    :return: list of 1-D ``numpy`` arrays, one histogram per accepted file.
    """
    # os.path.join works with or without a trailing separator on ``path``.
    candidates = [os.path.join(path, name) for name in sorted(os.listdir(path))]
    files = [candidate for candidate in candidates if os.path.isfile(candidate)][0:maxFiles]

    X_data = []
    for filePath in files:
        packets = readFromFile(filePath).split(" ")
        pkts = convert_stringArrays_to_floatArray(packets)
        hist = createHistFromPkst(pkts)
        if hist != "error":
            X_data.append(np.array(hist))

    return X_data

def buil_db(array, value):
    """Pair every element of ``array`` with the same target ``value``.

    :param array: indexable sequence of samples.
    :param value: label assigned to every sample.
    :return: tuple ``(samples, targets)`` of two lists of equal length.
    """
    count = len(array)
    samples = [array[idx] for idx in range(count)]
    targets = [value] * count
    return samples, targets





In [48]:

def svm_feature_names(numberOfTraining, dataset_all, names, numberOfTest=100):
    """Train a random-forest "bitcoin vs. everything else" classifier.

    For every class ``names[i]`` the first ``numberOfTest`` samples of
    ``dataset_all[i]`` are held out and the following ``numberOfTraining``
    samples are used for training. Training samples are labelled 1 when the
    class name is ``'bitcoin'`` and 0 otherwise. The label comes from the
    name itself, so it stays correct if ``names`` is reordered or extended.
    The held-out samples of all non-bitcoin classes are returned as the
    negative test set.

    :param numberOfTraining: training samples taken per class.
    :param dataset_all: list of per-class sample lists, parallel to ``names``.
    :param names: class names; ``'bitcoin'`` marks the positive class.
    :param numberOfTest: held-out samples per class (taken from the front).
    :return: tuple ``(clf, other_test_norm)`` of the fitted classifier and
        the normalised held-out samples of the non-bitcoin classes.
    """
    X = []
    y = []
    other_test = []
    for i, name in enumerate(names):
        samples = dataset_all[i]
        label = 1 if name == 'bitcoin' else 0
        class_x, class_y = buil_db(array=samples[numberOfTest:numberOfTraining + numberOfTest],
                                   value=label)
        X.extend(class_x)
        y.extend(class_y)
        if name != 'bitcoin':
            other_test.extend(samples[0:numberOfTest])

    y = np.array(y)
    X_norm = normalzieVector(np.array(X))
    other_test_norm = normalzieVector(other_test)

    # Fixed random_state keeps the trained forest reproducible between runs.
    clf = RandomForestClassifier(max_depth=100, n_estimators=100,
                                 random_state=0)
    clf.fit(X_norm, y)

    return clf, other_test_norm

In [55]:
# Load the packet-size histograms of every traffic class; dataset_all[i]
# holds the samples of names[i].
HIST_DIR = '/home/fatemeh/Bitcoin/nov 13/hist/'
names = ['http', 'bitcoin', "bittorent", "voip"]
dataset_all = []
# The loop variable is no longer called ``n``, which a later cell reuses as
# the training-set size.
for traffic_name in names:
    print(traffic_name)  # progress marker; single-argument form works on Python 2 and 3
    # The trailing "/" is kept because readPacketHist may concatenate file names onto it.
    dataset = readPacketHist(path=HIST_DIR + traffic_name + "/")
    dataset_all.append(dataset)

http
bitcoin
bittorent
voip


In [None]:
# Train on ``n`` samples per traffic class; ``otherTest`` receives the
# held-out non-bitcoin samples.
n = 10
clf, otherTest = svm_feature_names(
    numberOfTraining=n,
    dataset_all=dataset_all,
    names=names,
)

In [51]:
# Positive test set: up to 100 normalised histograms of noisy bitcoin traffic.
# Alternative dataset directory: noisyBitMultipleTab/2/
data = readPacketHist(path='/home/fatemeh/Bitcoin/nov 13/hist/noisyBitcoin/')
bitcoin_test = normalzieVector(data[0:100])

In [None]:
accuracy = computerAccuracy(clf, otherTest, bitcoin_test)
recall = computeRecall(clf, bitcoin_test)
precision = computePrecision(clf, otherTest, bitcoin_test)
# The old output was labelled "F1" but F1 was never computed. It is the
# harmonic mean of precision and recall, defined as 0 when both are 0.
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
print("accuracy=%s recall=%s precision=%s F1=%s" % (accuracy, recall, precision, f1))