In [56]:

'''
Author: Damiano Pasquini
email: damiano23@ru.is
'''

# imports and configs
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.metrics import accuracy_score

# Dataset directory: read_data() expects api_trace.csv and apt_trace_labels.txt inside it.
path = './dataset'

In [57]:
def read_data(path):
    """
    Reads the API traces and their labels and returns them as a pandas dataframe.

    Line i of ``api_trace.csv`` is paired with line i of ``apt_trace_labels.txt``.
    Line terminators are stripped from both files, so a final line without a
    trailing newline yields the same trace/label value as every other line.

    :param path: directory containing ``api_trace.csv`` and ``apt_trace_labels.txt``
                 (a string or a path-like object)
    :return: pandas dataframe with the columns 'traces' and 'labels'
    :raises ValueError: if the two files do not contain the same number of lines
    """
    with open(f"{path}/api_trace.csv", 'r') as data_file:
        # Each trace is stored as the string form of its comma-split token list;
        # this is the text format the rest of the notebook vectorises.
        data = [str(line.rstrip('\r\n').split(',')) for line in data_file]
    with open(f"{path}/apt_trace_labels.txt", 'r') as label_file:
        # Strip whitespace so that "label\n" and "label" are the same class.
        labels = [line.strip() for line in label_file]
    if len(data) != len(labels):
        raise ValueError(
            f"api_trace.csv has {len(data)} lines but "
            f"apt_trace_labels.txt has {len(labels)} lines"
        )
    return pd.DataFrame({'traces': data, 'labels': labels})

In [63]:
def preprocess(df, test_size=0.3, random_state=42):
    """
    Deduplicates the traces, splits them into a training and a test set and
    turns the trace strings into token-count features.

    The input dataframe is not modified: duplicates are dropped on a new frame.

    :param df: pandas dataframe with the columns 'traces' and 'labels'
    :param test_size: share of the samples held out for the test set
    :param random_state: seed passed to train_test_split for a reproducible split
    :return: X_train, X_test, y_train, y_test
    """
    # Non-in-place drop so re-running the cell (or reusing df) sees the original data.
    unique_df = df.drop_duplicates(subset=['traces'])
    X_train, X_test, y_train, y_test = train_test_split(
        unique_df['traces'],
        unique_df['labels'],
        test_size=test_size,
        random_state=random_state,
    )
    vectorizer = CountVectorizer()
    # The vocabulary is learned from the training traces only and then applied
    # unchanged to the test traces.
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    return X_train, X_test, y_train, y_test

In [59]:
def train_classifier(X_train, X_test, y_train, y_test, classifier = None):
    """
    Trains the classifier on the training set and returns its accuracy on the test set
    :param X_train: training set
    :param X_test: test set
    :param y_train: training labels
    :param y_test: test labels
    :param classifier: classifier to use; it is fitted in place. When omitted,
                       a new MultinomialNB is created for this call.
    :return: accuracy score
    """
    # A default instance in the signature would be created once and shared
    # (and re-fitted) across every call, so build it per call instead.
    if classifier is None:
        classifier = MultinomialNB()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [62]:
# Load and vectorise the dataset, then report the test accuracy of a k-NN classifier.
X_train, X_test, y_train, y_test = preprocess(
    read_data(path)
)
train_classifier(
    X_train,
    X_test,
    y_train,
    y_test,
    classifier=KNeighborsClassifier(),
)

0.45300592718035565