In [34]:

'''
Author: Damiano Pasquini
email: damiano23@ru.is
'''

# imports and configs
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
path = './dataset'

In [35]:
def read_data(path):
    """
    Reads the data from the csv file and returns a pandas dataframe
    :param path: path to the csv file
    :return: pandas dataframe
    """
    with open(path+"/api_trace.csv", 'r') as data_file:
        data = [str(line.split(',')) for line in data_file]
    with open(path+"/apt_trace_labels.txt", 'r') as label_file:
        labels = [line for line in label_file]
    return pd.DataFrame({'traces':data,'labels':labels})

In [36]:
def preprocess(df):
    """
    Preprocess the dataframe and returns the X and y for the training and test set
    :param df: pandas dataframe
    :return: X_train, y_train, X_test, y_test
    """
    df.drop_duplicates(subset=['traces'], inplace=True)
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(df['traces'][:int(0.7*len(df['traces']))])
    y_train = df['labels'][:int(0.7*len(df['labels']))]
    X_test = vectorizer.fit_transform(df['traces'][int(0.3*len(df['traces'])):])
    y_test = df['labels'][int(0.3*len(df['labels'])):]
    
    return X_train, y_train, X_test, y_test

In [37]:
preprocess(read_data(path))

(<4131x272 sparse matrix of type '<class 'numpy.int64'>'
 	with 177900 stored elements in Compressed Sparse Row format>,
 0          Dropper\n
 1            Worms\n
 2       Downloader\n
 3          Spyware\n
 4            Worms\n
             ...     
 4397       Dropper\n
 4398        Adware\n
 4399        Trojan\n
 4400      Backdoor\n
 4401        Trojan\n
 Name: labels, Length: 4131, dtype: object,
 <4132x271 sparse matrix of type '<class 'numpy.int64'>'
 	with 178259 stored elements in Compressed Sparse Row format>,
 1838       Spyware\n
 1839         Virus\n
 1840       Dropper\n
 1841      Backdoor\n
 1843       Spyware\n
             ...     
 6391      Backdoor\n
 6392       Spyware\n
 6393       Spyware\n
 6394    Downloader\n
 6395          Adware
 Name: labels, Length: 4132, dtype: object)