# Laboratorio 4
## Clasificación de malware
Detección de malware utilizando métodos de ML
> #### Gerardo Méndez 18239

In [164]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [165]:
## cargamos el set de datos
df = pd.read_csv('datasets/virus_sample.csv')

### Parte 1 - Ingeniería de Características

#### Exploración de datos

In [166]:
## mostramos los datos
df.head(10)

Unnamed: 0,file,api,class
0,7ff49f2f0912352416b05c010f35f402cc79feed,"IntersectRect,GetCurrentProcess,GetVersion",Virus
1,50cc6c99ec285d0db45dde07d8fdc18d9098c5b6,"GetCaretBlinkTime,CountClipboardFormats,GetCon...",Virus
2,f77c6bd4aebacd1a01d02e0cb20642ebf2d32929,"VarR8Pow,GetClipboardViewer,GetInputDesktop,Ge...",Virus
3,349c367c5b88fbb6cafae5d7109588d7250e16b5,"SetTraceCallback,CopyAcceleratorTableW,GetProc...",Virus
4,021f4aa86b520e1d606ab26699c35546bcd00c27,"SHLoadNonloadedIconOverlayIdentifiers,VarUI8Fr...",Virus
5,124491349667be19c61a5f938f139eb326b9223b,"NtReplyWaitReceivePortEx,NtSetInformationJobOb...",Virus
6,6aea319c23473d1b837972a064bc20d79404c859,"SHGetNewLinkInfoA,GetSystemDefaultUILanguage,L...",Virus
7,7682fa3c7f4ecbd9df428630778b6662ea172e71,"CryptGenRandom,GetDoubleClickTime,GetDialogBas...",Virus
8,1c0d53d5ed17b9f68f6c1c2fc034e766380a1665,"GetVersion,GetTapePosition,GetLastError,SetMen...",Virus
9,2d25cdbd7b6ea9a03b270525d13eda302be3edc1,"VarUI8FromStr,SetVarConversionLocaleSetting,Nt...",Virus


In [167]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9795 entries, 0 to 9794
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   file    9795 non-null   object
 1   api     9795 non-null   object
 2   class   9795 non-null   object
dtypes: object(3)
memory usage: 229.7+ KB


In [168]:
df['class'].value_counts()

Trojan        6153
Virus         2367
Backdoor       447
Worms          441
Adware         222
Agent          102
Downloader      31
Spyware         11
Ransomware      10
Riskware         4
Dropper          4
Crypt            2
Keylogger        1
Name: class, dtype: int64

In [169]:
df.drop(df.loc[df['class'] == 'Downloader'].index, inplace=True)
df.drop(df.loc[df['class'] == 'Spyware'].index, inplace=True)
df.drop(df.loc[df['class'] == 'Ransomware'].index, inplace=True)
df.drop(df.loc[df['class'] == 'Riskware'].index, inplace=True)
df.drop(df.loc[df['class'] == 'Dropper'].index, inplace=True)
df.drop(df.loc[df['class'] == 'Crypt'].index, inplace=True)
df.drop(df.loc[df['class'] == 'Keylogger'].index, inplace=True)

In [170]:
df['class'].value_counts()

Trojan      6153
Virus       2367
Backdoor     447
Worms        441
Adware       222
Agent        102
Name: class, dtype: int64

In [171]:
mapping = {
    "Trojan": 0,
    "Virus": 1,
    "Backdoor": 2,
    "Worms": 3,
    "Adware": 4,
    "Agent": 5
}

In [172]:
df['class'] = df.apply(lambda row: mapping[row['class']], axis=1)

In [173]:
df['class'].value_counts()

0    6153
1    2367
2     447
3     441
4     222
5     102
Name: class, dtype: int64

In [174]:
df

Unnamed: 0,file,api,class
0,7ff49f2f0912352416b05c010f35f402cc79feed,"IntersectRect,GetCurrentProcess,GetVersion",1
1,50cc6c99ec285d0db45dde07d8fdc18d9098c5b6,"GetCaretBlinkTime,CountClipboardFormats,GetCon...",1
2,f77c6bd4aebacd1a01d02e0cb20642ebf2d32929,"VarR8Pow,GetClipboardViewer,GetInputDesktop,Ge...",1
3,349c367c5b88fbb6cafae5d7109588d7250e16b5,"SetTraceCallback,CopyAcceleratorTableW,GetProc...",1
4,021f4aa86b520e1d606ab26699c35546bcd00c27,"SHLoadNonloadedIconOverlayIdentifiers,VarUI8Fr...",1
...,...,...,...
9790,3b91b43ad8a25a8e9e52f938473f5bb05fb4d530ff4b22...,"NtResetWriteWatch,GetClipboardViewer,GetConsol...",0
9791,54315d591d2855fc794ba5f61012efdd92b489576e4aa5...,"RtlpNtEnumerateSubKey,NtFlushWriteBuffer,ReadC...",0
9792,dcb7507fa10537d2e3da2ea22ec6346e02a0926ad64d76...,"ZwNotifyChangeDirectoryFile,GetCursor,GetConso...",0
9793,cf441d0a2de25c6207fc782e190ce5302171d5e28ce41b...,"ILSaveToStream,OleQueryLinkFromData,VarUdateFr...",0


#### Preprocesamiento

In [175]:
from sklearn.preprocessing import MultiLabelBinarizer

In [176]:
wnl = WordNetLemmatizer()

def preprocessing(body):
    lower = body.lower().strip()
    
    ## generamos los tokens sin stop words
    tokens = nltk.word_tokenize(lower)
    
    ## lematizar palabras
    lemma = [wnl.lemmatize(token) for token in tokens]
    
    return lemma

In [177]:
df['tokens'] = df.apply(lambda row: preprocessing(row['api']), axis=1)

In [178]:
df

Unnamed: 0,file,api,class,tokens
0,7ff49f2f0912352416b05c010f35f402cc79feed,"IntersectRect,GetCurrentProcess,GetVersion",1,"[intersectrect, ,, getcurrentprocess, ,, getve..."
1,50cc6c99ec285d0db45dde07d8fdc18d9098c5b6,"GetCaretBlinkTime,CountClipboardFormats,GetCon...",1,"[getcaretblinktime, ,, countclipboardformats, ..."
2,f77c6bd4aebacd1a01d02e0cb20642ebf2d32929,"VarR8Pow,GetClipboardViewer,GetInputDesktop,Ge...",1,"[varr8pow, ,, getclipboardviewer, ,, getinputd..."
3,349c367c5b88fbb6cafae5d7109588d7250e16b5,"SetTraceCallback,CopyAcceleratorTableW,GetProc...",1,"[settracecallback, ,, copyacceleratortablew, ,..."
4,021f4aa86b520e1d606ab26699c35546bcd00c27,"SHLoadNonloadedIconOverlayIdentifiers,VarUI8Fr...",1,"[shloadnonloadediconoverlayidentifiers, ,, var..."
...,...,...,...,...
9790,3b91b43ad8a25a8e9e52f938473f5bb05fb4d530ff4b22...,"NtResetWriteWatch,GetClipboardViewer,GetConsol...",0,"[ntresetwritewatch, ,, getclipboardviewer, ,, ..."
9791,54315d591d2855fc794ba5f61012efdd92b489576e4aa5...,"RtlpNtEnumerateSubKey,NtFlushWriteBuffer,ReadC...",0,"[rtlpntenumeratesubkey, ,, ntflushwritebuffer,..."
9792,dcb7507fa10537d2e3da2ea22ec6346e02a0926ad64d76...,"ZwNotifyChangeDirectoryFile,GetCursor,GetConso...",0,"[zwnotifychangedirectoryfile, ,, getcursor, ,,..."
9793,cf441d0a2de25c6207fc782e190ce5302171d5e28ce41b...,"ILSaveToStream,OleQueryLinkFromData,VarUdateFr...",0,"[ilsavetostream, ,, olequerylinkfromdata, ,, v..."


In [179]:
mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df['tokens']),columns=mlb.classes_))

df

Unnamed: 0,file,api,class,tokens,$,",",0,0_container_base12,0_locinfo,0_lockit,...,zwunloadkeyex,zwunlockfile,zwunmapviewofsection,zwvdmcontrol,zwwaitfordebugevent,zwwaitformultipleobjects,zwwaitforsingleobject,zwwritefile,zwwritefilegather,zwwriterequestdata
0,7ff49f2f0912352416b05c010f35f402cc79feed,"IntersectRect,GetCurrentProcess,GetVersion",1,"[intersectrect, ,, getcurrentprocess, ,, getve...",0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,50cc6c99ec285d0db45dde07d8fdc18d9098c5b6,"GetCaretBlinkTime,CountClipboardFormats,GetCon...",1,"[getcaretblinktime, ,, countclipboardformats, ...",0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,f77c6bd4aebacd1a01d02e0cb20642ebf2d32929,"VarR8Pow,GetClipboardViewer,GetInputDesktop,Ge...",1,"[varr8pow, ,, getclipboardviewer, ,, getinputd...",0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,349c367c5b88fbb6cafae5d7109588d7250e16b5,"SetTraceCallback,CopyAcceleratorTableW,GetProc...",1,"[settracecallback, ,, copyacceleratortablew, ,...",0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,021f4aa86b520e1d606ab26699c35546bcd00c27,"SHLoadNonloadedIconOverlayIdentifiers,VarUI8Fr...",1,"[shloadnonloadediconoverlayidentifiers, ,, var...",0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9790,3b91b43ad8a25a8e9e52f938473f5bb05fb4d530ff4b22...,"NtResetWriteWatch,GetClipboardViewer,GetConsol...",0,"[ntresetwritewatch, ,, getclipboardviewer, ,, ...",,,,,,,...,,,,,,,,,,
9791,54315d591d2855fc794ba5f61012efdd92b489576e4aa5...,"RtlpNtEnumerateSubKey,NtFlushWriteBuffer,ReadC...",0,"[rtlpntenumeratesubkey, ,, ntflushwritebuffer,...",,,,,,,...,,,,,,,,,,
9792,dcb7507fa10537d2e3da2ea22ec6346e02a0926ad64d76...,"ZwNotifyChangeDirectoryFile,GetCursor,GetConso...",0,"[zwnotifychangedirectoryfile, ,, getcursor, ,,...",,,,,,,...,,,,,,,,,,
9793,cf441d0a2de25c6207fc782e190ce5302171d5e28ce41b...,"ILSaveToStream,OleQueryLinkFromData,VarUdateFr...",0,"[ilsavetostream, ,, olequerylinkfromdata, ,, v...",,,,,,,...,,,,,,,,,,


#### Selección de características

In [180]:
label = 'class'

In [181]:
columns = ['file', 'api', 'class', 'tokens']

In [182]:
df = df.dropna()

In [183]:
X = df.drop(columns=columns, axis=1)

In [184]:
y = df[label]

In [185]:
y

0       1
1       1
2       1
3       1
4       1
       ..
9727    1
9728    1
9729    1
9730    1
9731    1
Name: class, Length: 9669, dtype: int64

#### División de los datos

In [199]:
import sklearn
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

In [187]:
test_ratio = 0.3

In [188]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio)

### Modelos
*K-Fold referenciado de: https://machinelearningmastery.com/how-to-configure-k-fold-cross-validation/*

**Modelo 1**

In [189]:
from sklearn.neighbors import KNeighborsClassifier

In [190]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [191]:
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      1821
           1       0.94      0.95      0.94       675
           2       0.80      0.73      0.76       155
           3       0.75      0.44      0.55       141
           4       0.68      0.56      0.61        72
           5       0.50      0.03      0.05        37

    accuracy                           0.91      2901
   macro avg       0.76      0.61      0.64      2901
weighted avg       0.90      0.91      0.90      2901



In [200]:
kf = KFold(n_splits=10)

In [201]:
results = cross_val_score(knn, X, y, scoring='accuracy', cv=kf, n_jobs=-1)

In [202]:
results

array([0.94932782, 0.86452947, 0.94725957, 0.73733195, 0.98138573,
       0.97931748, 0.92140641, 0.9524302 , 0.91520165, 0.59834369])

In [204]:
print('Accuracy: %.3f (%.3f)' % (np.mean(results), np.std(results)))

Accuracy: 0.885 (0.117)


#### Modelo 2

In [192]:
from sklearn import tree

depth = 3
sample_split = 40

In [193]:
tre = tree.DecisionTreeClassifier(
    max_depth = depth,
    min_samples_split = sample_split
)

tre.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, min_samples_split=40)

In [194]:
tre_pred = tre.predict(X_test)
print(classification_report(y_test, tre_pred))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90      1821
           1       0.84      0.94      0.89       675
           2       0.59      0.74      0.65       155
           3       0.66      0.38      0.48       141
           4       0.76      0.39      0.51        72
           5       0.00      0.00      0.00        37

    accuracy                           0.85      2901
   macro avg       0.62      0.56      0.57      2901
weighted avg       0.84      0.85      0.84      2901



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [205]:
kf = KFold(n_splits=10)

In [206]:
results = cross_val_score(tre, X, y, scoring='accuracy', cv=kf, n_jobs=-1)

In [207]:
results

array([0.89762151, 0.85832472, 0.94002068, 0.65356774, 0.90382627,
       0.88831437, 0.91623578, 0.89348501, 0.83143744, 0.40269151])

In [208]:
print('Accuracy: %.3f (%.3f)' % (np.mean(results), np.std(results)))

Accuracy: 0.819 (0.158)


Para ambos modelos, las métricas de recall, precision y f1-score dependen del tipo de malware que sean. Logran detectar mejor los tipos con mayor cantidad de muestras en el set de datos.

### Conclusion

¿Se lograron obtener mejores métricas que las obtenidas en el 
artículo para la clasificación de malware?
> No se lograron obtener mejores métricas, los valores que se presentan en el artículo siguen siendo más altos.