##IMPORT LIBRARIES & MODELS

In [1]:
import random
import pandas as pd
import numpy as np
import sklearn.metrics 
import re

from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, GridSearchCV
from sklearn.feature_extraction.text import *
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.utils.multiclass import unique_labels

##PERSONAL FUNCTIONS

In [2]:
def extract_instr (instr):
  pattern = r"'[a-zA-z]{1,} "
  holder = re.findall(pattern,instr)

  for i in range(len(holder)):
    holder[i] = re.sub("\W","",holder[i])
  
  #print("This is temporal holder:")
  #print(holder)

  res = ""

  for i in range(len(holder)):
    if (i == len(holder) - 1):
      res = res + holder[i] 
    else :
      res = res + holder[i] + " "
  
  #print("This is result of extraction: "+ res)
  return res

def print_df (df, i):
  res = str(i) + " " + str(df.loc[i, "id"]) + " " + df.loc[i, "semantic"] + " " + df.loc[i, "lista_asm"]
  return res

def cmp_model_pred (model1, model2):
    c = 0

    cont = {
    "encryption vs math": 0,
    "encryption vs sort": 0,
    "math vs sort": 0,
    "encryption vs string": 0,
    "math vs string": 0,
    "sort vs string": 0,
    }

    res = []

    for i in range(len(model1)):
      if (model1[i] != model2[i]):
        c += 1
        if (((model1[i] == "encryption") and (model2[i] == "math")) or ((model1[i] == "math") and (model2[i] == "encryption"))):
          cont["encryption vs math"] += 1
        elif (((model1[i] == "encryption") and (model2[i] == "sort")) or ((model1[i] == "sort") and (model2[i] == "encryption"))):
          cont["encryption vs sort"] += 1
        elif (((model1[i] == "math") and (model2[i] == "sort")) or ((model1[i] == "sort") and (model2[i] == "math"))):
          cont["math vs sort"] += 1
        elif (((model1[i] == "encryption") and (model2[i] == "string")) or ((model1[i] == "string") and (model2[i] == "encryption"))):
          cont["encryption vs string"] += 1
        elif (((model1[i] == "math") and (model2[i] == "string")) or ((model1[i] == "string") and (model2[i] == "math"))):
          cont["math vs string"] += 1
        elif (((model1[i] == "sort") and (model2[i] == "string")) or ((model1[i] == "string") and (model2[i] == "sort"))):
          cont["sort vs string"] += 1

    res.append(str(c))
    res.append(cont)

    return res

##DATASET ACQUISITION

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!ls "/content/drive/MyDrive/ColabNotebooks/ML/HW1/noduplicatedataset.json"

/content/drive/MyDrive/ColabNotebooks/ML/HW1/noduplicatedataset.json


In [5]:
dataset_path = '/content/drive/MyDrive/ColabNotebooks/ML/HW1/noduplicatedataset.json'
testset_path = '/content/drive/MyDrive/ColabNotebooks/ML/HW1/blindtest.json'

In [6]:
df = pd.read_json(dataset_path, lines=True)

##PRE-PROCESSING

In [7]:
id = random.randrange(0,len(df.id))
print('%d %d %s %s' %(id,df.loc[id, "id"],df.loc[id, "semantic"],df.loc[id, "lista_asm"]))

2067 13346 string ['jmp qword ptr [rip + 0x21f75a]', 'jmp qword ptr [rip + 0x21f70a]', 'jmp qword ptr [rip + 0x21f702]', 'jmp qword ptr [rip + 0x21f6f2]', 'test rdi, rdi', 'je 0x174', 'push r15', 'push r14', 'push r13', 'push r12', 'push rbp', 'push rbx', 'sub rsp, 0x18', 'mov rbp, qword ptr [rdi + 8]', 'test rbp, rbp', 'je 0x127', 'mov r12d, dword ptr [rdi + 4]', 'test r12d, r12d', 'js 0x10c', 'mov ecx, dword ptr [rdi]', 'cmp r12d, ecx', 'setg dl', 'test ecx, ecx', 'setle al', 'or dl, al', 'jne 0xff', 'test esi, esi', 'jle 0xea', 'xor eax, eax', 'cmp ecx, esi', 'jg 0x98', 'cmp esi, 7', 'mov ebx, 8', 'jg 0xa0', 'xor eax, eax', 'cmp ecx, ebx', 'jge 0x80', 'lea eax, [rcx*8]', 'lea edx, [r12*8]', 'mov r13d, esi', 'mov r15, rdi', 'movsxd rsi, ebx', 'sub eax, ecx', 'cmp eax, edx', 'jl 0x54', 'mov rdi, rsi', 'mov qword ptr [rsp + 8], rsi', 'call 0xffffffffffff3806', 'test rax, rax', 'mov r14, rax', 'mov rsi, qword ptr [rsp + 8]', 'je 0x29', 'test r12d, r12d', 'jne 0x9c', 'mov rdi, rbp', 'cal

In [8]:
print(len(df))

6073


In [9]:
for i in range(len(df)): 
  df.lista_asm[i] = extract_instr(df.lista_asm[i])

print("Done!")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Done!


##VECTORIZATION OF "LISTA_ASM"

In [10]:
vectorizer_type = "tfid" # "hashing", "count" or "tfid"

if vectorizer_type == "hashing":
  vectorizer = HashingVectorizer(stop_words='english') # multivariate
elif vectorizer_type == "count":
  vectorizer = CountVectorizer(stop_words='english') # multinomial
elif vectorizer_type == "tfid":
  vectorizer = TfidfVectorizer(ngram_range=(1,5))

X_all = vectorizer.fit_transform(df.lista_asm)
y_all = df.semantic

print(X_all.shape)
print(y_all.shape)

(6073, 138510)
(6073,)


##DATASET SPLIT


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.333,random_state=117)

##MODELS CREATION

In [12]:
model = svm.SVC(kernel='linear', C=1, class_weight='balanced')

In [13]:
model2 = tree.DecisionTreeClassifier()

In [14]:
model21 = tree.DecisionTreeClassifier(max_depth=5, class_weight='balanced')

In [15]:
model3 = LogisticRegression()

In [16]:
model31 = SGDClassifier()

##MODELS FIT

In [17]:
model.fit(X_train, y_train)

SVC(C=1, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [18]:
model2.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [19]:
model21.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [20]:
model3.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
model31.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

##RUN PREDICTIONS

In [22]:
y_pred = model.predict(X_test)

In [23]:
y_pred2 = model2.predict(X_test)

In [24]:
y_pred21 = model21.predict(X_test)

In [25]:
y_pred3 = model3.predict(X_test)

In [26]:
y_pred31 = model3.predict(X_test)

##MODEL EVALUATION
*  SVM *(model 1)*
*  DecTree(std) *(model 2)*
*  DecTree(limited) *(model 2.1)*
*  LogisticRegression *(model 3)*
*  SGDClassifier *(model 3.1)*

In [27]:
acc = model.score(X_test, y_test)    
print("Accuracy of SVM Model: %.3f" %acc)

Accuracy of SVM Model: 0.991


In [28]:
acc2 = model2.score(X_test, y_test)    
print("Accuracy of DecTree(std) Model: %.3f" %acc2)
print()
acc21 = model21.score(X_test, y_test)    
print("Accuracy of DecTree(limited) Model: %.3f" %acc21)

Accuracy of DecTree(std) Model: 0.973

Accuracy of DecTree(limited) Model: 0.931


In [29]:
acc3 = model3.score(X_test, y_test)    
print("Accuracy of LogisticRegression Model: %.3f" %acc3)
print()
acc31 = model31.score(X_test, y_test)    
print("Accuracy of SGDClassifier Model: %.3f" %acc31)

Accuracy of LogisticRegression Model: 0.987

Accuracy of SGDClassifier Model: 0.993


In [30]:
print("Summary of of SVM Model")
print()
print(classification_report(y_test, y_pred, digits=3))

Summary of of SVM Model

              precision    recall  f1-score   support

  encryption      0.989     0.992     0.990       362
        math      0.996     0.991     0.994       799
        sort      0.949     0.971     0.960       172
      string      0.996     0.994     0.995       690

    accuracy                          0.991      2023
   macro avg      0.982     0.987     0.985      2023
weighted avg      0.991     0.991     0.991      2023



In [31]:
print("Summary of DecTree(std) Model")
print()
print(classification_report(y_test, y_pred2, digits=3))
print()
print("Summary of DecTree(limited) Model")
print()
print(classification_report(y_test, y_pred21, digits=3))

Summary of DecTree(std) Model

              precision    recall  f1-score   support

  encryption      0.980     0.959     0.969       362
        math      0.984     0.992     0.988       799
        sort      0.897     0.860     0.878       172
      string      0.976     0.987     0.981       690

    accuracy                          0.973      2023
   macro avg      0.959     0.950     0.954      2023
weighted avg      0.973     0.973     0.973      2023


Summary of DecTree(limited) Model

              precision    recall  f1-score   support

  encryption      0.929     0.906     0.917       362
        math      1.000     0.954     0.976       799
        sort      0.611     0.866     0.716       172
      string      0.970     0.933     0.951       690

    accuracy                          0.931      2023
   macro avg      0.877     0.915     0.890      2023
weighted avg      0.944     0.931     0.935      2023



In [32]:
print("Summary of LogisticRegression Model")
print()
print(classification_report(y_test, y_pred3, digits=3))
print()
print("Summary of SGDClassifier Model")
print()
print(classification_report(y_test, y_pred31, digits=3))

Summary of LogisticRegression Model

              precision    recall  f1-score   support

  encryption      0.992     0.983     0.988       362
        math      0.992     0.991     0.992       799
        sort      0.959     0.942     0.950       172
      string      0.986     0.996     0.991       690

    accuracy                          0.987      2023
   macro avg      0.982     0.978     0.980      2023
weighted avg      0.987     0.987     0.987      2023


Summary of SGDClassifier Model

              precision    recall  f1-score   support

  encryption      0.992     0.983     0.988       362
        math      0.992     0.991     0.992       799
        sort      0.959     0.942     0.950       172
      string      0.986     0.996     0.991       690

    accuracy                          0.987      2023
   macro avg      0.982     0.978     0.980      2023
weighted avg      0.987     0.987     0.987      2023



###CONFUSION MATRIX

In [33]:
print("SVM Model")
cm = confusion_matrix(y_test, y_pred, sample_weight=None)
print(cm)

SVM Model
[[359   2   1   0]
 [  0 792   5   2]
 [  3   1 167   1]
 [  1   0   3 686]]


In [34]:
print("DecTree(std) Model")
cm2 = confusion_matrix(y_test, y_pred2, sample_weight=None)
print(cm2)
print()
print("DecTree(limited) Model")
cm21 = confusion_matrix(y_test, y_pred21, sample_weight=None)
print(cm21)

DecTree(std) Model
[[347   4   7   4]
 [  0 793   5   1]
 [  6   6 148  12]
 [  1   3   5 681]]

DecTree(limited) Model
[[328   0  26   8]
 [  1 762  34   2]
 [ 13   0 149  10]
 [ 11   0  35 644]]


In [35]:
print("LogisticRegression Model")
cm3 = confusion_matrix(y_test, y_pred3, sample_weight=None)
print(cm3)
print()
print("SGDClassifier Model")
cm31 = confusion_matrix(y_test, y_pred31, sample_weight=None)
print(cm31)

LogisticRegression Model
[[356   3   2   1]
 [  0 792   4   3]
 [  3   1 162   6]
 [  0   2   1 687]]

SGDClassifier Model
[[356   3   2   1]
 [  0 792   4   3]
 [  3   1 162   6]
 [  0   2   1 687]]


###CROSS-VALIDATION SCORE

In [36]:
print("SVM Model")
cv = ShuffleSplit(n_splits=5, test_size=0.333, random_state=15)
scores = cross_val_score(model, X_all, y_all, cv=cv)
print(scores)
print("Accuracy: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

SVM Model
[0.99011369 0.99209095 0.98863075 0.99307958 0.99060801]
Accuracy: 0.991 (+/- 0.00)


In [37]:
print("DecTree(std) Model")
cv2 = ShuffleSplit(n_splits=5, test_size=0.333, random_state=15)
scores2 = cross_val_score(model2, X_all, y_all, cv=cv)
print(scores2)
print("Accuracy: %0.3f (+/- %0.2f)" % (scores2.mean(), scores2.std() * 2))

DecTree(std) Model
[0.97330697 0.96984676 0.96885813 0.97528423 0.9742956 ]
Accuracy: 0.972 (+/- 0.01)


In [38]:
print("DecTree(limited) Model")
cv21 = ShuffleSplit(n_splits=5, test_size=0.333, random_state=15)
scores21 = cross_val_score(model21, X_all, y_all, cv=cv)
print(scores21)
print("Accuracy: %0.3f (+/- %0.2f)" % (scores21.mean(), scores21.std() * 2))

DecTree(limited) Model
[0.92436975 0.92881859 0.92684132 0.93030153 0.92634701]
Accuracy: 0.927 (+/- 0.00)


In [39]:
print("LogisticRegression Model")
cv3 = ShuffleSplit(n_splits=5, test_size=0.333, random_state=15)
scores3 = cross_val_score(model3, X_all, y_all, cv=cv)
print(scores3)
print("Accuracy: %0.3f (+/- %0.2f)" % (scores3.mean(), scores3.std() * 2))

LogisticRegression Model
[0.98220465 0.9871478  0.98368759 0.98961938 0.98319328]
Accuracy: 0.985 (+/- 0.01)


In [40]:
print("SGDClassifier Model")
cv31 = ShuffleSplit(n_splits=5, test_size=0.333, random_state=15)
scores31 = cross_val_score(model31, X_all, y_all, cv=cv)
print(scores31)
print("Accuracy: %0.3f (+/- %0.2f)" % (scores31.mean(), scores31.std() * 2))

SGDClassifier Model
[0.99406822 0.99505685 0.98912506 0.99456253 0.99209095]
Accuracy: 0.993 (+/- 0.00)


##PREDICTION ON BLIND TEST

In [41]:
df2 = pd.read_json('/content/drive/MyDrive/ColabNotebooks/ML/HW1/blindtest.json', lines=True)

In [42]:
for i in range(len(df2)): 
  df2.lista_asm[i] = extract_instr(df2.lista_asm[i])

print("Done!")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Done!


In [43]:
xnew = vectorizer.transform(df2.lista_asm)

In [44]:
ynew = model.predict(xnew)
ynew2 = model2.predict(xnew)
ynew21 = model21.predict(xnew)
ynew3 = model3.predict(xnew)
ynew31 = model31.predict(xnew)

In [None]:
with open('blindres.txt', 'w') as f:
    for item in ynew31:
        f.write("%s\n" % item)

##COMPARISON BETWEEN MODELS BLIND PREDICTION

*  SVM (model 1)
*  DecTree(std) (model 2)
*  DecTree(limited) (model 2.1)
*  LogisticRegression (model 3)
*  SGDClassifier (model 3.1)

In [45]:
cmp12 = cmp_model_pred(ynew, ynew2)

print("Total differences SVM vs DecTree(std): " + cmp12[0])
print(cmp12[1])

Total differences SVM vs DecTree(std): 42
{'encryption vs math': 5, 'encryption vs sort': 6, 'math vs sort': 16, 'encryption vs string': 1, 'math vs string': 1, 'sort vs string': 13}


In [46]:
cmp13 = cmp_model_pred(ynew, ynew3)

print("Total differences SVM vs LogisticRegression: " + cmp13[0])
print(cmp13[1])

Total differences SVM vs LogisticRegression: 32
{'encryption vs math': 3, 'encryption vs sort': 6, 'math vs sort': 12, 'encryption vs string': 2, 'math vs string': 2, 'sort vs string': 7}


In [47]:
cmp23 = cmp_model_pred(ynew2, ynew3)

print("Total differences DecTree(std) vs LogisticRegression: " + cmp23[0])
print(cmp23[1])

Total differences DecTree(std) vs LogisticRegression: 57
{'encryption vs math': 4, 'encryption vs sort': 12, 'math vs sort': 21, 'encryption vs string': 3, 'math vs string': 6, 'sort vs string': 11}


In [48]:
cmp22 = cmp_model_pred(ynew2, ynew21)

print("Total differences DecTree(std) vs DecTree(limited): " + cmp22[0])
print(cmp22[1])

Total differences DecTree(std) vs DecTree(limited): 104
{'encryption vs math': 6, 'encryption vs sort': 23, 'math vs sort': 22, 'encryption vs string': 16, 'math vs string': 5, 'sort vs string': 32}


In [49]:
cmp33 = cmp_model_pred(ynew3, ynew31)

print("Total differences LogisticRegression vs SGDClassifier: " + cmp33[0])
print(cmp33[1])

Total differences LogisticRegression vs SGDClassifier: 37
{'encryption vs math': 2, 'encryption vs sort': 6, 'math vs sort': 20, 'encryption vs string': 2, 'math vs string': 3, 'sort vs string': 4}


In [50]:
cmp131 = cmp_model_pred(ynew, ynew31)

print("Total differences SVM vs SGDClassifier: " + cmp131[0])
print(cmp131[1])

Total differences SVM vs SGDClassifier: 18
{'encryption vs math': 3, 'encryption vs sort': 0, 'math vs sort': 13, 'encryption vs string': 0, 'math vs string': 0, 'sort vs string': 2}


In [51]:
#WITH NGRAM == (1,5)
print("SVM vs DecTree(std)")
print(cmp12[1])
print()
print("SVM vs LogisticRegression")
print(cmp13[1])
print()
print("DecTree(std) vs LogisticRegression")
print(cmp23[1])
print()
print("DecTree(std) vs DecTree(limited)")
print(cmp22[1])
print()
print("LogisticRegression vs SGDClassifier")
print(cmp33[1])
print()
print("SVM vs SGDClassifier")
print(cmp131[1])

SVM vs DecTree(std)
{'encryption vs math': 5, 'encryption vs sort': 6, 'math vs sort': 16, 'encryption vs string': 1, 'math vs string': 1, 'sort vs string': 13}

SVM vs LogisticRegression
{'encryption vs math': 3, 'encryption vs sort': 6, 'math vs sort': 12, 'encryption vs string': 2, 'math vs string': 2, 'sort vs string': 7}

DecTree(std) vs LogisticRegression
{'encryption vs math': 4, 'encryption vs sort': 12, 'math vs sort': 21, 'encryption vs string': 3, 'math vs string': 6, 'sort vs string': 11}

DecTree(std) vs DecTree(limited)
{'encryption vs math': 6, 'encryption vs sort': 23, 'math vs sort': 22, 'encryption vs string': 16, 'math vs string': 5, 'sort vs string': 32}

LogisticRegression vs SGDClassifier
{'encryption vs math': 2, 'encryption vs sort': 6, 'math vs sort': 20, 'encryption vs string': 2, 'math vs string': 3, 'sort vs string': 4}

SVM vs SGDClassifier
{'encryption vs math': 3, 'encryption vs sort': 0, 'math vs sort': 13, 'encryption vs string': 0, 'math vs string': 0,

###RECAP WITH NGRAM == 1
```
SVM vs DecTree(std)
{'encryption vs math': 5, 'encryption vs sort': 16, 'math vs sort': 21, 'encryption vs string': 0, 'math vs string': 1, 'sort vs string': 29}

SVM vs LogisticRegression
{'encryption vs math': 4, 'encryption vs sort': 9, 'math vs sort': 3, 'encryption vs string': 1, 'math vs string': 0, 'sort vs string': 61}

DecTree(std) vs LogisticRegression
{'encryption vs math': 2, 'encryption vs sort': 16, 'math vs sort': 13, 'encryption vs string': 1, 'math vs string': 11, 'sort vs string': 56}

DecTree(std) vs DecTree(limited)
{'encryption vs math': 10, 'encryption vs sort': 21, 'math vs sort': 20, 'encryption vs string': 16, 'math vs string': 4, 'sort vs string': 56}

LogisticRegression vs SGDClassifier
{'encryption vs math': 3, 'encryption vs sort': 5, 'math vs sort': 4, 'encryption vs string': 2, 'math vs string': 0, 'sort vs string': 31}

SVM vs SGDClassifier
{'encryption vs math': 1, 'encryption vs sort': 9, 'math vs sort': 3, 'encryption vs string': 2, 'math vs string': 0, 'sort vs string': 45}
```





###RECAP WITH NGRAM == (1,5)
```
SVM vs DecTree(std)
{'encryption vs math': 5, 'encryption vs sort': 6, 'math vs sort': 16, 'encryption vs string': 1, 'math vs string': 1, 'sort vs string': 13}

SVM vs LogisticRegression
{'encryption vs math': 3, 'encryption vs sort': 6, 'math vs sort': 12, 'encryption vs string': 2, 'math vs string': 2, 'sort vs string': 7}

DecTree(std) vs LogisticRegression
{'encryption vs math': 4, 'encryption vs sort': 12, 'math vs sort': 21, 'encryption vs string': 3, 'math vs string': 6, 'sort vs string': 11}

DecTree(std) vs DecTree(limited)
{'encryption vs math': 6, 'encryption vs sort': 23, 'math vs sort': 22, 'encryption vs string': 16, 'math vs string': 5, 'sort vs string': 32}

LogisticRegression vs SGDClassifier
{'encryption vs math': 2, 'encryption vs sort': 6, 'math vs sort': 20, 'encryption vs string': 2, 'math vs string': 3, 'sort vs string': 4}

SVM vs SGDClassifier
{'encryption vs math': 3, 'encryption vs sort': 0, 'math vs sort': 13, 'encryption vs string': 0, 'math vs string': 0, 'sort vs string': 2}
```



