In [1]:
# Name: dog-tor
# Date: March 17th, 2021
# 
# This program uses the panda library modules and will read and manipulate Microsoft Malware Prediction data (in csv format) in multiple ways.

# Import the required modules.
import pandas as pd # series and dataframe
import numpy as np

# Change the view to allow more columns and rows.
pd.options.display.max_colwidth = 1000
pd.options.display.max_rows = 1000

# Load the first 1000 rows of the file.
dataset=pd.read_csv("train.csv", nrows = 1000) 

# Show the total number of missing values in all variables
print("The total number of missing values in all variables is:", dataset.isnull().sum().sum(),"\n\n")

print(dataset)





                 11.0             1  
644           0.0                  10.0             0  
645           0.0                   1.0             1  
646           0.0                  10.0             1  
647           0.0                  11.0             0  
648           0.0                  10.0             1  
649           0.0                  13.0             0  
650           0.0                  10.0             0  
651           0.0                   3.0             0  
652           1.0                  15.0             1  
653           NaN                   NaN             0  
654           0.0                  10.0             1  
655           0.0                   7.0             1  
656           0.0                  13.0             1  
657           0.0                   3.0             1  
658           0.0                   3.0             1  
659           0.0                  15.0             1  
660           0.0                   4.0             1  
661       

In [2]:
# Print summary statistics for numerical attributes, and print value counts for categorical attributes.
dataset["HasDetections"] = dataset["HasDetections"].astype("object")
for var_name in dataset.columns:
    print("-"*50)
    print(var_name)
    if dataset[var_name].dtype == np.object:
        print(dataset[var_name].value_counts())
    if dataset[var_name].dtype == np.int64 or dataset[var_name].dtype == np.float64:
        print(dataset[var_name].describe())



x86fre.rs1_release_sec.170327-1835                 1
14393.2214.x86fre.rs1_release_1.180402-1758                   1
10586.122.amd64fre.th2_release_inmarket.160222-1549           1
7601.23915.amd64fre.win7sp1_ldr.170913-0600                   1
9600.19125.amd64fre.winblue_ltsb.180812-0703                  1
9600.18505.amd64fre.winblue_ltsb.160930-0600                  1
14393.693.amd64fre.rs1_release.161220-1747                    1
10586.589.x86fre.th2_release.160906-1759                      1
9600.18589.amd64fre.winblue_ltsb.170204-0600                  1
14393.576.x86fre.rs1_release_inmarket.161208-2252             1
14393.321.x86fre.rs1_release_inmarket.161004-2338             1
10586.494.amd64fre.th2_release_sec.160630-1736                1
7601.18409.amd64fre.win7sp1_gdr.140303-2144                   1
9600.18969.x86fre.winblue_ltsb.180309-0600                    1
10586.839.amd64fre.th2_release.170303-1605                    1
14393.1198.x86fre.rs1_release_sec.170427-1353      

In [4]:
# Columns of dtype object are imputed with the most frequent value in the column. Columns of other types are imputed with mean of the column.
# Missing values imputation
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, dataset, y=None):

        self.fill = pd.Series([dataset[c].value_counts().index[0]
            if dataset[c].dtype == np.dtype('O') else dataset[c].mean() for c in dataset],
            index=dataset.columns)

        return self

    def transform(self, dataset, y=None):
        return dataset.fillna(self.fill)
dataset = DataFrameImputer().fit_transform(dataset)




In [5]:
# Show the total number of missing values again in all variables. Now, after missing value imputation, it should be 0
print("The total number of missing values in all variables is now:", dataset.isnull().sum().sum(),"\n\n")



The total number of missing values in all variables is now: 0 




In [37]:
# Do string encoding (i.e, encode string values to integers)
# encode string values to integers
from sklearn.preprocessing import LabelEncoder
# maintain a dict for string to integer mappings for each column
label_dict = dict()
for var_name in dataset.columns:
    # only map string values
    if dataset[var_name].dtype == np.object:
        le = LabelEncoder()
        # map the string values
        dataset[var_name] = le.fit_transform(dataset[var_name])
        # store the mapping in the dict
        label_dict[var_name] = dict()
        for cls, label in zip(le.classes_, le.transform(le.classes_)):
            label_dict[var_name][label] = cls

dataset.head(10)




Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0,1,21,19,208,0,7.0,0,1716.240741,53447.0,...,36144.0,0,0.0,0.0,0,0,0.0,0.0,10.0,0
1,1,1,14,12,38,0,7.0,0,1716.240741,53447.0,...,57858.0,0,0.0,0.0,0,0,0.0,0.0,8.0,0
2,2,1,21,19,141,0,7.0,0,1716.240741,53447.0,...,52682.0,0,0.0,0.0,0,0,0.0,0.0,3.0,0
3,3,1,21,19,172,0,7.0,0,1716.240741,53447.0,...,20050.0,0,0.0,0.0,0,0,0.0,0.0,3.0,1
4,4,1,21,19,145,0,7.0,0,1716.240741,53447.0,...,19844.0,0,0.0,0.0,0,0,0.0,0.0,1.0,1
5,5,1,21,19,114,0,7.0,0,1716.240741,53447.0,...,51039.0,0,0.0,0.0,0,0,0.0,0.0,15.0,1
6,6,1,21,19,284,0,7.0,0,1716.240741,43927.0,...,63175.0,1,0.0,0.0,0,0,0.0,0.0,10.0,1
7,7,1,21,19,146,0,7.0,0,1716.240741,53447.0,...,63122.0,0,0.0,0.0,0,0,0.0,0.0,15.0,0
8,8,1,22,19,535,0,7.0,0,1716.240741,53447.0,...,15510.0,0,0.0,0.0,0,0,0.0,0.0,15.0,0
9,9,1,21,19,301,0,7.0,0,1716.240741,46413.0,...,63555.0,1,0.0,0.0,0,0,0.0,1.0,8.0,1


In [7]:
# Second, modeling:
# Split the data into training and testing sets (80-20)
from sklearn.model_selection import train_test_split
attributes = [col for col in dataset.columns if col != "HasDetections"]
train_x, test_x, train_y, test_y = train_test_split(dataset[attributes], dataset["HasDetections"], test_size=0.2, random_state=123)

# convert numpy arrays to data frames
df_train_x = pd.DataFrame(train_x, columns=attributes)
df_test_x = pd.DataFrame(test_x, columns=attributes)
df_train_y = pd.DataFrame(train_y, columns=["HasDetections"])
df_test_y = pd.DataFrame(test_y, columns=["HasDetections"])



In [8]:
# Do feature selection using chi2 method.

# Feature selection must be performed using only training data
# compute the f value and p value of the chi-squared test between each attribute and the class
from sklearn.feature_extraction.text import *
from sklearn.feature_selection import *

f_val, p_val = chi2(df_train_x, df_train_y["HasDetections"]) 

# print the Chi-squared valus and p values
df_scores = pd.DataFrame(zip(attributes, f_val, p_val), columns=["feature", "chi2", "p"])
df_scores["chi2"] = df_scores["chi2"].round(2)
df_scores["p"] = df_scores["p"].round(3)
#print df_scores.sort_values("chi2", ascending=False)

# use features with p < 0.05
sel_ohe_cols = df_scores[df_scores["p"]<0.05]["feature"].values
print ("\nSelected features: %d" % len(sel_ohe_cols))
print (sel_ohe_cols)



ValueError: could not convert string to float: '0003c71b0742c1a9a6120c0881ebbb68'

In [9]:
# Building different predictive models like Decision Tree, Support Vector Machine (LinearSVC), and Naive Bayes (MultinomialNB) with features selected only. 
# Show the performance of the classifiers (f1, accuracy, precision, and recall)

print("Decision Tree with features selected only")
# Decision Tree
from sklearn import tree
clf = tree.DecisionTreeClassifier()
# train model
clf = clf.fit(df_train_x[sel_ohe_cols], train_y)
# make prediction
pred_y = clf.predict(df_test_x[sel_ohe_cols])#pred_y=[1,2,2,1,..........,2]
# evaluate the prediction results

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))

Decision Tree with features selected only


NameError: name 'sel_ohe_cols' is not defined

In [41]:
print("Support Vector Machine with features selected only")
# Support Vector Machine
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=123456)
# train/build model
clf = clf.fit(df_train_x[sel_ohe_cols], train_y)
# make prediction
pred_y = clf.predict(df_test_x[sel_ohe_cols])
# evaluate the prediction results
print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))




Support Vector Machine with features selected only
f1:0.45669291338582674
accuracy:0.54
precision:0.3670886075949367
recall:0.6041666666666666


In [42]:
print("Naive Bayes with features selected only")
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
# train model
clf = clf.fit(df_train_x[sel_ohe_cols], train_y)
# make prediction
pred_y = clf.predict(df_test_x[sel_ohe_cols])
# evaluate the prediction results
print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))

Naive Bayes with features selected only
f1:0.5683060109289617
accuracy:0.47333333333333333
precision:0.6582278481012658
recall:0.5


In [43]:
# Building different predictive models like Decision Tree, Support Vector Machine (LinearSVC) , and Naive Bayes (MultinomialNB) using all features. 
# Show the performance of the classifiers (f1, accuracy, precision, and recall) 


print("Decision Tree using all features")
# Decision Tree
from sklearn import tree
clf = tree.DecisionTreeClassifier()
# train model
clf = clf.fit(df_train_x, train_y)
# make prediction
pred_y = clf.predict(df_test_x)#pred_y=[1,2,2,1,..........,2]
# evaluate the prediction results

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))

Decision Tree using all features
f1:0.5856697819314642
accuracy:0.5566666666666666
precision:0.5949367088607594
recall:0.5766871165644172


In [44]:
print("Support Vector Machine using all features")
# Support Vector Machine
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=123456)
# train/build model
clf = clf.fit(df_train_x, train_y)
# make prediction
pred_y = clf.predict(df_test_x)
# evaluate the prediction results
print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))




Support Vector Machine using all features
f1:0.45669291338582674
accuracy:0.54
precision:0.3670886075949367
recall:0.6041666666666666


In [45]:
print("Naive Bayes using all features")
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
# train model
clf = clf.fit(df_train_x, train_y)
# make prediction
pred_y = clf.predict(df_test_x)
# evaluate the prediction results
print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))

Naive Bayes using all features
f1:0.5683060109289617
accuracy:0.47333333333333333
precision:0.6582278481012658
recall:0.5
