In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_selection import chi2, SelectPercentile, SelectKBest

import matplotlib.pyplot as plt
# Use the 'ggplot' style sheet for every figure drawn below.
plt.style.use('ggplot')

# Load the data

In [None]:
%%time
# Read the pickled training frame.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle('../input/microsoft-malware-first-data-analsys/dfO.pickle')

# Split the label off the features: pop() returns the column and removes it from df.
y = df.pop('HasDetections')
print(df.shape)

In [None]:
%%time
# Load the test set as well, so that it can be processed in the same way as the
# training set. The test set does not have the label, so only the columns that
# are still present in `df` are read; every column is parsed as a categorical.
df_test = pd.read_csv('../input/microsoft-malware-prediction/test.csv', usecols=df.columns, dtype="category")

# Visualizations

## Check if the label is balanced
Check if the number of labels is almost the same, or if there is one label much more frequent than the other

In [None]:
# Horizontal bar chart with the number of samples carrying each label value.
label_counts = y.value_counts()
ax = label_counts.plot.barh(title="Number of samples for each label", figsize=(5, 5))
ax.set_ylabel("Label")
ax.set_xlabel("Number in the Dataset")
plt.show()

## Missing Data

In [None]:
# Non-null count per feature, smallest first, so that sparse features stand out.
non_null_counts = df.count().sort_values()
non_null_counts.plot.barh(figsize=(10, 10))
plt.show()

In [None]:
## Disregard features for which more than half of the rows are missing
min_non_null = df.shape[0] / 2
low_count = df.columns[df.count().values < min_non_null]
print('Features droped:\n {}'.format(low_count))
df.drop(columns=low_count, inplace=True, errors='ignore')
print(df.shape)

# The test set must lose exactly the same features
df_test.drop(columns=low_count, inplace=True, errors='ignore')

# Drop Features that have very high frequency values for the top value
Drop the features whose `freq` field (the number of occurrences of the most common value) is too high, as well as the MachineIdentifier, which is pretty much useless as a feature.

In [None]:
# Per-feature summary, ordered by how often the most common value occurs.
df.describe().T.sort_values(by='freq', ascending=False)

In [None]:
# A feature is dropped when its most common value occurs in more rows than this.
freq_limit = 8000000
top_value_freq = df.describe().loc['freq']
high_freq = df.columns[top_value_freq.values > freq_limit]
print('Features droped:\n {}'.format(high_freq))

# Remove the near-constant features from both sets
df.drop(columns=high_freq, inplace=True, errors='ignore')
df_test.drop(columns=high_freq, inplace=True, errors='ignore')

# The machine identifier is useless as a feature; remove it from both sets too
df.drop(columns="MachineIdentifier", inplace=True, errors='ignore')
df_test.drop(columns="MachineIdentifier", inplace=True, errors='ignore')

print(df.shape)

In [None]:
# The training and test sets must always expose the same features, in the same order
assert (df.columns == df_test.columns).all()

# One Hot Encoding
Categorical features will be one-hot encoded. Some of them have too many unique values, so it is first necessary to reduce the number of unique values in those features.
This means going through the features one at a time.

In [None]:
# Per-feature summary, ordered by the number of distinct values (highest first).
df.describe().T.sort_values(by='unique', ascending=False)

## AvSigVersion
8531 	unique values

In [None]:
def major_minor(version):
    """Keep only the first two dot-separated components of a version string."""
    return ".".join(version.split('.')[:2])


# Collapse AvSigVersion to its two leading components in both sets
df['AvSigVersion'] = df['AvSigVersion'].apply(major_minor)
df_test['AvSigVersion'] = df_test['AvSigVersion'].apply(major_minor)

# Distribution of the shortened versions in the training set
av_sig_counts = df['AvSigVersion'].value_counts()
av_sig_counts.plot.barh(figsize=(15, 5))
plt.show()

In [None]:
# Versions seen fewer than 1000 times in the training set are merged into "other"
av_sig_counts = df['AvSigVersion'].value_counts()
other = set(av_sig_counts[av_sig_counts < 1000].index.values)


def merge_rare(value):
    """Return "other" for a rare value and the value itself otherwise."""
    return "other" if value in other else value


df['AvSigVersion'] = df['AvSigVersion'].apply(merge_rare)
df_test['AvSigVersion'] = df_test['AvSigVersion'].apply(merge_rare)

# Distribution after grouping
av_sig_counts = df['AvSigVersion'].value_counts()
av_sig_counts.plot.barh(figsize=(15, 5))
plt.show()

## OsBuildLab
663 unique values

In [None]:
def first_component(label):
    """Return everything before the first dot (the whole string if there is no dot)."""
    return label.split('.', 1)[0]


# Keep only the first number of OsBuildLab in both sets
df['OsBuildLab'] = df['OsBuildLab'].apply(first_component)
df_test['OsBuildLab'] = df_test['OsBuildLab'].apply(first_component)

# Distribution of the shortened labels in the training set
build_lab_counts = df['OsBuildLab'].value_counts()
build_lab_counts.plot.barh(figsize=(15, 5))
plt.show()

### Group the names that have really low counts into the same category "other"

In [None]:
# Builds seen fewer than 90000 times in the training set are merged into "other"
build_lab_counts = df['OsBuildLab'].value_counts()
other = set(build_lab_counts[build_lab_counts < 90000].index.values)


def merge_rare(value):
    """Return "other" for a rare value and the value itself otherwise."""
    return "other" if value in other else value


df['OsBuildLab'] = df['OsBuildLab'].apply(merge_rare)
df_test['OsBuildLab'] = df_test['OsBuildLab'].apply(merge_rare)

# Distribution after grouping
build_lab_counts = df['OsBuildLab'].value_counts()
build_lab_counts.plot.barh(figsize=(15, 5))
plt.show()

## Census_OSVersion
469 unique values

In [None]:
def third_component(version):
    """Return the third dot-separated component, or '' if there are fewer than three."""
    parts = version.split('.')
    return parts[2] if len(parts) > 2 else ""


# Keep only the third number of Census_OSVersion in both sets
df['Census_OSVersion'] = df['Census_OSVersion'].apply(third_component)
df_test['Census_OSVersion'] = df_test['Census_OSVersion'].apply(third_component)

# Distribution of the shortened versions in the training set
os_version_counts = df['Census_OSVersion'].value_counts()
os_version_counts.plot.barh(figsize=(15, 5))
plt.show()

In [None]:
# Versions seen fewer than 5000 times in the training set are merged into "other"
os_version_counts = df['Census_OSVersion'].value_counts()
other = set(os_version_counts[os_version_counts < 5000].index.values)


def merge_rare(value):
    """Return "other" for a rare value and the value itself otherwise."""
    return "other" if value in other else value


df['Census_OSVersion'] = df['Census_OSVersion'].apply(merge_rare)
df_test['Census_OSVersion'] = df_test['Census_OSVersion'].apply(merge_rare)

# Distribution after grouping
os_version_counts = df['Census_OSVersion'].value_counts()
os_version_counts.plot.barh(figsize=(15, 5))
plt.show()

## AppVersion

In [None]:
def third_component(version):
    """Return the third dot-separated component, or '' if there are fewer than three."""
    parts = version.split('.')
    return parts[2] if len(parts) > 2 else ""


# Step 1: keep only the third number of AppVersion in both sets
df['AppVersion'] = df['AppVersion'].apply(third_component)
df_test['AppVersion'] = df_test['AppVersion'].apply(third_component)

# Step 2: values seen fewer than 20000 times in the training set become "other"
app_version_counts = df['AppVersion'].value_counts()
other = set(app_version_counts[app_version_counts < 20000].index.values)


def merge_rare(value):
    """Return "other" for a rare value and the value itself otherwise."""
    return "other" if value in other else value


df['AppVersion'] = df['AppVersion'].apply(merge_rare)
df_test['AppVersion'] = df_test['AppVersion'].apply(merge_rare)

# Distribution after grouping
app_version_counts = df['AppVersion'].value_counts()
app_version_counts.plot.barh(figsize=(15, 5))
plt.show()

## EngineVersion
1.1.15200.1

In [None]:
def third_component(version):
    """Return the third dot-separated component, or '' if there are fewer than three."""
    parts = version.split('.')
    return parts[2] if len(parts) > 2 else ""


# Step 1: keep only the third number of EngineVersion in both sets
df['EngineVersion'] = df['EngineVersion'].apply(third_component)
df_test['EngineVersion'] = df_test['EngineVersion'].apply(third_component)

# Step 2: values seen fewer than 10000 times in the training set become "other"
engine_version_counts = df['EngineVersion'].value_counts()
other = set(engine_version_counts[engine_version_counts < 10000].index.values)


def merge_rare(value):
    """Return "other" for a rare value and the value itself otherwise."""
    return "other" if value in other else value


df['EngineVersion'] = df['EngineVersion'].apply(merge_rare)
df_test['EngineVersion'] = df_test['EngineVersion'].apply(merge_rare)

# Distribution after grouping
engine_version_counts = df['EngineVersion'].value_counts()
engine_version_counts.plot.barh(figsize=(15, 5))
plt.show()

## Census_ChassisTypeName

In [None]:
# Frequency of each chassis type in the training set
chassis_counts = df['Census_ChassisTypeName'].value_counts()
chassis_counts.plot.barh(figsize=(10, 10))
plt.show()

In [None]:
# Ignore this one: the column is only recorded here and is dropped from both
# sets later, right before the dummies are built.
to_drop = ['Census_ChassisTypeName']

## Census_OSEdition

In [None]:
# Frequency of each raw OS edition in the training set
edition_counts = df['Census_OSEdition'].value_counts()
edition_counts.plot.barh(figsize=(10, 10))
plt.show()

In [None]:
# Group the editions into Core, Server, Enterprise, Education, Professional and Home.
# Keys are the group names; values are the substrings (matched case-insensitively)
# that select the group. Groups are tried in insertion order, so the first match wins.
os_edition_dict={
    'Core': ['Core'],
    'Server': ['Server', 'Cloud'],
    'Enterprise': ['Enterprise'],
    'Education': ['Education'],
    'Professional': ['Pro' ],
    'Home': ['home']
}

def os_edition(x):
    """Map a raw OS edition / SKU name onto one of the groups in os_edition_dict.

    Parameters
    ----------
    x : object
        Raw value. Strings are matched case-insensitively against the
        substrings listed in os_edition_dict.

    Returns
    -------
    object
        The name of the first group with a matching substring. If no group
        matches, or if ``x`` is not a string (e.g. NaN or None for a missing
        value), ``x`` is returned unchanged.
    """
    # Missing values arrive as float NaN / None, which have no .lower();
    # pass them through instead of raising AttributeError.
    if not isinstance(x, str):
        return x

    lowered = x.lower()
    for key, values in os_edition_dict.items():
        if any(ed.lower() in lowered for ed in values):
            return key
    return x

# Group the editions in the training and the test set alike
for frame in (df, df_test):
    frame['Census_OSEdition'] = frame['Census_OSEdition'].apply(os_edition)

# Distribution of the grouped editions in the training set
edition_counts = df['Census_OSEdition'].value_counts()
edition_counts.plot.barh(figsize=(10, 5))
plt.show()

## Census_OSBranch

In [None]:
# Frequency of each raw OS branch in the training set
branch_counts = df['Census_OSBranch'].value_counts()
branch_counts.plot.barh(figsize=(10, 7))
plt.show()

In [None]:
# Branches seen fewer than 2000 times in the training set are merged into "other"
branch_counts = df['Census_OSBranch'].value_counts()
other = set(branch_counts[branch_counts < 2000].index.values)


def merge_rare(value):
    """Return "other" for a rare value and the value itself otherwise."""
    return "other" if value in other else value


df['Census_OSBranch'] = df['Census_OSBranch'].apply(merge_rare)
df_test['Census_OSBranch'] = df_test['Census_OSBranch'].apply(merge_rare)

# Distribution after grouping
branch_counts = df['Census_OSBranch'].value_counts()
branch_counts.plot.barh(figsize=(15, 5))
plt.show()

## Census_OSSkuName

In [None]:
# Frequency of each raw OS SKU name in the training set
sku_counts = df['Census_OSSkuName'].value_counts()
sku_counts.plot.barh(figsize=(10, 7))
plt.show()

In [None]:
# Reuse the Census_OSEdition grouping for the SKU name, in both sets
df['Census_OSSkuName'] = df['Census_OSSkuName'].apply(os_edition)
df_test['Census_OSSkuName'] = df_test['Census_OSSkuName'].apply(os_edition)

# The column is already grouped at this point, so it is plotted directly
# instead of running os_edition over every row a second time.
df['Census_OSSkuName'].value_counts().plot.barh(figsize=(10, 5))
plt.show()

## SmartScreen

In [None]:
# Frequency of each raw SmartScreen value in the training set
smartscreen_counts = df['SmartScreen'].value_counts()
smartscreen_counts.plot.barh(figsize=(10, 7))
plt.show()

In [None]:
# SmartScreen group name -> substrings (matched case-insensitively) that select
# the group. Groups are tried in insertion order, so the first match wins.
ss_dict={
    'RA': ['admin'],
    'warn': ['warn'],
    'off': ['off'],
    'on': ['on'],
    'block': ['block' ],
    'prompt': ['prompt', 'promt'],
    'enabled': ['enabled'],
    'ens': ['notset']
}

def ss(x):
    """Map a raw SmartScreen value onto one of the groups in ss_dict.

    Parameters
    ----------
    x : object
        Raw value. Strings are matched case-insensitively against the
        substrings listed in ss_dict.

    Returns
    -------
    str or None
        The name of the first group with a matching substring, or None when
        ``x`` is empty, is not a string (e.g. NaN or None for a missing
        value), or matches no group.
    """
    # NaN is a truthy float, so a plain truthiness check would let it through
    # to .lower() and raise AttributeError; reject every non-string up front.
    if not isinstance(x, str) or not x:
        return None

    lowered = x.lower()
    for key, values in ss_dict.items():
        if any(ed.lower() in lowered for ed in values):
            return key
    return None
            
# Group the raw SmartScreen values in both sets
df['SmartScreen'] = df['SmartScreen'].apply(ss)
df_test['SmartScreen'] = df_test['SmartScreen'].apply(ss)

# Plot the grouped column as it is. Running ss over it a second time would look
# the group names up as if they were raw values, and names such as 'RA' and
# 'ens' contain none of the substrings in ss_dict, so they would be turned into
# missing values and vanish from the chart.
df['SmartScreen'].value_counts().plot.barh(figsize=(10, 7))
plt.show()

In [None]:
# Both sets must still expose the same features after the per-feature grouping
assert (df.columns == df_test.columns).all()

# Make Dummies

In [None]:
# Remove the features that were marked to be ignored, from both sets
df.drop(columns=to_drop, inplace=True)
df_test.drop(columns=to_drop, inplace=True)

# Cardinality of what is left, highest first
df.describe().T.sort_values(by='unique', ascending=False)

In [None]:
%%time
# One-hot encode both sets. Each set is encoded on its own, so the resulting
# dummy columns are not guaranteed to match between the two.
df = pd.get_dummies(df)
df_test = pd.get_dummies(df_test)

In [None]:
# Compare the shapes after encoding. The testing set has more columns, which
# means it holds information that was not present in the training set.
print("Training:", df.shape)
print("Testing:", df_test.shape)

In [None]:
# Align the testing set with the training columns:
#   * dummies that exist only in the testing set are dropped;
#   * dummies that exist only in the training set are added as all-zero
#     columns, since no test row has that category.
# Only dropping the extra test columns would leave df_test narrower than df
# whenever a training category is absent from the test set, which breaks the
# positional feature selection and the column assertions further down.
df_test = df_test.reindex(columns=df.columns, fill_value=0)
df_test.shape

# Remove low variance
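For a 0/1 (Bernoulli) feature, $\mathrm{Var}[X] = p(1-p)$.
Remove the features whose variance is below this value for $p = 0.95$, i.e. dummy columns that take the same value in more than about 95% of the samples.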

In [None]:
def v(p):
    """Variance of a 0/1 feature that equals 1 with probability p."""
    return p * (1 - p)


p = 0.95
min_variance = v(p)

# Collect the features whose variance falls below the threshold
to_drop = [c for c in df.columns if df[c].var() < min_variance]

# Remove them from both sets
df.drop(columns=to_drop, inplace=True, errors='ignore')
df_test.drop(columns=to_drop, inplace=True, errors='ignore')

print("Training:", df.shape)
print("Testing:", df_test.shape)

# Chi2

In [None]:
# Keep the 20 features with the highest chi-squared score against the label
skb = SelectKBest(chi2, k=20)

# Fit only on the rows that do not contain any NaN
ind = df.dropna().index
skb.fit(df.loc[ind, :], y.loc[ind])

# Keep the selected features in both sets. They are picked by column name:
# the support mask is positional for the training columns, so applying it
# directly to df_test is only correct if both frames have identical columns
# in identical order. Selecting by name gives the right columns regardless,
# and raises a KeyError naming the column if df_test lacks a selected feature.
selected = df.columns[skb.get_support()]
df = df.loc[:, selected]
df_test = df_test.loc[:, selected]

# Save Data

In [None]:
# Summary statistics of the selected training features, as a last look before saving
df.describe()

In [None]:
# Both sets must end up with the same columns, in the same order
assert (df.columns == df_test.columns).all()

In [None]:
# Write the processed training and test features as pickles, using paths
# relative to the current working directory.
# NOTE: the label `y` was split off from `df` when the data was loaded, so it
# is not part of the training pickle written here.
df.to_pickle("dfO_train_processed.pickle")
df_test.to_pickle("dfO_test_processed.pickle")