In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Complete exploration on a reduced sample of 10000 rows, intended only to test the code. Normalization and PCA are included.

# Data Exploration

## Data Loading 

Let's load a reduced version of the dataset to explore: **cat datasets/train.csv | head -n +100001 > datasets/train_reduced.csv**

In [None]:
# Assumed number of rows in the reduced dataset; the cleaning step multiplies it
# by a percentage to obtain an absolute null-count threshold.
# NOTE(review): the shell command in the markdown above keeps 100000 data rows
# (100001 lines including the header) — confirm this value matches the file.
total_data=10000

In [None]:
# Load the reduced training CSV, using the MachineIdentifier column as the row index.
df = pd.read_csv('datasets/train_reduced.csv', index_col='MachineIdentifier')

## Data Description 

In [None]:
# Overview of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()

## Data Cleaning

In [None]:
# Columns in which more than th_perc (30%) of the values are null get removed
# in the cleaning cells below.
th_perc = 0.3
# Derive the absolute null-count cut-off from the number of rows actually
# loaded, so the percentage holds whatever the size of the reduced file is
# (a hard-coded row count silently changes the effective percentage).
clean_threshold = len(df)*th_perc

In [None]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df_for_clean = df.isnull()

In [None]:
# Count the missing values of every column in one pass over the null mask,
# report each count, and collect the columns whose count exceeds the threshold.
null_counts = df_for_clean.sum()
columns_to_delete = []
for column_name, n_missing in null_counts.items():
    too_sparse = n_missing > clean_threshold
    if too_sparse:
        columns_to_delete.append(column_name)
    suffix = ', REMOVE!!!' if too_sparse else ''
    print(column_name + ': ' + str(n_missing) + suffix)

In [None]:
# Report the frame's shape before and after each of the two cleaning steps.
print(f'pre  filter N/A dimension{df.shape}')

# Step 1: remove the columns flagged as having too many nulls (mutates df).
df.drop(columns=columns_to_delete, inplace=True)
print(f'post 1 filter N/A dimension{df.shape}')

# Step 2: remove every remaining row that still contains a null (mutates df).
df.dropna(axis=0, how='any', inplace=True)
print(f'post 2 filter N/A dimension{df.shape}')

## Splitting label information

In [None]:
# Detach the label column: pop() returns it as a Series and removes it from
# df in place, leaving only the feature columns behind.
df_y = df.pop('HasDetections')

## Categorical to Numerical 

We have 83 columns: 53 numerical and 30 categorical. Let's look at the number of distinct values in each categorical column:

In [None]:
# Names of the columns stored with the generic object dtype; these are the
# ones treated as categorical in the cells that follow.
object_columns = df.select_dtypes(include='object')
columns_categorical = object_columns.columns

In [None]:
# Number of distinct values per categorical column; their sum is printed as
# the total number of new variables.
distinct_counts = {name: df[name].nunique() for name in columns_categorical}
for name, n_distinct in distinct_counts.items():
    print(str(name) + ': ' + str(n_distinct))
total = sum(distinct_counts.values())
print('Total new vars: ' + str(total))

Some of the categorical columns have a very large number of distinct values, so one-hot encoding them will add a lot of new columns.

In [None]:
# One-hot encode every categorical column; the remaining columns are passed through.
df_num=pd.get_dummies(data=df,columns=columns_categorical)

In [None]:
#df_num.to_csv('datasets/train_reduced_num.csv')

# Dimensionality Reduction with PCA (with previous normalization)

In [None]:
#df_num= pd.read_csv('datasets/train_reduced_num.csv')

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler created with its default settings.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the whole encoded matrix and rescale it in one step.
# NOTE(review): the scaler is fitted before any train/test split takes place,
# so held-out rows influence the scaling — confirm this is acceptable.
data_scaled = scaler.fit_transform(df_num.values)

In [None]:
# PCA created with default arguments (no explicit component count is given here).
pca = PCA()

In [None]:
# Fit PCA on the scaled matrix and project the same data onto the components.
reduced = pca.fit_transform(data_scaled)

In [None]:
 # Scatter of the first two columns of the PCA projection, coloured by label.
 first_component, second_component = reduced[:, 0], reduced[:, 1]
 plt.scatter(x=first_component, y=second_component, c=df_y.values, alpha=1)

In [None]:
# Running total of the explained-variance ratios: entry i is the share of
# variance accumulated over components 0..i.
cumulative = list(np.cumsum(pca.explained_variance_ratio_))
plt.plot(cumulative)

The data can be reduced to k dimensions:

In [None]:
# Fraction of the total variance that the reduced representation must retain.
keep_info=0.98
np_cumulative = np.array(cumulative)
# argmax() on the boolean mask yields the zero-based index of the first
# component at which the cumulative variance exceeds keep_info. A slice
# reduced[:, 0:k] needs the component *count*, hence the +1; without it the
# selected components would stop one short and retain less than keep_info.
exceeds_target = np_cumulative > keep_info
if exceeds_target.any():
    k = exceeds_target.argmax() + 1
else:
    # No prefix of components exceeds the target: keep all of them.
    k = len(np_cumulative)
# Report the percentage actually configured instead of a hard-coded figure.
print('k for {:.0%} of information: '.format(keep_info),k,'. compresion: ',1-(k/(reduced.shape[1])))

In [None]:
# Keep only the first k columns of the PCA projection.
X_num_pca = reduced[:,0:k]

In [None]:
# Check the dimensions of the reduced feature matrix.
X_num_pca.shape

# Classification problem

In [None]:
import sys
sys.path.append('../../GitRepos/dsbase/src/main/')

from sklearn.model_selection import train_test_split
from ModelDSBase import ModelDSBaseWrapper

## Random Forest

In [None]:
from RandomForestClassificationDSBase import RandomForestClassificationDSBaseModel
from RandomForestClassificationDSBase import RandomForestClassificationDSBaseModelParamsToMap

In [None]:
# Build the parameter map for the random-forest model and wrap model, features
# and labels together. The features are the full scaled matrix (data_scaled),
# not the PCA-reduced one.
# NOTE(review): the meaning of the positional arguments is defined in the
# external dsbase package — presumably (100, 15) are estimator count / depth,
# the list holds learning-curve percentages and 0.3 is the test fraction; confirm.
# NOTE(review): train_test_split is passed without a fixed random_state here,
# so results may vary between runs unless the wrapper seeds it — confirm.
params = RandomForestClassificationDSBaseModelParamsToMap(100,15)
rfc = ModelDSBaseWrapper('RF',data_scaled,df_y.values,[70,75,80,85,90,95,100],0.3,RandomForestClassificationDSBaseModel,params,splitter=train_test_split)

In [None]:
# Run the wrapper's training step for the random-forest model.
rfc.train()

In [None]:
# Fetch the learning curves from the wrapper; the result is indexed as a
# two-dimensional array (rows 0 and 1) when plotted.
lcrfc = rfc.getLearningCurves()

In [None]:
# Plot row 0 in blue and row 1 in red.
# NOTE(review): presumably train vs. test curves — confirm against dsbase.
plt.plot(lcrfc[0,:],'b',lcrfc[1,:],'r')

In [None]:
# Score reported by the wrapper; the metric itself is defined in dsbase.
rfc.getScore()

In [None]:
# Sanity check: shape of a single-row slice (row 14) of the scaled matrix.
data_scaled[14:15,:].shape

In [None]:
# Class probabilities for rows 14 to 44 of the scaled matrix, obtained from the
# object nested at rfc.model.model.
# NOTE(review): presumably the underlying fitted scikit-learn estimator — confirm.
rfc.model.model.predict_proba(data_scaled[14:45,:])

## Ada Boosting 

In [None]:
from AdaBoostClassificationDSBase import AdaBoostClassificationDSBaseModelParamsToMap
from AdaBoostClassificationDSBase import AdaBoostClassificationDSBaseModel

In [None]:
# Build the parameter map for the AdaBoost model and wrap model, features and
# labels together. The features are the full scaled matrix (data_scaled).
# NOTE(review): the positional arguments are defined in the external dsbase
# package — presumably (100, 1.0) are estimator count / learning rate, the list
# holds learning-curve percentages and 0.3 is the test fraction; confirm.
params = AdaBoostClassificationDSBaseModelParamsToMap(100,1.0)
abc = ModelDSBaseWrapper('AB',data_scaled,df_y.values,[70,75,80,85,90,95,100],0.3,AdaBoostClassificationDSBaseModel,params,splitter=train_test_split)

In [None]:
# Run the wrapper's training step for the AdaBoost model.
abc.train()

In [None]:
# Fetch the learning curves from the wrapper; the result is indexed as a
# two-dimensional array (rows 0 and 1) when plotted.
lcabc = abc.getLearningCurves()

In [None]:
# Plot row 0 in blue and row 1 in red.
# NOTE(review): presumably train vs. test curves — confirm against dsbase.
plt.plot(lcabc[0,:],'b',lcabc[1,:],'r')

In [None]:
# Score reported by the wrapper; the metric itself is defined in dsbase.
abc.getScore()

## DNN

In [None]:
from DNNClassificationKerasDSBase import DNNClassificationKerasDSBaseParamsToMap
from DNNClassificationKerasDSBase import DNNClassificationKerasDSBaseModel

In [None]:
# Build the parameter map for the Keras DNN classifier and wrap model, features
# and labels together. Unlike the tree-based models, this one is fed the
# PCA-reduced matrix (X_num_pca).
# NOTE(review): `layers` presumably lists hidden-layer widths and alpha / beta1 /
# beta2 / epsilon presumably configure an Adam-style optimiser — confirm in dsbase.
# NOTE(review): the list and 0.3 are presumably learning-curve percentages and
# the test fraction — confirm.
params = DNNClassificationKerasDSBaseParamsToMap(layers=[200,100,50,20,10,5], alpha=1e-2, beta1=0.9, beta2=0.999, epsilon=1e-8, batch_size=128, epochs=40)
dnnkc = ModelDSBaseWrapper('DNNKC',X_num_pca,df_y.values,[70,75,80,85,90,95,100],0.3,DNNClassificationKerasDSBaseModel,params,splitter=train_test_split)

In [None]:
# Run the wrapper's training step for the DNN model.
dnnkc.train()

In [None]:
# Fetch the learning curves from the wrapper; the result is indexed as a
# two-dimensional array (rows 0 and 1) when plotted.
lcdnnkc=dnnkc.getLearningCurves()

In [None]:
# Plot row 0 in blue and row 1 in red.
# NOTE(review): presumably train vs. test curves — confirm against dsbase.
plt.plot(lcdnnkc[0,:],'b',lcdnnkc[1,:],'r')

In [None]:
# Score reported by the wrapper; the metric itself is defined in dsbase.
dnnkc.getScore()