# Joins all the data that was processed separately

## Libraries and Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.feature_selection import chi2

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

In [None]:
%%time
# Build the training frame by placing the processed categorical features, the
# processed numerical features and the target column side by side; every row
# that contains a missing value is discarded.
train_parts = [
    pd.read_pickle('../input/microft-malware-categorical-data/dfO_train_processed.pickle'),
    pd.read_pickle('../input/microft-malware-numerical-data/dfn_train_processed.pickle'),
    pd.read_csv(
        '../input/microsoft-malware-prediction/train.csv',
        usecols=['HasDetections'],
        dtype='uint8',
    )['HasDetections'],
]
df = pd.concat(train_parts, axis=1).dropna()
del train_parts  # free the intermediate frames once they are merged

# Split the target (y) from the features (df) and store the features as uint8
y = df['HasDetections']
df = df.drop(columns='HasDetections', errors='ignore').astype('uint8')

# Feature selection

In [None]:
# Features and target must have the same number of rows
assert len(df) == len(y)

In [None]:
%%time
# Create the RFE object and rank each feature
#clf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
#rfe = RFE(estimator=clf, n_features_to_select=10, step=1)
#rfe.fit(df, y)

In [None]:
#df = df.iloc[:, rfe.get_support()]

In [None]:
# Feature selection
# Hard-coded list of the feature columns retained after feature selection
columns = [
    'AppVersion_1807',
    'AvSigVersion_1.273',
    'SmartScreen_RA',
    'SmartScreen_ens',
    'Census_OSInstallTypeName_UUPUpgrade',
    'Census_InternalBatteryNumberOfCharges',
    'Wdft_IsGamer',
    'AVProductsInstalled_1',
    'AVProductsInstalled_>2',
    'Census_TotalPhysicalRAM_2GB',
]

# Keep only the selected features in the training frame
df = df.loc[:, columns]

# Train the model

In [None]:
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(RandomForestClassifier(n_estimators=100, n_jobs=-1),
#                          df,
#                          y, 
#                          cv=4, 
#                          scoring='f1_macro')

In [None]:
# scores.mean()

In [None]:
%%time
# Build a 100-tree random forest that uses all available cores, then fit it
# on the selected training features and the target
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(df, y)

# Fill test set missing data

In [None]:
# Assemble the test features from the processed categorical and numerical
# frames and keep only the columns present in the training frame, in the
# same order.
test_parts = [
    pd.read_pickle('../input/microft-malware-categorical-data/dfO_test_processed.pickle'),
    pd.read_pickle('../input/microft-malware-numerical-data/dfn_test_processed.pickle'),
]
df_test = pd.concat(test_parts, axis=1)[df.columns]
del test_parts  # free the intermediate frames once they are merged

# Read the machine identifiers of the test rows
mi_test = pd.read_csv(
    '../input/microsoft-malware-prediction/test.csv',
    usecols=['MachineIdentifier'],
)['MachineIdentifier']

# Read the sample submission so the final file can be compared against it
submission_example = pd.read_csv('../input/microsoft-malware-prediction/sample_submission.csv')

In [None]:
# Horizontal bar chart of the number of missing values per column, limited to
# the columns that have at least one missing value
missing_per_column = df_test.isna().sum()
missing_per_column[missing_per_column > 0].sort_values().plot.barh()
plt.show()

In [None]:
# Fill Wdft_IsGamer
# Show how often each value of the column occurs in the test set
df_test['Wdft_IsGamer'].value_counts().plot.barh()
plt.show()

# Fill the missing values with 0.
# The frame is rebound instead of calling fillna(inplace=True) on the selected
# column: that chained in-place form emits a FutureWarning on pandas 2.x and
# does not update df_test when Copy-on-Write is enabled.
df_test = df_test.fillna({'Wdft_IsGamer': 0})

In [None]:
# Fill Census_InternalBatteryNumberOfCharges
# Histogram (100 bins) of how often each distinct value occurs.
# NOTE(review): this plots the value *frequencies*; a histogram of the raw
# column values may have been intended - confirm.
df_test['Census_InternalBatteryNumberOfCharges'].value_counts().hist(bins=100)
plt.show()

# Fill the missing values with the mean of the column in the test set.
# The frame is rebound instead of calling fillna(inplace=True) on the selected
# column: that chained in-place form emits a FutureWarning on pandas 2.x and
# does not update df_test when Copy-on-Write is enabled.
battery_charges_mean = df_test['Census_InternalBatteryNumberOfCharges'].mean()
df_test = df_test.fillna({'Census_InternalBatteryNumberOfCharges': battery_charges_mean})

# Make Prediction

In [None]:
# Predict a class label for every row of the test features
pred = clf.predict(X=df_test)

# Create Submission

In [None]:
# Assemble the submission frame: one row per machine identifier together with
# its predicted HasDetections value
sub = pd.DataFrame({
    'MachineIdentifier': mi_test,
    'HasDetections': pred,
})

In [None]:
# Display the shapes of the sample submission and of the generated one so
# they can be compared
(submission_example.shape, sub.shape)

In [None]:
# The first column of the submission must match the first column of the
# sample submission row by row. The element-wise comparison is reduced with
# the vectorised Series.all() rather than the builtin all(), which would walk
# the boolean Series one element at a time in Python.
ids_match = sub.iloc[:, 0] == submission_example.iloc[:, 0]
assert ids_match.all(), 'first column of sub differs from the sample submission'

In [None]:
# Write the submission to disk without the index column
sub.to_csv(path_or_buf='submission.csv', index=False)