In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()  

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
numeric_features = train.select_dtypes(include=[np.number])
corr = numeric_features.corr()
print (corr['IN_TREINEIRO'].sort_values(ascending=False)[:10], '\n')

IN_TREINEIRO       1.000000
TP_ST_CONCLUSAO    0.533983
TP_PRESENCA_CN     0.094692
TP_PRESENCA_CH     0.094692
TP_PRESENCA_LC     0.092454
TP_PRESENCA_MT     0.092454
Q034               0.055265
Q039               0.040348
Q005               0.033411
Q038               0.018855
Name: IN_TREINEIRO, dtype: float64 



In [4]:
features = ['TP_ST_CONCLUSAO', 'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC',
                 'TP_PRESENCA_MT']
features_corr = ['IN_TREINEIRO', 'TP_ST_CONCLUSAO', 'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC',
                 'TP_PRESENCA_MT']
df_train = train[features_corr]
df_test = test[features]

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13730 entries, 0 to 13729
Data columns (total 6 columns):
IN_TREINEIRO       13730 non-null int64
TP_ST_CONCLUSAO    13730 non-null int64
TP_PRESENCA_CN     13730 non-null int64
TP_PRESENCA_CH     13730 non-null int64
TP_PRESENCA_LC     13730 non-null int64
TP_PRESENCA_MT     13730 non-null int64
dtypes: int64(6)
memory usage: 643.7 KB


In [6]:
df_train.isnull().sum()

IN_TREINEIRO       0
TP_ST_CONCLUSAO    0
TP_PRESENCA_CN     0
TP_PRESENCA_CH     0
TP_PRESENCA_LC     0
TP_PRESENCA_MT     0
dtype: int64

In [7]:
df_test.isnull().sum()

TP_ST_CONCLUSAO    0
TP_PRESENCA_CN     0
TP_PRESENCA_CH     0
TP_PRESENCA_LC     0
TP_PRESENCA_MT     0
dtype: int64

In [8]:
y_train = df_train['IN_TREINEIRO']
df_train.drop('IN_TREINEIRO', axis=1, inplace=True)
x_train = df_train
x_test = df_test[features]
x_train = sc.fit_transform(x_train)  
x_test = sc.transform(x_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [9]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor( 
           criterion='mae', 
           max_depth=8,
           max_leaf_nodes=None,
           min_impurity_split=None,
           min_samples_leaf=1,
           min_samples_split=2,
           min_weight_fraction_leaf=0.0,
           n_estimators= 500,
           n_jobs=-1,
           random_state=0,
           verbose=0,
           warm_start=False
)   

In [10]:
regressor.fit(x_train, y_train) 

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=8,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
                      oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [11]:
x_test = df_test[features] 
x_test = sc.transform(x_test)

In [12]:
y_pred_test = regressor.predict(x_test)
y_pred_train = regressor.predict(x_train)

In [13]:
print('MAE:', metrics.mean_absolute_error(y_train, y_pred_train).round(8)  )
print('MSE:', metrics.mean_squared_error(y_train, y_pred_train).round(8) )  
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train)).round(8))

MAE: 0.03812673
MSE: 0.03804982
RMSE: 0.19506363


In [14]:
y_pred_test

array([0., 0., 1., ..., 0., 0., 0.])

In [15]:
type(y_pred_test)

numpy.ndarray

In [16]:
submission = pd.DataFrame()
submission['NU_INSCRICAO'] = test.NU_INSCRICAO
submission['IN_TREINEIRO'] = y_pred_test.astype(int)

In [17]:
submission.sample(20)

Unnamed: 0,NU_INSCRICAO,IN_TREINEIRO
3327,cfa924f175ad50bb97b84e4b8b7dae47a25bcf67,1
2572,2b1edbcc957facc1cad3200ca0f2316a97d8d785,1
1486,0fcbeb1cf6c02c765c54cea573d122d5bf0d1135,0
558,1e67d085613c713cf66c1fecfe54f75881aa3d4b,0
212,2bf2a617cdc4950c51963ddaadbb95bfd3a19f16,0
3593,2fe60793b6978e70e6fd324fa8fd19765891ed1b,0
4087,18a65ed8339f61fc620b1e1dc822542ef115c991,0
1961,da8ce21279c194e337fe37d0d954d8df5a68e6d9,0
3853,ff73208a0a0baf49e13dc275173f71a09bb3f91d,0
2246,b236870a57aa95fb0feafdff6ceb37a80daf26b3,0


In [18]:
submission.to_csv('answer.csv', index=False, header=True)