In [3]:
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()  

In [4]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [None]:
a = train['Unnamed']

In [6]:
train.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,1,2,3,4,5
NU_INSCRICAO,ed50e8aaa58e7a806c337585efee9ca41f1eb1ad,2c3acac4b33ec2b195d77e7c04a2d75727fad723,f4545f8ccb9ff5c8aad7d32951b3f251a26e6568,3d6ec248fef899c414e77f82d5c6d2bffbeaf7fe,bf896ac8d3ecadd6dba1dfbf50110afcbf5d3268
NU_ANO,2016,2016,2016,2016,2016
CO_MUNICIPIO_RESIDENCIA,4314902,2304707,2304400,3304557,1302603
NO_MUNICIPIO_RESIDENCIA,Porto Alegre,Granja,Fortaleza,Rio de Janeiro,Manaus
CO_UF_RESIDENCIA,43,23,23,33,13
SG_UF_RESIDENCIA,RS,CE,CE,RJ,AM
NU_IDADE,24,17,21,25,28
TP_SEXO,M,F,F,F,M
TP_ESTADO_CIVIL,0,0,0,0,0


In [3]:
numeric_features = train.select_dtypes(include=[np.number])
corr = numeric_features.corr()
print (corr['IN_TREINEIRO'].sort_values(ascending=False)[:10], '\n')

IN_TREINEIRO       1.000000
TP_ST_CONCLUSAO    0.533983
TP_PRESENCA_CN     0.094692
TP_PRESENCA_CH     0.094692
TP_PRESENCA_LC     0.092454
TP_PRESENCA_MT     0.092454
Q034               0.055265
Q039               0.040348
Q005               0.033411
Q038               0.018855
Name: IN_TREINEIRO, dtype: float64 



In [4]:
features = ['TP_ST_CONCLUSAO', 'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC',
                 'TP_PRESENCA_MT']
features_corr = ['IN_TREINEIRO', 'TP_ST_CONCLUSAO', 'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC',
                 'TP_PRESENCA_MT']
df_train = train[features_corr]
df_test = test[features]

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13730 entries, 0 to 13729
Data columns (total 6 columns):
IN_TREINEIRO       13730 non-null int64
TP_ST_CONCLUSAO    13730 non-null int64
TP_PRESENCA_CN     13730 non-null int64
TP_PRESENCA_CH     13730 non-null int64
TP_PRESENCA_LC     13730 non-null int64
TP_PRESENCA_MT     13730 non-null int64
dtypes: int64(6)
memory usage: 643.7 KB


In [6]:
df_train.isnull().sum()

IN_TREINEIRO       0
TP_ST_CONCLUSAO    0
TP_PRESENCA_CN     0
TP_PRESENCA_CH     0
TP_PRESENCA_LC     0
TP_PRESENCA_MT     0
dtype: int64

In [7]:
df_test.isnull().sum()

TP_ST_CONCLUSAO    0
TP_PRESENCA_CN     0
TP_PRESENCA_CH     0
TP_PRESENCA_LC     0
TP_PRESENCA_MT     0
dtype: int64

In [8]:
y_train = df_train['IN_TREINEIRO']
df_train.drop('IN_TREINEIRO', axis=1, inplace=True)
x_train = df_train
x_test = df_test[features]
x_train = sc.fit_transform(x_train)  
x_test = sc.transform(x_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [9]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor( 
           criterion='mae', 
           max_depth=8,
           max_leaf_nodes=None,
           min_impurity_split=None,
           min_samples_leaf=1,
           min_samples_split=2,
           min_weight_fraction_leaf=0.0,
           n_estimators= 500,
           n_jobs=-1,
           random_state=0,
           verbose=0,
           warm_start=False
)   

In [10]:
regressor.fit(x_train, y_train) 

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=8,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
                      oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [11]:
x_test = df_test[features] 
x_test = sc.transform(x_test)

In [12]:
y_pred_test = regressor.predict(x_test)
y_pred_train = regressor.predict(x_train)

In [13]:
print('MAE:', metrics.mean_absolute_error(y_train, y_pred_train).round(8)  )
print('MSE:', metrics.mean_squared_error(y_train, y_pred_train).round(8) )  
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train)).round(8))

MAE: 0.03812673
MSE: 0.03804982
RMSE: 0.19506363


In [14]:
y_pred_test

array([0., 0., 1., ..., 0., 0., 0.])

In [15]:
type(y_pred_test)

numpy.ndarray

In [16]:
submission = pd.DataFrame()
submission['NU_INSCRICAO'] = test.NU_INSCRICAO
submission['IN_TREINEIRO'] = y_pred_test.astype(int)

In [17]:
submission.sample(20)

Unnamed: 0,NU_INSCRICAO,IN_TREINEIRO
374,73de61ef5ce55a99c860b90cb6ae40cf5204c74a,0
1575,5580681512f1b8ba033fe61b6bdd4741872cade5,0
3081,653828d5ec6455de87fb37922c90d7a38e7a1793,0
3680,3b4b5258e3d42b2c871fceb4b8d1402463d95a7b,0
992,33e3233640c58dd50233b1f6169af36a2cfb7c96,0
3352,64afa113b956ee1085921c7f57fa8d68f9f7367b,0
403,c296cb821061a1c7c717f1f79adaff624c62493f,0
1732,47dc816b33071b369e044dae469016d2d3d04167,0
4512,6ae44e532f63d0009e8011c564bd113a1e15ae0f,0
3894,a3754ee140aaeff6a7f29f27ead573b6e33fb414,0


In [18]:
submission.to_csv('answer.csv', index=False, header=True)