In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report
import eli5
from eli5.sklearn import PermutationImportance

np.random.seed(42)

dir_path = '../2. Prepared Data/'
save_path = dir_path
file_name = ['20180924_merged_train_sampled_simple.csv','20180924_merged_test_sampled_simple.csv']



## <u>Comparando Modelos</u>:

In [2]:
#UNINDO DATASETS ORIGINALMENTE SEPARADOS
data = pd.read_csv(dir_path+file_name[0]).append(pd.read_csv(dir_path+file_name[1]))

In [3]:
def undersample(data,target):
    min_class = pd.DataFrame(target.value_counts()).reset_index().iloc[-1]['index']
    no_min = len(data[data['TIME'] == min_class])
    under_sample_indices = data[data['TIME'] == min_class].index
    classes = [min_class]
    for c in np.unique(target):
        if c not in classes:
            idx_c = data[data['TIME'] == c].index
            random_indices = np.random.choice(idx_c, no_min, replace=True)
            under_sample_indices = np.concatenate([under_sample_indices,random_indices])
            classes.append(c)
    under_sample = data.iloc[under_sample_indices]
    return under_sample

undersampled_data = undersample(data,data.TIME)
undersampled_data.shape
undersampled_data.to_csv(save_path+'20180926_undersampled_train_plus_test.csv')

In [4]:
#MODELO PRA DADOS RESAMPLEADOS
us_data_X = undersampled_data.drop(['KEY','TIME'],axis=1)
us_data_y = undersampled_data.TIME
X_train,X_test,y_train,y_test = train_test_split(us_data_X,us_data_y,random_state=42,test_size=0.2)

In [5]:
svc = LinearSVC().fit(X_train,y_train)
print('svc done')
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train,y_train)
print('knn done')
tree = DecisionTreeClassifier().fit(X_train,y_train)

svc done
knn done


In [6]:
perm_svc = PermutationImportance(svc).fit(X_test,y_test)
eli5.show_weights(perm_svc,feature_names=list(X_test.columns),top=None)

Weight,Feature
0.0701  ± 0.0089,sao-paulo
0.0684  ± 0.0085,flamengo
0.0316  ± 0.0055,fluminense
0.0302  ± 0.0049,ponte-preta
0.0262  ± 0.0145,corinthians
0.0228  ± 0.0097,botafogo
0.0217  ± 0.0061,atletico-pr
0.0214  ± 0.0070,vitoria
0.0211  ± 0.0028,figueirense
0.0208  ± 0.0050,bahia


In [7]:
perm_knn = PermutationImportance(knn).fit(X_test,y_test)
eli5.show_weights(perm_knn,feature_names=list(X_test.columns),top=None)

Weight,Feature
0.0963  ± 0.0098,flamengo
0.0806  ± 0.0230,palmeiras
0.0627  ± 0.0194,sao-paulo
0.0610  ± 0.0127,corinthians
0.0598  ± 0.0204,santos
0.0527  ± 0.0086,internacional
0.0476  ± 0.0126,cruzeiro
0.0430  ± 0.0066,gremio
0.0385  ± 0.0108,vasco
0.0291  ± 0.0082,fluminense


In [8]:
perm_tree = PermutationImportance(tree).fit(X_test,y_test)
eli5.show_weights(perm_tree,feature_names=list(X_test.columns),top=None)

Weight,Feature
0.0823  ± 0.0116,ESTADO_Sao Paulo
0.0652  ± 0.0066,ESTADO_Rio Grande do Sul
0.0624  ± 0.0190,flamengo
0.0519  ± 0.0053,corinthians
0.0425  ± 0.0042,ESTADO_Pernambuco
0.0379  ± 0.0126,palmeiras
0.0368  ± 0.0101,sao-paulo
0.0353  ± 0.0046,ESTADO_Minas Gerais
0.0333  ± 0.0098,ESTADO_Bahia
0.0285  ± 0.0031,ESTADO_Goias


In [9]:
pred_svc = perm_svc.predict(X_test)
pred_knn = perm_knn.predict(X_test)
pred_tree = perm_tree.predict(X_test)

In [10]:
print('SVC: \n'+classification_report(y_test,pred_svc))
print('KNN \n'+classification_report(y_test,pred_knn))
print('DTREE: \n'+classification_report(y_test,pred_tree))

SVC: 
                 precision    recall  f1-score   support

            abc       0.25      0.15      0.19        20
     america-rn       0.00      0.00      0.00        20
    atletico-mg       0.18      0.30      0.22        27
    atletico-pr       0.92      0.61      0.73        18
          bahia       0.18      0.78      0.30        18
       botafogo       0.50      0.54      0.52        24
     bragantino       0.00      0.00      0.00        16
    chapecoense       0.03      0.07      0.04        14
    corinthians       0.33      0.32      0.33        47
       cruzeiro       0.00      0.00      0.00        20
    figueirense       1.00      0.67      0.80        18
       flamengo       0.48      0.71      0.57        66
     fluminense       0.74      0.65      0.69        26
          goias       0.90      0.45      0.60        20
         gremio       0.44      0.15      0.22        27
gremio-prudente       0.00      0.00      0.00        22
          icasa       0.

  'precision', 'predicted', average, warn_for)


In [11]:
#MODELO PARA DADOS RESAMPLEADOS NORMALIZADOS
us_data_X_norm = MinMaxScaler().fit_transform(undersampled_data.drop(['KEY','TIME'],axis=1))
X_train_norm,X_test_norm,y_train,y_test = train_test_split(us_data_X,us_data_y,\
                                                           random_state=42,test_size=0.2)
svc = LinearSVC().fit(X_train_norm,y_train)
print('svc done')
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train_norm,y_train)
print('knn done')
tree = DecisionTreeClassifier().fit(X_train_norm,y_train)
print('tree done')
lr = LogisticRegression(max_iter=5000).fit(X_train_norm,y_train)

svc done
knn done
tree done


In [12]:
perm_svc = PermutationImportance(svc).fit(X_test_norm,y_test)
eli5.show_weights(perm_svc,feature_names=list(X_test.columns),top=None)

Weight,Feature
0.0399  ± 0.0146,palmeiras
0.0248  ± 0.0014,ponte-preta
0.0228  ± 0.0040,goias
0.0205  ± 0.0043,figueirense
0.0199  ± 0.0067,atletico-pr
0.0177  ± 0.0039,cruzeiro
0.0177  ± 0.0088,botafogo
0.0165  ± 0.0076,santos
0.0160  ± 0.0104,flamengo
0.0145  ± 0.0042,sport


In [13]:
perm_knn = PermutationImportance(knn).fit(X_test_norm,y_test)
eli5.show_weights(perm_knn,feature_names=list(X_test.columns),top=None)

Weight,Feature
0.0934  ± 0.0138,flamengo
0.0769  ± 0.0160,palmeiras
0.0655  ± 0.0143,sao-paulo
0.0652  ± 0.0170,santos
0.0624  ± 0.0127,corinthians
0.0533  ± 0.0157,internacional
0.0464  ± 0.0132,cruzeiro
0.0405  ± 0.0056,vasco
0.0390  ± 0.0104,gremio
0.0322  ± 0.0129,fluminense


In [14]:
perm_tree = PermutationImportance(tree).fit(X_test_norm,y_test)
eli5.show_weights(perm_tree,feature_names=list(X_test.columns),top=None)

Weight,Feature
0.0877  ± 0.0168,ESTADO_Sao Paulo
0.0704  ± 0.0181,ESTADO_Rio Grande do Sul
0.0573  ± 0.0180,flamengo
0.0470  ± 0.0107,corinthians
0.0436  ± 0.0086,ESTADO_Pernambuco
0.0382  ± 0.0061,ESTADO_Bahia
0.0370  ± 0.0062,ESTADO_Minas Gerais
0.0339  ± 0.0167,palmeiras
0.0333  ± 0.0082,sao-paulo
0.0288  ± 0.0071,ESTADO_Goias


In [15]:
perm_lr = PermutationImportance(lr).fit(X_test_norm,y_test)
eli5.show_weights(perm_lr,feature_names=list(X_test.columns),top=None)

Weight,Feature
0.0496  ± 0.0081,flamengo
0.0436  ± 0.0112,sao-paulo
0.0422  ± 0.0088,palmeiras
0.0379  ± 0.0053,internacional
0.0370  ± 0.0057,ponte-preta
0.0368  ± 0.0148,corinthians
0.0274  ± 0.0112,ESTADO_Rio Grande do Sul
0.0231  ± 0.0021,figueirense
0.0228  ± 0.0060,vasco
0.0222  ± 0.0064,santos


In [16]:
pred_svc = perm_svc.predict(X_test)
pred_knn = perm_knn.predict(X_test)
pred_tree = perm_tree.predict(X_test)
pred_lr = perm_lr.predict(X_test)
print('SVC: accuracy: {} \n'.format(np.mean(pred_svc==y_test))+classification_report(y_test,pred_svc))
print('KNN: accuracy: {} \n'.format(np.mean(pred_knn==y_test))+classification_report(y_test,pred_knn))
print('DTREE: accuracy: {} \n'.format(np.mean(pred_tree==y_test))+classification_report(y_test,pred_tree))
print('LR: accuracy: {} \n'.format(np.mean(pred_lr==y_test))+classification_report(y_test,pred_lr))

SVC: accuracy: 0.21367521367521367 
                 precision    recall  f1-score   support

            abc       0.23      0.15      0.18        20
     america-rn       0.00      0.00      0.00        20
    atletico-mg       0.43      0.22      0.29        27
    atletico-pr       0.65      0.61      0.63        18
          bahia       0.06      0.50      0.10        18
       botafogo       0.56      0.62      0.59        24
     bragantino       0.00      0.00      0.00        16
    chapecoense       0.00      0.00      0.00        14
    corinthians       0.00      0.00      0.00        47
       cruzeiro       0.00      0.00      0.00        20
    figueirense       1.00      0.67      0.80        18
       flamengo       0.04      0.02      0.02        66
     fluminense       0.00      0.00      0.00        26
          goias       1.00      0.65      0.79        20
         gremio       0.08      0.11      0.09        27
gremio-prudente       0.00      0.00      0.00     

  'precision', 'predicted', average, warn_for)


In [17]:
#MODELOS COM FEATURE SELECTION

columns = list(X_train.columns)
cutoff = 0
drop_feats_svc = perm_svc.feature_importances_ < cutoff
drop_feats_knn = perm_knn.feature_importances_ < cutoff
drop_feats_tree = perm_tree.feature_importances_ < cutoff
drop_feats_lr = perm_lr.feature_importances_ < cutoff

selected_svc_train = X_train.drop([columns[i] for i in range(len(columns)) if drop_feats_svc[i]],axis=1)
selected_knn_train = X_train.drop([columns[i] for i in range(len(columns)) if drop_feats_knn[i]],axis=1)
selected_tree_train = X_train.drop([columns[i] for i in range(len(columns)) if drop_feats_tree[i]],axis=1)
selected_lr_train = X_train.drop([columns[i] for i in range(len(columns)) if drop_feats_lr[i]],axis=1)
selected_svc_test = X_test.drop([columns[i] for i in range(len(columns)) if drop_feats_svc[i]],axis=1)
selected_knn_test = X_test.drop([columns[i] for i in range(len(columns)) if drop_feats_knn[i]],axis=1)
selected_tree_test = X_test.drop([columns[i] for i in range(len(columns)) if drop_feats_tree[i]],axis=1)
selected_lr_test = X_test.drop([columns[i] for i in range(len(columns)) if drop_feats_lr[i]],axis=1)

selected_svc = LinearSVC().fit(selected_svc_train,y_train)
print('svc done')
selected_knn = KNeighborsClassifier(n_neighbors=10).fit(selected_knn_train,y_train)
print('knn done')
selected_tree = DecisionTreeClassifier().fit(selected_tree_train,y_train)
print('tree done')
selected_lr = LogisticRegression(max_iter=5000).fit(selected_lr_train,y_train)

svc done
knn done
tree done


In [18]:
pred_svc = selected_svc.predict(selected_svc_test)
pred_knn = selected_knn.predict(selected_knn_test)
pred_tree = selected_tree.predict(selected_tree_test)
pred_lr = selected_lr.predict(selected_lr_test)
print('SVC: accuracy: {} \n'.format(np.mean(pred_svc==y_test))+classification_report(y_test,pred_svc))
print('KNN: accuracy: {} \n'.format(np.mean(pred_knn==y_test))+classification_report(y_test,pred_knn))
print('DTREE: accuracy: {} \n'.format(np.mean(pred_tree==y_test))+classification_report(y_test,pred_tree))
print('LR: accuracy: {} \n'.format(np.mean(pred_lr==y_test))+classification_report(y_test,pred_lr))

SVC: accuracy: 0.2378917378917379 
                 precision    recall  f1-score   support

            abc       0.23      0.15      0.18        20
     america-rn       0.00      0.00      0.00        20
    atletico-mg       0.57      0.30      0.39        27
    atletico-pr       0.00      0.00      0.00        18
          bahia       1.00      0.67      0.80        18
       botafogo       0.00      0.00      0.00        24
     bragantino       0.08      0.19      0.11        16
    chapecoense       0.00      0.00      0.00        14
    corinthians       0.45      0.38      0.41        47
       cruzeiro       0.61      0.70      0.65        20
    figueirense       1.00      0.56      0.71        18
       flamengo       0.00      0.00      0.00        66
     fluminense       0.00      0.00      0.00        26
          goias       0.40      0.60      0.48        20
         gremio       0.00      0.00      0.00        27
gremio-prudente       0.00      0.00      0.00      

  'precision', 'predicted', average, warn_for)


In [19]:
#SELECIONAMOS MELHORES MODELOS: LR E DECISION TREE COM FEATURE SELECTION PARA AVALIAR COM A BASE COMPLETA

#TREE
X_data = data.drop(['KEY','TIME'],axis=1)
y_data = data.TIME
X_data.drop([columns[i] for i in range(len(columns)) if drop_feats_tree[i]],axis=1,inplace=True)
X_data = MinMaxScaler().fit_transform(X_data)

In [20]:
pred_tree = selected_tree.predict(X_data)
print('DTREE: accuracy: {} \n'.format(np.mean(pred_tree==y_data))+classification_report(y_data,pred_tree))

DTREE: accuracy: 0.11634512197782448 
                 precision    recall  f1-score   support

            abc       0.00      0.00      0.00      1942
     america-rn       0.06      0.61      0.11      1165
    atletico-mg       0.17      0.65      0.27     29474
    atletico-pr       0.09      0.64      0.15      6827
          bahia       0.17      0.63      0.26     11915
       botafogo       0.00      0.00      0.00     27385
     bragantino       0.00      0.00      0.00       127
    chapecoense       0.00      0.00      0.00      2137
    corinthians       0.00      0.00      0.00    149301
       cruzeiro       0.00      0.00      0.00     48892
    figueirense       0.04      0.66      0.07      2114
       flamengo       0.00      0.00      0.00    194013
     fluminense       0.00      0.00      0.00     27225
          goias       0.06      0.61      0.11      2216
         gremio       0.04      0.00      0.01     43692
gremio-prudente       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)


In [21]:
#LR

X_data = data.drop(['KEY','TIME'],axis=1)
y_data = data.TIME
X_data.drop([columns[i] for i in range(len(columns)) if drop_feats_lr[i]],axis=1,inplace=True)
X_data = MinMaxScaler().fit_transform(X_data)

In [22]:
pred_lr = selected_lr.predict(X_data)
print('LR: accuracy: {} \n'.format(np.mean(pred_lr==y_data))+classification_report(y_data,pred_lr))

LR: accuracy: 0.19943111990218088 
                 precision    recall  f1-score   support

            abc       0.00      0.00      0.00      1942
     america-rn       0.06      0.61      0.11      1165
    atletico-mg       0.00      0.00      0.00     29474
    atletico-pr       0.09      0.64      0.15      6827
          bahia       0.00      0.00      0.00     11915
       botafogo       0.00      0.00      0.00     27385
     bragantino       0.00      0.00      0.00       127
    chapecoense       0.03      0.51      0.06      2137
    corinthians       0.24      0.64      0.35    149301
       cruzeiro       0.28      0.64      0.39     48892
    figueirense       0.00      0.00      0.00      2114
       flamengo       0.00      0.00      0.00    194013
     fluminense       0.00      0.00      0.00     27225
          goias       0.06      0.61      0.11      2216
         gremio       0.41      0.47      0.44     43692
gremio-prudente       0.00      0.00      0.00      

  'precision', 'predicted', average, warn_for)
