In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from math import sqrt
import plotly
import plotly.offline as py
from plotly.offline import plot, iplot
plotly.offline.init_notebook_mode(connected=True)
from yellowbrick.features import FeatureImportances

In [2]:
from pycaret.classification import *
import scipy.stats as sct
import statsmodels.api as sm
import statsmodels.stats as st
from sklearn.decomposition import PCA
import warnings

In [3]:
mt = pd.read_csv('metadata.csv', sep=',')
mt

Unnamed: 0,Variavel cod,Variavel tipo
0,id,Qualitativo nominal
1,var1,Qualitativo nominal
2,var2,Qualitativo nominal
3,var3,Qualitativo nominal
4,var4,Qualitativo nominal
5,var5,Qualitativo nominal
6,var6,Qualitativo nominal
7,var7,Qualitativo nominal
8,var8,Qualitativo nominal
9,var9,Qualitativo nominal


In [4]:
warnings.filterwarnings("ignore")

plt.style.use('ggplot')

def titulo(title, y= 1):
    plt.title(title, fontsize= 20, color= 'grey', loc= 'left', fontweight= 'bold', y= y)

def labels(x, y):
    plt.xlabel(x)
    plt.ylabel(y)
    

def valor_grafico(data, porcentagem= 0):
    for patch in ax.patches:
        height= patch.get_height()
        width= patch.get_width()
        left_coord= patch.get_x()
        porcent= height / len(data) *100
        if porcentagem == 0:                
            ax.text(left_coord + width/2, height, height, ha= 'center')
        else:
            ax.text(left_coord + width/2, height, '{}({:.2f}%)'.format(height, porcent), ha= 'center')
        
def cramer_v(x, y):
    cm= np.asmatrix(pd.crosstab(x,y))
    n= cm.sum()
    r, k= cm.shape
    
    chi2= ss.chi2_contingency(cm)[0]
    chi2corr= max(0, chi2 - (k - 1) * (r-1)/n - 1)
    kcorr= k - (k - 1) ** 2 / (n - 1)
    rcorr= r - (r - 1) ** 2 / (n - 1)
    
    return np.sqrt((chi2corr / n) / (min(kcorr - 1, rcorr - 1)))

In [5]:
mt['Variavel tipo'].value_counts()

Qualitativo nominal      36
Quantitativo discreto    18
Quantitativo continua    12
Qualitativo ordinal       4
Name: Variavel tipo, dtype: int64

- Variável qualitativa nominal = valores que expressam atributos, sem nenhum tipo de ordem. Ex: cor dos olhos, sexo, estado civil, presença ou ausência...


- Variável qualitativa ordinal = valores que expressam atributos, porém com algum tipo de ordem, ou grau. Ex: grau de escolaridade (1º grau, 2º grau, 3ºgrau, pós-graduação...); resposta de um paciente (nenhuma melhora, alguma melhora, muita melhora); classe social (alta, média, baixa)... 


- Variável quantitativa discreta = valores observados somente em pontos isolados ao longo de uma escala de valores (contagem). Valores positivos inteiros (incluindo o zero). Ex: No de filhos; No de faltas; alunos com notas abaixo de 5,0. 


- Variável quantitativa contínua = valores em qualquer ponto fracionário ao longo de um intervalo especificado de valores (medição). Ex: temperatura do corpo; altura (em metros); índice do PIB...

In [6]:
train = pd.read_csv('train.csv', sep=',')
train

Unnamed: 0,id,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10,var11,var12,var13,var14,var15,var16,var17,var18,var19,var20,var21,var22,var23,var24,var25,var26,var27,var28,var29,var30,var31,var32,var33,var34,var35,var36,var37,var38,var39,var40,var41,var42,var43,var44,var45,var46,var47,var48,var49,var50,var51,var52,var53,var54,var55,var56,var57,var58,var59,var60,var61,var62,var63,var64,var65,var66,var67,var68,y
0,1,18,19,2853,29442,1386,2435,35,-999,3,63,6498,1166,2007,26,13,11,11,4,1547,26,2068,1,3,0,4,7,0,24,4,2,0,16,3,44,463,27,2,0,4,9,3,25,6,1,4,3,1,0,0,0,0,42,1,1,0.212414,0.137,0.833333,0.037822,0.058070,0.311441,0.142303,0.056146,0.632694,0.024054,0.253356,0.00603,0.132353,0.139706,1
1,8,4,110,1986,13684,7189,-999,-999,17,3,63,13989,497,2289,16,1,3,3,3,1797,16,2417,5,1,2,1,6,1,4,2,1,0,14,0,1,532,2,1,0,4,7,3,30,3,0,0,0,0,0,0,0,0,20,1,1,0.228784,0.308,0.305376,0.069325,0.248909,-999.000000,-999.000000,0.070991,0.773966,0.019315,-999.000000,-999.00000,0.147059,0.106618,0
2,30,0,39,1019,10232,678,791,16,-999,3,63,9739,562,641,10,34,34,10,4,511,10,664,5,3,2,1,7,0,3,4,1,0,11,1,3,81,26,3,0,4,3,3,23,5,1,0,0,0,0,0,0,0,12,1,1,0.204636,0.213,0.451613,0.018639,0.214520,-999.000000,0.200814,0.051046,0.980827,0.018536,-999.000000,-999.00000,0.382353,0.242647,0
3,43,20,39,1751,2689,8235,1042,13,10,1,14,2890,6541,811,8,59,60,23,5,624,8,839,0,0,0,0,1,1,19,1,2,0,8,0,11,414,27,0,0,4,10,3,22,10,1,1,1,0,1,1,0,0,36,1,1,0.208299,0.716,0.101075,0.204753,0.349421,-999.000000,0.352379,0.044301,0.951564,0.023684,0.363370,0.00201,0.147059,0.132353,0
4,46,7,44,2262,29428,6031,304,16,-999,3,63,13541,7238,260,10,55,56,20,5,1413,10,270,2,3,2,1,8,1,3,4,1,0,17,4,3,567,8,3,2,4,5,3,26,5,1,0,0,0,0,1,0,0,19,1,1,0.222896,0.596,0.101075,0.140394,0.189641,0.021226,0.226161,0.059125,0.906155,0.020733,-999.000000,-999.00000,0.455882,0.132353,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14118,35295,4,39,2511,28766,1109,2094,31,24,3,-999,5082,-999,1678,23,55,56,20,5,198,23,1739,5,4,2,1,7,0,26,5,1,0,17,5,12,314,0,1,0,4,2,3,23,5,1,0,0,0,0,0,0,0,25,2,2,0.227307,0.048,0.978495,0.063199,0.122448,-999.000000,0.069347,0.036178,0.869828,0.016160,-999.000000,-999.00000,0.455882,0.147059,0
14119,35296,19,129,1114,-999,6376,-999,-999,27,-999,-999,-999,-999,1387,13,-999,-999,-999,-999,719,13,1445,5,1,1,2,-999,0,24,4,1,0,13,6,25,532,27,1,0,4,10,1,26,13,1,0,0,0,0,0,1,0,-999,0,0,0.210104,-999.000,-999.000000,-999.000000,-999.000000,-999.000000,0.295418,0.055899,0.921292,0.018293,-999.000000,-999.00000,0.147059,0.198529,0
14120,35301,27,44,1786,23761,9048,623,35,27,3,14,1249,882,358,26,34,34,10,4,482,26,380,5,3,2,1,7,0,24,4,1,0,17,3,44,362,2,0,0,4,1,3,25,4,1,0,0,0,0,0,0,0,16,2,2,0.217944,0.029,1.000000,0.034263,0.207001,-999.000000,0.203394,0.040697,0.930373,0.009440,-999.000000,-999.00000,0.397059,0.158088,0
14121,35304,4,89,210,19593,3634,2453,35,27,1,63,21128,3849,2033,26,55,56,20,5,1573,26,2093,3,3,0,0,7,0,24,4,2,0,16,2,24,553,0,18,0,4,1,3,28,14,1,0,0,0,0,1,0,1,21,2,2,0.209336,0.154,0.833333,0.022503,0.200405,0.070788,0.381038,0.100384,0.680121,0.041096,-999.000000,-999.00000,0.264706,0.128676,0


In [7]:
test = pd.read_csv('test.csv', sep=',')
test

Unnamed: 0,id,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10,var11,var12,var13,var14,var15,var16,var17,var18,var19,var20,var21,var22,var23,var24,var25,var26,var27,var28,var29,var30,var31,var32,var33,var34,var35,var36,var37,var38,var39,var40,var41,var42,var43,var44,var45,var46,var47,var48,var49,var50,var51,var52,var53,var54,var55,var56,var57,var58,var59,var60,var61,var62,var63,var64,var65,var66,var67,var68
0,0,5,126,1353,28956,743,1289,27,-999,1,33,4530,914,991,19,1,3,3,3,1155,19,1031,5,3,2,3,7,0,25,4,1,0,11,2,62,413,27,0,0,4,6,3,24,3,1,0,0,0,0,0,0,0,44,1,1,0.217528,0.272,0.367742,0.062900,0.201839,0.353965,0.166641,0.049108,0.986882,0.016683,-999.0,-999.0,0.176471,0.253676
1,2,6,126,1446,7803,5151,935,35,-999,3,63,8731,1341,2033,26,58,58,22,5,1299,26,773,5,3,1,2,1,0,25,4,0,1,6,5,58,692,21,15,8,4,0,1,30,0,1,0,0,0,0,0,0,0,13,1,1,0.221968,0.853,0.053763,0.177047,0.072127,0.074555,0.217009,0.144403,0.892028,0.038323,-999.0,-999.0,0.147059,0.099265
2,4,5,44,243,4325,1109,1903,33,24,1,63,10131,914,1503,24,60,61,23,5,1294,24,1562,5,4,1,0,1,0,26,5,2,0,17,0,12,553,0,18,0,4,10,3,26,14,1,0,0,0,0,0,0,0,36,1,1,0.213224,0.632,0.101075,0.210879,0.324770,0.384992,0.330680,0.072864,0.930373,0.021052,-999.0,-999.0,0.294118,0.136029
3,7,4,53,419,743,7750,183,35,-999,3,14,636,5879,146,26,22,22,10,4,811,26,152,5,3,2,4,4,0,24,4,2,0,12,5,38,662,28,1,0,4,9,3,28,7,1,0,0,0,0,0,1,0,13,1,1,0.205044,0.117,0.935484,0.007068,0.131070,-999.000000,0.244936,0.158088,0.986882,0.022649,-999.0,-999.0,0.294118,0.220588
4,15,4,126,1863,22693,5625,965,9,-999,3,63,24967,4427,772,5,73,73,29,5,595,5,796,0,1,0,1,7,0,11,2,1,0,21,3,18,546,0,1,0,4,6,3,28,8,1,0,0,0,0,1,0,0,51,1,1,0.203750,0.079,0.967742,0.024989,0.225166,0.059940,0.252794,0.080405,0.944501,0.021806,-999.0,-999.0,0.352941,0.113971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21178,35297,4,126,1460,13335,9048,620,35,27,3,63,13567,2617,572,26,37,35,11,4,1573,26,592,5,3,2,1,1,0,24,4,1,0,21,6,42,332,21,20,9,1,11,3,25,13,1,0,0,0,0,0,0,0,34,2,2,0.213106,0.065,0.967742,0.387701,0.148933,-999.000000,0.326307,0.132833,0.968718,0.039626,-999.0,-999.0,0.323529,0.253676
21179,35298,18,19,532,14837,2590,855,27,20,3,63,15367,2261,678,19,-999,-999,-999,-999,1383,19,700,0,3,2,1,-999,0,25,4,1,0,20,0,60,578,2,0,0,4,2,3,26,6,1,0,0,0,0,0,0,0,10,2,2,0.210435,0.740,0.101075,-999.000000,0.179243,0.205030,0.229354,0.052108,0.940464,0.016952,-999.0,-999.0,0.088235,0.209559
21180,35300,4,126,2923,16685,3162,1604,35,-999,3,63,17658,4369,1279,26,-999,-999,-999,-999,971,26,1334,5,3,0,1,-999,0,24,4,1,0,12,5,49,74,28,2,0,4,4,3,19,4,1,0,0,0,0,0,0,0,39,2,2,0.218353,0.288,0.367742,-999.000000,0.237607,0.810448,0.179781,0.029155,0.745711,0.020158,-999.0,-999.0,0.205882,0.161765
21181,35302,5,-999,-999,367,7637,1389,35,-999,3,63,28370,679,1117,26,34,34,10,4,868,26,1158,5,3,2,1,9,0,24,4,1,0,15,5,54,434,0,1,0,4,19,3,23,6,1,0,0,0,0,0,0,0,3,2,2,0.224865,0.324,0.246237,0.199008,0.102662,-999.000000,0.149598,0.032583,0.891019,0.012596,-999.0,-999.0,-999.000000,0.246324


In [8]:
print('O dataset possui {} linhas.'.format(train.shape[0]))
print('O dataset possui {} colunas.'.format(train.shape[1]))

O dataset possui 14123 linhas.
O dataset possui 70 colunas.


In [9]:
train

Unnamed: 0,id,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10,var11,var12,var13,var14,var15,var16,var17,var18,var19,var20,var21,var22,var23,var24,var25,var26,var27,var28,var29,var30,var31,var32,var33,var34,var35,var36,var37,var38,var39,var40,var41,var42,var43,var44,var45,var46,var47,var48,var49,var50,var51,var52,var53,var54,var55,var56,var57,var58,var59,var60,var61,var62,var63,var64,var65,var66,var67,var68,y
0,1,18,19,2853,29442,1386,2435,35,-999,3,63,6498,1166,2007,26,13,11,11,4,1547,26,2068,1,3,0,4,7,0,24,4,2,0,16,3,44,463,27,2,0,4,9,3,25,6,1,4,3,1,0,0,0,0,42,1,1,0.212414,0.137,0.833333,0.037822,0.058070,0.311441,0.142303,0.056146,0.632694,0.024054,0.253356,0.00603,0.132353,0.139706,1
1,8,4,110,1986,13684,7189,-999,-999,17,3,63,13989,497,2289,16,1,3,3,3,1797,16,2417,5,1,2,1,6,1,4,2,1,0,14,0,1,532,2,1,0,4,7,3,30,3,0,0,0,0,0,0,0,0,20,1,1,0.228784,0.308,0.305376,0.069325,0.248909,-999.000000,-999.000000,0.070991,0.773966,0.019315,-999.000000,-999.00000,0.147059,0.106618,0
2,30,0,39,1019,10232,678,791,16,-999,3,63,9739,562,641,10,34,34,10,4,511,10,664,5,3,2,1,7,0,3,4,1,0,11,1,3,81,26,3,0,4,3,3,23,5,1,0,0,0,0,0,0,0,12,1,1,0.204636,0.213,0.451613,0.018639,0.214520,-999.000000,0.200814,0.051046,0.980827,0.018536,-999.000000,-999.00000,0.382353,0.242647,0
3,43,20,39,1751,2689,8235,1042,13,10,1,14,2890,6541,811,8,59,60,23,5,624,8,839,0,0,0,0,1,1,19,1,2,0,8,0,11,414,27,0,0,4,10,3,22,10,1,1,1,0,1,1,0,0,36,1,1,0.208299,0.716,0.101075,0.204753,0.349421,-999.000000,0.352379,0.044301,0.951564,0.023684,0.363370,0.00201,0.147059,0.132353,0
4,46,7,44,2262,29428,6031,304,16,-999,3,63,13541,7238,260,10,55,56,20,5,1413,10,270,2,3,2,1,8,1,3,4,1,0,17,4,3,567,8,3,2,4,5,3,26,5,1,0,0,0,0,1,0,0,19,1,1,0.222896,0.596,0.101075,0.140394,0.189641,0.021226,0.226161,0.059125,0.906155,0.020733,-999.000000,-999.00000,0.455882,0.132353,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14118,35295,4,39,2511,28766,1109,2094,31,24,3,-999,5082,-999,1678,23,55,56,20,5,198,23,1739,5,4,2,1,7,0,26,5,1,0,17,5,12,314,0,1,0,4,2,3,23,5,1,0,0,0,0,0,0,0,25,2,2,0.227307,0.048,0.978495,0.063199,0.122448,-999.000000,0.069347,0.036178,0.869828,0.016160,-999.000000,-999.00000,0.455882,0.147059,0
14119,35296,19,129,1114,-999,6376,-999,-999,27,-999,-999,-999,-999,1387,13,-999,-999,-999,-999,719,13,1445,5,1,1,2,-999,0,24,4,1,0,13,6,25,532,27,1,0,4,10,1,26,13,1,0,0,0,0,0,1,0,-999,0,0,0.210104,-999.000,-999.000000,-999.000000,-999.000000,-999.000000,0.295418,0.055899,0.921292,0.018293,-999.000000,-999.00000,0.147059,0.198529,0
14120,35301,27,44,1786,23761,9048,623,35,27,3,14,1249,882,358,26,34,34,10,4,482,26,380,5,3,2,1,7,0,24,4,1,0,17,3,44,362,2,0,0,4,1,3,25,4,1,0,0,0,0,0,0,0,16,2,2,0.217944,0.029,1.000000,0.034263,0.207001,-999.000000,0.203394,0.040697,0.930373,0.009440,-999.000000,-999.00000,0.397059,0.158088,0
14121,35304,4,89,210,19593,3634,2453,35,27,1,63,21128,3849,2033,26,55,56,20,5,1573,26,2093,3,3,0,0,7,0,24,4,2,0,16,2,24,553,0,18,0,4,1,3,28,14,1,0,0,0,0,1,0,1,21,2,2,0.209336,0.154,0.833333,0.022503,0.200405,0.070788,0.381038,0.100384,0.680121,0.041096,-999.000000,-999.00000,0.264706,0.128676,0


In [10]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
info = pd.DataFrame({'type': train.dtypes,
                            'percentage': (train.isna().sum() / train.shape[0]) * 100,
                            'unique values': train.nunique(),
                            'mediana': train.median(),
                            'média': train.mean(),
                            'desvio_padrão': train.std(),
                            'assimetria': train.skew(),
                            'Curtose': train.kurtosis(),
                            'Variância': train.var(),
                            'Máximo': train.max(),
                            'Minimo': train.min()})
                            
info

Unnamed: 0,type,percentage,unique values,mediana,média,desvio_padrão,assimetria,Curtose,Variância,Máximo,Minimo
id,int64,0.0,14123,17464.0,17474.649366,10249.066602,0.016119,-1.201419,105043400.0,35306.0,1.0
var1,int64,0.0,29,5.0,9.712667,7.269559,0.550727,-1.384229,52.84648,30.0,0.0
var2,int64,0.0,85,53.0,23.966579,218.429664,-4.328588,17.425478,47711.52,129.0,-999.0
var3,int64,0.0,2443,1461.0,1584.671245,1118.101765,-0.121279,-0.589888,1250152.0,3546.0,-999.0
var4,int64,0.0,13094,15309.0,15179.176025,9994.295223,-0.013928,-1.207012,99885940.0,32403.0,-999.0
var5,int64,0.0,6296,6159.0,5543.697869,3329.493664,-0.057312,-1.269508,11085530.0,11373.0,0.0
var6,int64,0.0,1779,1558.0,1315.314168,1210.074896,-0.633578,-0.749957,1464281.0,3001.0,-999.0
var7,int64,0.0,33,33.0,-109.713729,350.20242,-2.14407,2.602658,122641.7,36.0,-999.0
var8,int64,0.0,29,24.0,-138.42668,371.128392,-1.886966,1.563256,137736.3,28.0,-999.0
var9,int64,0.0,4,3.0,-106.362388,311.608454,-2.515883,4.330356,97099.83,3.0,-999.0


In [12]:
import sweetviz as sv
import dtale


In [13]:
del train['id']

In [14]:
my_report = sv.analyze(train, target_feat='y')
my_report.show_html() 

                                             |          | [  0%]   00:00 -> (? left)

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [15]:
df = dtale.show(train)
df



## 1.1 Estatística Descritiva

In [16]:
# Variáveis Numéricas
num_att= train[['var1','var2', 'var3', 'var4', 'var5', 'var6', 'var7', 'var8', 'var10', 'var11', 'var12', 'var13',
             'var14', 'var15','var16', 'var17', 'var19', 'var20', 'var21', 'var26', 'var28', 'var32', 'var34',
             'var35', 'var36', 'var37', 'var38', 'var40', 'var42', 'var43', 'var45', 'var46', 'var52', 'var55',
             'var56', 'var57', 'var58', 'var59', 'var60', 'var61', 'var62', 'var63', 'var64', 'var65', 'var66',
             'var67', 'var68']]


# Variáveis Categóricas
cat_att= train[['var9', 'var18', 'var22', 'var23', 'var24', 'var25', 'var27', 'var29', 'var30', 'var31', 'var33',
               'var39', 'var41', 'var44', 'var47', 'var48', 'var49', 'var50', 'var51', 'var53', 'var54', 'y']]

### 1.1.1. Atributos Numéricos

In [17]:
num_att.describe().round(4)

Unnamed: 0,var1,var2,var3,var4,var5,var6,var7,var8,var10,var11,var12,var13,var14,var15,var16,var17,var19,var20,var21,var26,var28,var32,var34,var35,var36,var37,var38,var40,var42,var43,var45,var46,var52,var55,var56,var57,var58,var59,var60,var61,var62,var63,var64,var65,var66,var67,var68
count,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0,14123.0
mean,9.7127,23.9666,1584.6712,15179.176,5543.6979,1315.3142,-109.7137,-138.4267,-118.9867,13342.2999,3430.0811,1355.3955,19.481,-65.8311,-65.1804,-84.4229,1059.0386,19.7937,1394.8204,-106.1859,19.2375,13.7145,26.6034,394.255,14.3312,7.5856,1.867,6.0103,24.737,6.8551,0.2752,0.2216,-85.2391,0.216,-111.8225,-112.0921,-111.0221,-154.1579,-458.5403,-24.2103,0.0731,0.7792,0.0288,-862.722,-862.7621,-41.2528,-2.1548
std,7.2696,218.4297,1118.1018,9994.2952,3329.4937,1210.0749,350.2024,371.1284,392.4099,9849.8075,3079.8991,642.7993,7.512,307.5026,307.6919,300.5026,537.3776,7.5908,688.8721,315.8798,8.2785,4.9538,18.7932,181.8687,12.4842,8.724,4.0664,4.7832,3.4321,4.0482,0.9147,0.8034,319.2462,0.0226,315.6713,315.7994,314.1588,361.1586,497.9456,154.4853,0.054,0.2341,0.0187,342.9569,342.8559,199.2755,48.2442
min,0.0,-999.0,-999.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,0.0,-999.0,-999.0,-999.0,0.0,0.0,1.0,-999.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,0.0,0.0043,-999.0,-999.0,-999.0,-999.0
25%,4.0,38.0,731.0,6493.5,2374.0,482.0,16.0,10.0,14.0,4525.5,914.0,811.0,13.0,2.0,3.0,3.0,586.0,13.0,796.0,1.0,15.0,10.0,10.0,255.0,2.0,1.0,0.0,2.0,23.0,4.0,0.0,0.0,12.0,0.2052,0.117,0.0312,0.0175,0.0979,-999.0,0.1694,0.0404,0.7064,0.0184,-999.0,-999.0,0.1176,0.136
50%,5.0,53.0,1461.0,15309.0,6159.0,1558.0,33.0,24.0,63.0,13232.0,3213.0,1387.0,24.0,28.0,32.0,10.0,1245.0,26.0,1445.0,5.0,24.0,14.0,25.0,435.0,14.0,3.0,0.0,5.0,25.0,6.0,0.0,0.0,22.0,0.2095,0.371,0.1011,0.0484,0.1795,0.009,0.2396,0.0603,0.8779,0.0235,-999.0,-999.0,0.1765,0.1765
75%,18.0,110.0,2481.0,23818.0,8594.0,2438.0,35.0,27.0,63.0,21839.5,6229.0,2008.0,26.0,58.0,58.0,22.0,1573.0,26.0,2093.0,7.0,24.0,18.0,44.0,533.0,27.0,13.0,0.0,10.0,27.0,9.0,0.0,0.0,34.0,0.2183,0.784,0.4516,0.142,0.2578,0.1523,0.3419,0.0888,0.9304,0.0346,-999.0,-999.0,0.2647,0.2206
max,30.0,129.0,3546.0,32403.0,11373.0,3001.0,36.0,28.0,85.0,30859.0,8800.0,2299.0,27.0,78.0,78.0,32.0,1803.0,27.0,2436.0,10.0,27.0,23.0,68.0,696.0,33.0,56.0,30.0,20.0,31.0,20.0,15.0,15.0,64.0,0.7509,1.0,1.0,1.0,0.9138,1.0,0.935,0.8148,1.0,0.3874,1.0,1.0,0.9118,1.0


### 1.1.2. Atributos Categóricos

In [18]:
for i in cat_att.columns:
    print(i)
    print('Possui {} valores únicos'.format(len(train[i].unique())))
    print('Os valores únicos são: {}'.format(train[i].unique()))
    print('-'*100)

var9
Possui 4 valores únicos
Os valores únicos são: [   3    1 -999    2]
----------------------------------------------------------------------------------------------------
var18
Possui 7 valores únicos
Os valores únicos são: [   4    3    5 -999    2    6    1]
----------------------------------------------------------------------------------------------------
var22
Possui 6 valores únicos
Os valores únicos são: [1 5 0 2 4 3]
----------------------------------------------------------------------------------------------------
var23
Possui 5 valores únicos
Os valores únicos são: [3 1 0 2 4]
----------------------------------------------------------------------------------------------------
var24
Possui 3 valores únicos
Os valores únicos são: [0 2 1]
----------------------------------------------------------------------------------------------------
var25
Possui 5 valores únicos
Os valores únicos são: [4 1 0 3 2]
-------------------------------------------------------------------------

In [19]:
for i in num_att.columns:
    print(i)
    print('Possui {} valores únicos'.format(len(train[i].unique())))
    print('Os valores únicos são: {}'.format(train[i].unique()))
    print('-'*100)

var1
Possui 29 valores únicos
Os valores únicos são: [18  4  0 20  7  5 21  2 19  6 27 30  1  8 16 15 23 24 26  3 28 22 10 11
 17 14 13 12 25]
----------------------------------------------------------------------------------------------------
var2
Possui 85 valores únicos
Os valores únicos são: [  19  110   39   44 -999   89   53  126   38  121  129   57   74   11
   92  111   68  101  119   99  125   22   73  120   62   66  116   26
   86   79  112   82   69   14   45   18  106   60   30  113    5   72
    9   54  108  117  122   49   76   21  123   16  103   80   51   75
   52    6   17   50   94  118   67  104   28   24   77  102  124   64
   81   91    8    1   13    7   59   42   15   58   37   55  128   56
    2]
----------------------------------------------------------------------------------------------------
var3
Possui 2443 valores únicos
Os valores únicos são: [2853 1986 1019 ...   26 2770 2511]
------------------------------------------------------------------------------

# 2.0 - Análise Exploratória dos Dados

In [20]:
train2 = train.copy()

## 2.1 -  Análise Univariada

### 2.1.1 - Variável Resposta

In [22]:
plt.figure(figsize=(15,5))
splot=sns.countplot(data=train2,x='y',palette='GnBu')
sns.set_style('ticks')
total = float(len(train2))
for p in splot.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    x = p.get_x() + p.get_width()
    y = p.get_height()
    splot.annotate(percentage,(x,y),ha = 'center', va = 'center')
plt.title("% Total de aquisição produto")
plt.xlabel("Categorias")
plt.ylabel("Aquisição de Produto")

Text(0, 0.5, 'Aquisição de Produto')

### 2.1.1 - Variável Numérica Contínua

In [23]:
sns.set_style('dark')
cat_att.hist(bins=30,figsize=(30,30),color='navy',);

In [24]:
sns.set_style('dark')
num_att.hist(bins=30,figsize=(30,30),color='navy',);

In [25]:
sns.heatmap(cat_att.corr(),cmap='coolwarm',annot=True)
plt.rcParams['figure.figsize'] = (25.5, 25.5)
plt.title('Correlação entre as variáveis ', fontsize=15);

In [26]:
sns.heatmap(num_att.corr(),cmap='coolwarm',annot=True)
plt.rcParams['figure.figsize'] = (50.5, 50.5)
plt.title('Correlação entre as variáveis ', fontsize=15);

In [27]:
plt.figure(figsize= (30, 20.5))

n= 1
for i in train[num_att.columns]:
    plt.subplot(47, 2, n)
    sns.distplot(train[i])
    titulo(i)
    labels('', 'Densidade')
    n+= 1

plt.tight_layout()

In [29]:
plt.figure(figsize= (30, 20.5))

n= 1
for i in train[num_att.columns]:    
    plt.subplot(47, 2, n)
    sns.boxplot(x= train[i])
    titulo(i)
    labels('', '')
    n+= 1

plt.tight_layout()

Apesar do gráfico não ter ficado muito bom, decorrente do da grande quantidade de variáveis numérica, podemos tirar duas conclusões:
- Nenhuma das variáveis, apresentou uma densidade bem distribuída, parecendo um sino. 
- Muitas variáveis apresentaram outliers. Esses terão que ser melhor inspecionados e analisados. 

# Pré Processamento dos Dados

Antes de comerçarmos a etapa de Pré Processamento, vamos transformar as variáveis qualitativas nominais, qualitativas ordinais e quantitativo discreto com o valor de **-999** em **0**. Isso porque há um padrão nos dados. 

In [28]:
train['var2'] = train['var2'].replace([-999],0)
train['var3'] = train['var3'].replace([-999],0)
train['var4'] = train['var4'].replace([-999],0)
train['var6'] = train['var6'].replace([-999],0)
train['var7'] = train['var7'].replace([-999],0)
train['var8'] = train['var8'].replace([-999],0)
train['var9'] = train['var9'].replace([-999],0)
train['var10'] = train['var10'].replace([-999],0)
train['var11'] = train['var11'].replace([-999],0)
train['var12'] = train['var12'].replace([-999],0)
train['var15'] = train['var15'].replace([-999],0)
train['var16'] = train['var16'].replace([-999],0)
train['var17'] = train['var17'].replace([-999],0)
train['var18'] = train['var18'].replace([-999],0)
train['var26'] = train['var26'].replace([-999],0)
train['var52'] = train['var52'].replace([-999],0)
train['var67'] = train['var67'].replace([-999],0)
train['var68'] = train['var68'].replace([-999],0)

In [29]:
Q1 = train[num_att.columns].quantile(0.25)
Q3 = train[num_att.columns].quantile(0.75)
IQR = Q3 - Q1

In [30]:
#we handled outlier now by make point with in Q1 and Q3
for col in train[num_att.columns].columns:
    train[num_att.columns].loc[(train[num_att.columns][col] < (Q1[col] - 1.5 * IQR[col])) | (train[num_att.columns][col] > (Q3[col] + 1.5 * IQR[col])), col] = train[num_att.columns][col].median()

Pronto, após transformação dos dados, vamos treinar o nosso modelo com os 2 dataframes: um com os outliers tratados e o outro com os dados originais.

In [31]:
info

Unnamed: 0,type,percentage,unique values,mediana,média,desvio_padrão,assimetria,Curtose,Variância,Máximo,Minimo
id,int64,0.0,14123,17464.0,17474.649366,10249.066602,0.016119,-1.201419,105043400.0,35306.0,1.0
var1,int64,0.0,29,5.0,9.712667,7.269559,0.550727,-1.384229,52.84648,30.0,0.0
var2,int64,0.0,85,53.0,23.966579,218.429664,-4.328588,17.425478,47711.52,129.0,-999.0
var3,int64,0.0,2443,1461.0,1584.671245,1118.101765,-0.121279,-0.589888,1250152.0,3546.0,-999.0
var4,int64,0.0,13094,15309.0,15179.176025,9994.295223,-0.013928,-1.207012,99885940.0,32403.0,-999.0
var5,int64,0.0,6296,6159.0,5543.697869,3329.493664,-0.057312,-1.269508,11085530.0,11373.0,0.0
var6,int64,0.0,1779,1558.0,1315.314168,1210.074896,-0.633578,-0.749957,1464281.0,3001.0,-999.0
var7,int64,0.0,33,33.0,-109.713729,350.20242,-2.14407,2.602658,122641.7,36.0,-999.0
var8,int64,0.0,29,24.0,-138.42668,371.128392,-1.886966,1.563256,137736.3,28.0,-999.0
var9,int64,0.0,4,3.0,-106.362388,311.608454,-2.515883,4.330356,97099.83,3.0,-999.0


In [32]:
plt.figure(figsize=(15,8))
sns.kdeplot(train[train['y']==1]['var2'],color='green',label='Resultado: Sim')
sns.kdeplot(train[train['y']==0]['var2'],color='red',label='Resultado: Não')
plt.title('Distribuição da Variável em relação ao y')
plt.show()

plt.figure(figsize=(15,8))
sns.kdeplot(train[train['y']==1]['var7'],color='green',label='Resultado: Sim')
sns.kdeplot(train[train['y']==0]['var7'],color='red',label='Resultado: Não')
plt.title('Distribuição da Variável em relação ao y')
plt.show()

plt.figure(figsize=(15,8))
sns.kdeplot(train[train['y']==1]['var8'],color='green',label='Resultado: Sim')
sns.kdeplot(train[train['y']==0]['var8'],color='red',label='Resultado: Não')
plt.title('Distribuição da Variável em relação ao y')
plt.show()

plt.figure(figsize=(15,8))
sns.kdeplot(train[train['y']==1]['var9'],color='green',label='Resultado: Sim')
sns.kdeplot(train[train['y']==0]['var9'],color='red',label='Resultado: Não')
plt.title('Distribuição da Variável em relação ao y')
plt.show()


Só pelos valores de assimetria e curtose, e por 4 gráficos, já percebemos que não há uma distribuição normal nos dados. Portanto, teremos que transformar esses dados para que possamos utilizar no nosso modelo. 

In [33]:
train.to_csv('train1.csv',index=False)

Assimetria normal: valores entre -1 e +1

Kurtose normal: valores entre -3 e +3

variáveis para verificar com gráfico:

var2, 7, 8 , 9,  10, 15, 16, 17, 18, 23, 26, 28, 31, 37, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67

In [34]:
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler


# Normalizer - 
norm= Normalizer()

train[['var2', 'var3', 'var7', 'var8', 'var9', 'var10', 'var15', 'var16', 'var17', 'var18', 'var23', 'var26', 'var28', 'var31', 'var37', 'var38', 'var39', 'var41', 'var42', 'var43', 'var44', 'var45', 'var46', 'var47', 'var48', 'var50', 'var51', 'var52', 'var53', 'var55', 'var56', 'var57', 'var58', 'var59', 'var61', 'var62', 'var63', 'var64', 'var65', 'var66', 'var67']]= norm.fit_transform(train[['var2', 'var3', 'var7', 'var8', 'var9', 'var10', 'var15', 'var16', 'var17', 'var18', 'var23', 'var26', 'var28', 'var31', 'var37', 'var38', 'var39', 'var41', 'var42', 'var43', 'var44', 'var45', 'var46', 'var47', 'var48', 'var50', 'var51', 'var52', 'var53', 'var55', 'var56', 'var57', 'var58', 'var59', 'var61', 'var62', 'var63', 'var64', 'var65', 'var66', 'var67']])

scaler = StandardScaler()
train[['var1', 'var4', 'var5', 'var6', 'var11', 'var12', 'var13', 'var14', 'var19', 'var20', 'var21', 'var22', 'var24', 'var25', 'var27', 'var29', 'var30', 'var32', 'var33', 'var34', 'var35', 'var36', 'var40', 'var49', 'var54', 'var60']]=scaler.fit_transform(train[['var1', 'var4', 'var5', 'var6', 'var11', 'var12', 'var13', 'var14', 'var19', 'var20', 'var21', 'var22', 'var24', 'var25', 'var27', 'var29', 'var30', 'var32', 'var33', 'var34', 'var35', 'var36', 'var40', 'var49', 'var54', 'var60']])



In [35]:
train

Unnamed: 0,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10,var11,var12,var13,var14,var15,var16,var17,var18,var19,var20,var21,var22,var23,var24,var25,var26,var27,var28,var29,var30,var31,var32,var33,var34,var35,var36,var37,var38,var39,var40,var41,var42,var43,var44,var45,var46,var47,var48,var49,var50,var51,var52,var53,var54,var55,var56,var57,var58,var59,var60,var61,var62,var63,var64,var65,var66,var67,var68,y
0,1.140045,0.006656,0.999441,1.435552,-1.248792,1.004412,0.012261,0.000000,0.001051,0.022070,-0.719892,-0.837871,1.013734,0.867849,0.004554,0.003853,0.003853,0.001401,0.908074,0.817640,0.977255,-1.367080,0.001051,-1.047096,1.373163,0.002452,-0.766413,0.008407,0.335004,1.101194,0.0,0.461380,-0.071247,0.925718,0.378006,1.014829,0.000701,0.000000,0.001401,0.625059,0.001051,0.008758,0.002102,0.000350,0.001401,0.001051,0.00035,0.00000,-0.723658,0.000000,0.000000,0.014713,0.000350,-0.727220,0.000074,0.000048,0.000292,0.000013,0.000020,0.921522,0.000050,0.000020,0.000222,0.000008,0.000089,0.000002,0.000046,0.139706,1
1,-0.785862,0.041707,0.753009,-0.157886,0.494177,-1.472693,0.000000,0.006446,0.001137,0.023887,0.054267,-1.070179,1.452456,-0.463409,0.000379,0.001137,0.001137,0.001137,1.373313,-0.499789,1.483898,0.839150,0.000379,1.137429,-0.787236,0.002275,1.304779,0.001517,-1.547712,-0.694875,0.0,0.057633,-1.576823,-1.362424,0.757413,-0.987780,0.000379,0.000000,0.001517,0.206912,0.001137,0.011375,0.001137,0.000000,0.000000,0.000000,0.00000,0.00000,-0.723658,0.000000,0.000000,0.007583,0.000379,-0.727220,0.000087,0.000117,0.000116,0.000026,0.000094,-1.085417,-0.378779,0.000027,0.000293,0.000007,-0.378779,-0.378779,0.000056,0.106618,0
2,-1.336121,0.022356,0.584115,-0.506950,-1.461444,-0.668015,0.009172,0.000000,0.001720,0.036113,-0.384950,-1.047608,-1.111421,-1.262164,0.019490,0.019490,0.005732,0.002293,-1.019875,-1.290247,-1.060932,0.839150,0.001720,1.137429,-0.787236,0.004013,-0.766413,0.001720,0.335004,-0.694875,0.0,-0.547987,-1.074964,-1.255999,-1.722485,0.934725,0.001720,0.000000,0.002293,-0.629381,0.001720,0.013184,0.002866,0.000573,0.000000,0.000000,0.00000,0.00000,-0.723658,0.000000,0.000000,0.006879,0.000573,-0.727220,0.000117,0.000122,0.000259,0.000011,0.000123,-1.085417,0.000115,0.000029,0.000562,0.000011,-0.572651,-0.572651,0.000219,0.242647,0
3,1.415175,0.022230,0.998066,-1.269693,0.808350,-0.412675,0.007410,0.005700,0.000570,0.007980,-1.092762,1.028572,-0.846944,-1.528416,0.033630,0.034200,0.013110,0.002850,-0.809587,-1.553733,-0.806884,-1.918638,0.000000,-1.047096,-1.507369,0.000570,1.304779,0.010830,-2.489071,1.101194,0.0,-1.153606,-1.576823,-0.830298,0.108571,1.014829,0.000000,0.000000,0.002280,0.834132,0.001710,0.012540,0.005700,0.000570,0.000570,0.000570,0.00000,0.00057,1.381868,0.000000,0.000000,0.020520,0.000570,-0.727220,0.000119,0.000408,0.000058,0.000117,0.000199,-1.085417,0.000201,0.000025,0.000542,0.000013,0.000207,0.000001,0.000084,0.132353,0
4,-0.373168,0.016482,0.847328,1.434137,0.146364,-1.163436,0.005993,0.000000,0.001124,0.023599,0.007968,1.270602,-1.704162,-1.262164,0.020603,0.020977,0.007492,0.001873,0.658706,-1.290247,-1.632901,-0.815523,0.001124,1.137429,-0.787236,0.002997,1.304779,0.001124,0.335004,-0.694875,0.0,0.663253,0.430611,-1.255999,0.949867,-0.507154,0.001124,0.000749,0.001498,-0.211235,0.001124,0.009739,0.001873,0.000375,0.000000,0.000000,0.00000,0.00000,1.381868,0.000000,0.000000,0.007117,0.000375,-0.727220,0.000083,0.000223,0.000038,0.000053,0.000071,0.920939,0.000085,0.000022,0.000339,0.000008,-0.374218,-0.374218,0.000171,0.132353,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14118,-0.785862,0.013527,0.870913,1.367196,-1.331991,0.657516,0.010752,0.008324,0.001041,0.000000,-0.866229,-1.242760,0.501892,0.468471,0.019076,0.019423,0.006937,0.001734,-1.602354,0.422411,0.499645,0.839150,0.001387,1.137429,-0.787236,0.002428,-0.766413,0.009018,1.276363,-0.694875,0.0,0.663253,0.932470,-0.777086,-0.441296,-1.147989,0.000347,0.000000,0.001387,-0.838455,0.001041,0.007977,0.001734,0.000347,0.000000,0.000000,0.00000,0.00000,-0.723658,0.000000,0.000000,0.008671,0.000694,0.673371,0.000079,0.000017,0.000339,0.000022,0.000042,-1.085417,0.000024,0.000013,0.000302,0.000006,-0.346492,-0.346492,0.000158,0.147059,0
14119,1.277610,0.047917,0.413792,-1.541603,0.249987,-1.472693,0.000000,0.010029,0.000000,0.000000,-1.391429,-1.242760,0.049169,-0.862787,0.000000,0.000000,0.000000,0.000000,-0.632796,-0.895018,0.072846,0.839150,0.000371,0.045166,-0.067103,0.000000,-0.766413,0.008915,0.335004,-0.694875,0.0,-0.144240,1.434328,-0.085322,0.757413,1.014829,0.000371,0.000000,0.001486,0.834132,0.000371,0.009658,0.004829,0.000371,0.000000,0.000000,0.00000,0.00000,-0.723658,0.000371,0.000000,0.000000,0.000000,-2.127812,0.000078,-0.371075,-0.371075,-0.371075,-0.371075,-1.085417,0.000110,0.000021,0.000342,0.000007,-0.371075,-0.371075,0.000055,0.198529,0
14120,2.378129,0.019307,0.783674,0.861093,1.052540,-0.838920,0.015358,0.011847,0.001316,0.006143,-1.262351,-0.936489,-1.551699,0.867849,0.014919,0.014919,0.004388,0.001755,-1.073843,0.817640,-1.473214,0.839150,0.001316,1.137429,-0.787236,0.003072,-0.766413,0.010531,0.335004,-0.694875,0.0,0.663253,-0.071247,0.925718,-0.177360,-0.987780,0.000000,0.000000,0.001755,-1.047528,0.001316,0.010970,0.001755,0.000439,0.000000,0.000000,0.00000,0.00000,-0.723658,0.000000,0.000000,0.007021,0.000878,0.673371,0.000096,0.000013,0.000439,0.000015,0.000091,-1.085417,0.000089,0.000018,0.000408,0.000004,-0.438348,-0.438348,0.000174,0.158088,0
14121,-0.785862,0.061965,0.146209,0.439628,-0.573590,1.022723,0.024368,0.018798,0.000696,0.043863,0.792049,0.093788,1.054184,0.867849,0.038293,0.038989,0.013925,0.003481,0.956459,0.817640,1.013547,-0.263965,0.002089,-1.047096,-1.507369,0.004874,-0.766413,0.016710,0.335004,1.101194,0.0,0.461380,-0.573106,-0.138534,0.872885,-1.147989,0.012532,0.000000,0.002785,-1.047528,0.002089,0.019495,0.009747,0.000696,0.000000,0.000000,0.00000,0.00000,1.381868,0.000000,0.000696,0.014621,0.001392,0.673371,0.000146,0.000107,0.000580,0.000016,0.000140,0.921039,0.000265,0.000070,0.000474,0.000029,-0.695539,-0.695539,0.000184,0.128676,0


In [36]:
train.to_csv('train2.csv',index=False)

# Transformar os dados de teste

In [37]:
test['var2'] = test['var2'].replace([-999],0)
test['var3'] = test['var3'].replace([-999],0)
test['var4'] = test['var4'].replace([-999],0)
test['var6'] = test['var6'].replace([-999],0)
test['var7'] = test['var7'].replace([-999],0)
test['var8'] = test['var8'].replace([-999],0)
test['var9'] = test['var9'].replace([-999],0)
test['var10'] = test['var10'].replace([-999],0)
test['var11'] = test['var11'].replace([-999],0)
test['var12'] = test['var12'].replace([-999],0)
test['var15'] = test['var15'].replace([-999],0)
test['var16'] = test['var16'].replace([-999],0)
test['var17'] = test['var17'].replace([-999],0)
test['var18'] = test['var18'].replace([-999],0)
test['var26'] = test['var26'].replace([-999],0)
test['var52'] = test['var52'].replace([-999],0)
test['var67'] = test['var67'].replace([-999],0)
test['var68'] = test['var68'].replace([-999],0)

In [38]:
test.to_csv('test1.csv',index=False)

In [39]:
# Normalizer - 
norm= Normalizer()

test[['var2', 'var3', 'var7', 'var8', 'var9', 'var10', 'var15', 'var16', 'var17', 'var18', 'var23', 'var26', 'var28', 'var31', 'var37', 'var38', 'var39', 'var41', 'var42', 'var43', 'var44', 'var45', 'var46', 'var47', 'var48', 'var50', 'var51', 'var52', 'var53', 'var55', 'var56', 'var57', 'var58', 'var59', 'var61', 'var62', 'var63', 'var64', 'var65', 'var66', 'var67']]= norm.fit_transform(test[['var2', 'var3', 'var7', 'var8', 'var9', 'var10', 'var15', 'var16', 'var17', 'var18', 'var23', 'var26', 'var28', 'var31', 'var37', 'var38', 'var39', 'var41', 'var42', 'var43', 'var44', 'var45', 'var46', 'var47', 'var48', 'var50', 'var51', 'var52', 'var53', 'var55', 'var56', 'var57', 'var58', 'var59', 'var61', 'var62', 'var63', 'var64', 'var65', 'var66', 'var67']])

scaler = StandardScaler()
test[['var1', 'var4', 'var5', 'var6', 'var11', 'var12', 'var13', 'var14', 'var19', 'var20', 'var21', 'var22', 'var24', 'var25', 'var27', 'var29', 'var30', 'var32', 'var33', 'var34', 'var35', 'var36', 'var40', 'var49', 'var54', 'var60']]=scaler.fit_transform(test[['var1', 'var4', 'var5', 'var6', 'var11', 'var12', 'var13', 'var14', 'var19', 'var20', 'var21', 'var22', 'var24', 'var25', 'var27', 'var29', 'var30', 'var32', 'var33', 'var34', 'var35', 'var36', 'var40', 'var49', 'var54', 'var60']])



In [40]:
test.to_csv('test2.csv',index=False)

Executing shutdown due to inactivity...


2021-10-12 20:13:38,990 - INFO     - Executing shutdown due to inactivity...


Executing shutdown...


2021-10-12 20:13:39,017 - INFO     - Executing shutdown...
