# Atividade Final - ML

#### Gerson Vasconcelos

In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer


In [2]:
# !wget http://www.vincentlemaire-labs.fr/kddcup2009/orange_small_train.data.zip
# !wget http://www.vincentlemaire-labs.fr/kddcup2009/orange_small_train_churn.labels

In [3]:
# !unzip orange_small_train.data.zip
# !unzip orange_small_train_churn.labels

In [4]:
y = pd.read_csv("orange_small_train_churn.labels", header=None)
X = pd.read_csv("orange_small_train.data", sep = "\t")

#### Exploração e pré processamento

In [5]:
# Dimensões
X.shape

(50000, 230)

In [6]:
y.shape

(50000, 1)

In [7]:
# Mudando para binário
y[y[0] == -1] = 0

# Valores das labels
y[0].unique()

array([0, 1])

In [8]:
# Será que os labels estão balanceados?
(y[0].value_counts()/len(y[0]))* 100

0    92.656
1     7.344
Name: 0, dtype: float64

In [9]:
# Checando os tipos de variáveis no X
X.dtypes.value_counts()

float64    191
object      38
int64        1
dtype: int64

In [10]:
# Vamos dropar as features com mais de 35% de valores faltantes

faltantes = pd.DataFrame(X.isnull().sum()/X.shape[0]) * 100
faltantes.reset_index(level=0, inplace=True)
faltantes.columns = ['features','percentual']

faltantes = faltantes.query('percentual >= 35')

features_drop = [nomes for nomes in faltantes.features]

X.drop(features_drop, axis = 1, inplace=True)

In [11]:
# Como ficou?
X.shape

(50000, 67)

In [12]:
X.dtypes.value_counts()

float64    38
object     28
int64       1
dtype: int64

In [13]:
# Vamos checar a variância dessas features
X.var()

Var6      7.212950e+06
Var7      4.001895e+01
Var13     7.811773e+06
Var21     3.198583e+05
Var22     4.963061e+05
Var24     9.858144e+01
Var25     4.593233e+04
Var28     9.706238e+03
Var35     8.976059e+00
Var38     9.060560e+12
Var44     2.654762e+00
Var57     4.104197e+00
Var65     1.027069e+02
Var73     2.794092e+03
Var74     5.878473e+05
Var76     3.436179e+12
Var78     4.559759e+00
Var81     1.129377e+10
Var83     7.766802e+03
Var85     4.252044e+02
Var109    1.995134e+04
Var112    2.484964e+04
Var113    5.796888e+11
Var119    4.689101e+06
Var123    4.908498e+04
Var125    8.123125e+09
Var126    5.076953e+02
Var132    9.989759e+01
Var133    5.946768e+12
Var134    3.652286e+11
Var140    1.592417e+07
Var143    4.135272e-01
Var144    1.373652e+02
Var149    4.315105e+11
Var153    1.891316e+13
Var160    9.899683e+03
Var163    7.205698e+11
Var173    1.755705e-02
Var181    6.228425e+00
dtype: float64

In [14]:
# vimos que a variável 173 e a 143 tem uma variância muito baixa, vamos dropá-las também então
X.drop(['Var143','Var173'],
      axis =1,
      inplace=True)

In [15]:
# Vamos ver quantas classes cada categorica tem
categoricas = X.select_dtypes(exclude=["number"])
categoricas = pd.DataFrame(categoricas.nunique())
categoricas.reset_index(level=0, inplace=True)
categoricas.columns = ['features','num_categorias']
categoricas

Unnamed: 0,features,num_categorias
0,Var192,361
1,Var193,51
2,Var195,23
3,Var196,4
4,Var197,225
5,Var198,4291
6,Var199,5073
7,Var202,5713
8,Var203,5
9,Var204,100


In [16]:
# Vamos ver quais as 10 categorias mais frequentes nas categoricas com mais de 10 categorias e utilizar apenas essas

def transformar(X, features):
    for feature in features:
        try:
            ajuste = pd.DataFrame(X[feature].value_counts())
            ajuste.reset_index(level=0, inplace=True)
            ajuste.columns = ['features','num']
            ajuste.reset_index(level=0, inplace=True)
            ajuste = list(ajuste.query('index > 9').features)
            
            for categoria in ajuste:
                X[feature].replace(categoria, 'Other', inplace = True)
                
        except:
            pass
            

transformar(X, categoricas.features)

In [17]:
# Agora vamos aplicar one hot encoding

X = pd.get_dummies(X)

In [18]:
# # Vamos processar agora os valores missing
# # Vamos usar um processo simples de imputar a média
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

X = imputer.fit_transform(X)

In [32]:
# Vamos dividir em treino e teste

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 3)


In [52]:
# Agora vamos balancear os dados

sm = SMOTE(random_state= 3)

X_res, y_res = sm.fit_resample(X_train, y_train.values.ravel())

In [54]:
# regressao logistica
import warnings
warnings.filterwarnings('ignore')

logreg = LogisticRegressionCV(cv = 10)
logreg.fit(X_res, y_res)
yhat_logreg = logreg.predict(X_test)

print('Acurárica: ' + str(accuracy_score(y_test, yhat_logreg)))
print('ROC_AUC: '+ str(roc_auc_score(y_test, yhat_logreg)))

Acurárica: 0.5610666666666667
ROC_AUC: 0.5585268252941318


In [55]:
# matriz de confusão 
confusion_matrix(y_test, yhat_logreg)

array([[7811, 6100],
       [ 484,  605]])

In [56]:
# naive bayes
nb = GaussianNB()
nb.fit(X_res, y_res)
yhat_nb = nb.predict(X_test)

print('Acurárica: ' + str(accuracy_score(y_test, yhat_nb)))
print('ROC_AUC: '+ str(roc_auc_score(y_test, yhat_nb)))

Acurárica: 0.1718
ROC_AUC: 0.5107402568829431


In [57]:
# matriz de confusão 
confusion_matrix(y_test, yhat_nb)

array([[ 1589, 12322],
       [  101,   988]])

In [58]:
# randomforests
rfc = RandomForestClassifier(random_state=3,
                            n_jobs = 4,
                            criterion='gini',
                            n_estimators=100,
                            verbose = False)
rfc.fit(X_res, y_res)
yhat_rfc = rfc.predict(X_test)

print('Acurárica desbalanceamento 1: ' + str(accuracy_score(y_test, yhat_rfc)))
print('ROC_AUC desbalanceamento 1: '+ str(roc_auc_score(y_test, yhat_rfc)))

Acurárica desbalanceamento 1: 0.9273333333333333
ROC_AUC desbalanceamento 1: 0.5003872512645818


In [59]:
# matriz de confusão 
confusion_matrix(y_test, yhat_rfc)

array([[13909,     2],
       [ 1088,     1]])

In [62]:
# LDA
clf = LinearDiscriminantAnalysis()
clf.fit(X_res, y_res)  
yhat_clf = clf.predict(X_test)

print('Acurárica desbalanceamento 1: ' + str(accuracy_score(y_test, yhat_clf)))
print('ROC_AUC desbalanceamento 1: '+ str(roc_auc_score(y_test, yhat_clf)))

Acurárica desbalanceamento 1: 0.6387333333333334
ROC_AUC desbalanceamento 1: 0.6101336259451812


In [63]:
# matriz de confusão 
confusion_matrix(y_test, yhat_clf)

array([[8953, 4958],
       [ 461,  628]])