In [None]:
#import libraries
#The "TARGET" column is the variable to predict. It equals 1 for unsatisfied customers and 0 for satisfied customers.

#Models
from sklearn.model_selection import train_test_split
from sklearn import decomposition
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

#Transformation
import numpy as np
import pandas as pd
import random

#Graphics
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the training features, the test features and the sample-submission file
# (one CSV each) into pandas DataFrames.
df_train, df_test, Y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "DataSet/train.csv",
        "DataSet/Test.csv",
        "DataSet/sample_submission.csv",
    )
)
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Basic summary statistics of the training frame's columns
df_train.describe()


In [None]:
# (rows, columns) of the training frame: rows are observations, columns are variables.
# NOTE(review): the original note quoted 76020 and 371 with the roles swapped —
# presumably 76020 observations and 371 variables; confirm against the output.
df_train.shape

In [None]:
# Collect the distinct dtypes found across all columns (expected: only int and float)
dataTypeSeries = {col_dtype for col_dtype in df_train.dtypes}
dataTypeSeries

In [None]:
# Check the class balance. TARGET == 1 marks unsatisfied customers and TARGET == 0
# satisfied ones; the original note reports the TARGET == 1 class as only ~3% of rows.

# Number of rows per TARGET value (index: TARGET, single column: "ID").
df = pd.DataFrame(df_train["ID"].groupby(df_train["TARGET"]).count())
# Row count of the TARGET == 1 class. The scalar is selected by label instead of
# calling int() on a one-element Series, which pandas has deprecated.
Num_sat = int(df.loc[1, "ID"])

# Fraction of rows with TARGET == 1.
df_train["TARGET"].mean()


In [None]:
# Use ID as the row index so it is no longer a feature column.
# set_index returns a new frame (no in-place mutation), and the guard keeps the
# cell safe to re-run once ID has already been moved to the index.
if "ID" in df_train.columns:
    df_train = df_train.set_index("ID")

In [None]:
# Re-check the (rows, columns) shape of the training frame
df_train.shape

In [None]:
# Index labels of the training rows, split by class.
# NOTE: TARGET == 1 means *unsatisfied*, so despite the names Index_Sat holds the
# TARGET > 0 rows and Index_InSat holds the TARGET == 0 rows.
mask_target_pos = (df_train["TARGET"] > 0).to_numpy()
mask_target_zero = (df_train["TARGET"] == 0).to_numpy()
Index_Sat = np.array(df_train.index[mask_target_pos])
Index_InSat = np.array(df_train.index[mask_target_zero])
# Preview a few rows of the TARGET == 0 class
df_train.loc[Index_InSat].head()

In [None]:
# Randomly undersample the TARGET == 0 index labels down to Num_sat entries so that
# both classes end up with the same number of observations.
# A dedicated, seeded generator makes the sample reproducible across kernel
# restarts without touching the global `random` state.
random_Index_Inst = random.Random(42).sample(list(Index_InSat), Num_sat)
df_train.loc[random_Index_Inst].head(5)

In [None]:
# Build the balanced training set: every TARGET > 0 row plus the sampled TARGET == 0 rows.
List_dataset = [df_train.loc[idx_labels] for idx_labels in (Index_Sat, random_Index_Inst)]
New_data_train = pd.concat(List_dataset)
# Rows per class (counted on var3) to confirm both classes have the same size
New_data_train.groupby("TARGET")[["var3"]].count()

In [None]:
# After balancing, the mean of TARGET (the share of TARGET == 1 rows) should be 0.5
New_data_train["TARGET"].mean()

In [None]:
# Split the balanced training data into features (X) and labels (Y).
Y_treino = New_data_train["TARGET"].values
# Build X from a copy without TARGET rather than dropping the column in place,
# so New_data_train stays intact and this cell can be re-run.
X_treino = New_data_train.drop(columns="TARGET")
print(X_treino.shape, Y_treino.shape)


In [None]:
# The test data is supplied already split into a feature file and a separate
# TARGET file; here ID is moved out of the feature columns.
# NOTE(review): Y_test is read from sample_submission.csv — presumably placeholder
# predictions rather than ground-truth labels; confirm before trusting test scores.

# X_test: features indexed by ID. set_index returns a new frame, so df_test is
# left untouched and the cell can be re-run.
X_test = df_test.set_index("ID")
# Y_test: keep only the TARGET values as a NumPy array. The guard turns a re-run
# into a no-op once the frame has already been converted.
if isinstance(Y_test, pd.DataFrame):
    Y_test = Y_test["TARGET"].values


In [None]:
# Sanity check: shapes of the train/test features and labels
print(X_treino.shape, Y_treino.shape,X_test.shape, Y_test.shape)

# Redução da dimensionalidade

In [None]:
# PCA for dimensionality reduction: keep 100 whitened components.
# svd_solver='randomized' is stochastic, so random_state is fixed to make the
# projection reproducible between runs.
pca = decomposition.PCA(n_components = 100,
                        whiten = True,
                        svd_solver = 'randomized',
                        random_state = 42)

In [None]:
# O PCA é um algoritmo de aprendizagem não supervisionada que necessita de padronização dos dados
# Padronizando os dados
standardization = StandardScaler()
Stand_coef_linear_reg = make_pipeline(standardization, pca)
pca.fit(X_treino)


In [None]:
# E então aplicamos o modelo PCA nos dados de treino e de teste
X_treino_pca = pca.transform(X_treino)
X_test_pca = pca.transform(X_test)

In [None]:
# Shapes of the PCA-projected training and test features
print(X_treino_pca.shape)
print(X_test_pca.shape)

# Criação dos Modelos 

SVM

In [None]:
# Support-vector classifier on the PCA-projected features.
# Hyperparameters are gathered in one place before the estimator is built.
svm_params = {"C": 5.0, "gamma": 0.001}
modelo_svm = svm.SVC(**svm_params)
# Train the model on the training data
modelo_svm.fit(X_treino_pca, Y_treino)

In [None]:
# Predict the class of every test observation with the trained SVM
previsoes = modelo_svm.predict(X_test_pca)

In [None]:
# Score of the SVM on the PCA-projected test features against Y_test
print(modelo_svm.score(X_test_pca, Y_test))

In [None]:
# Confusion matrix of the SVM predictions against Y_test
matrix = confusion_matrix(Y_test, previsoes)

# Print the confusion matrix
print(matrix)

In [None]:
# Shape of the training features before the PCA projection
X_treino.shape

# Decision tree classifier

In [None]:
# Import the decision-tree module
from sklearn import tree

# Training and test datasets (PCA-projected features and training labels)
x_treino = X_treino_pca
y_treino = Y_treino
x_teste = X_test_pca

# Decision-tree classifier. A DecisionTreeRegressor used to be created here and
# was overwritten on the next statement without being used, so it was removed.
# random_state is fixed so that repeated runs build the same tree.
modelo = tree.DecisionTreeClassifier(random_state=42)

# Train on the training data and report the training score, which was
# previously computed and then discarded.
modelo.fit(x_treino, y_treino)
print(modelo.score(x_treino, y_treino))

# Predictions for the test features
valores_previstos = modelo.predict(x_teste)

In [None]:
# Confusion matrix of the decision-tree predictions against Y_test
matrix = confusion_matrix(Y_test, valores_previstos)

# Print the confusion matrix
print(matrix)

In [None]:
# Score of the decision tree on the PCA-projected test features against Y_test
modelo.score(X_test_pca, Y_test)