In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

## Loading data

In [2]:
# Load the ';'-separated dataset and discard the "Unnamed: 0" column
# (presumably a row index written by an earlier export -- confirm).
# `columns=` is used instead of a positional axis: DataFrame.drop no longer
# accepts `axis` positionally as of pandas 2.0.
data = pd.read_csv("../dataset/dataset_4.csv", sep=";").drop(columns="Unnamed: 0")
data.describe()

Unnamed: 0,TP_ESCOLA,TP_LINGUA,Escolaridade_Pai,Escolaridade_Mae,Ocupacao_Pai,Ocupacao_Mae,Renda_Mensal,Banheiros,Freezer,Computador,DESEMPENHO_BAIXO,Renda_per_capta,Pessoas_por_computador,Celulares_por_pessoa
count,3168898.0,3168898.0,3168898.0,3168898.0,3168898.0,3168898.0,3168898.0,3168898.0,3168898.0,3168898.0,3168898.0,3168898.0,3168898.0,3168898.0
mean,0.4491498,0.5271628,3.439471,4.073692,2.296138,2.226948,3.49658,1.369465,0.3874868,0.688202,0.5000047,834.6541,1.81941,42.73131
std,0.6148688,0.4992617,1.873869,1.719509,1.39276,1.279828,3.254036,0.7059705,0.5448613,0.7721223,0.5000001,1252.211,1.9868,9.523057
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,2.0,3.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,299.4,0.0,43.0
50%,0.0,1.0,3.0,5.0,2.0,2.0,2.0,1.0,0.0,1.0,1.0,499.0,1.333333,46.0
75%,1.0,1.0,5.0,5.0,3.0,3.0,5.0,2.0,1.0,1.0,1.0,831.6667,3.0,49.0
max,2.0,1.0,7.0,7.0,5.0,5.0,16.0,4.0,4.0,4.0,1.0,30000.0,20.0,54.0


### Standardizing values

In [3]:
# Z-score the feature columns: (x - mean) / std.
#
# * Work on a copy: a plain `data_std = data` would alias the same frame and
#   silently overwrite the raw values that later cells still read.
# * Leave the binary target untouched so it keeps its original 0/1 values;
#   it stays in `data_std` because later cells drop it by name.
target_col = "DESEMPENHO_BAIXO"
data_std = data.copy()
feature_cols = data_std.columns.drop(target_col)

col_mean = data_std[feature_cols].mean()
# A constant column has std == 0; dividing by 1 instead maps it to all zeros
# rather than NaN/inf, which K-Means and PCA cannot handle.
col_std = data_std[feature_cols].std().replace(0, 1)

data_std[feature_cols] = (data_std[feature_cols] - col_mean) / col_std

## Clusters

### Using the Elbow Method to find the best number of clusters

In [None]:
# Elbow method: fit K-Means for k = 2..9 on the standardized features and
# record the inertia (within-cluster sum of squares) of each fit.
# The feature matrix is built once, outside the loop, and the target column
# is excluded so the clustering is unsupervised.
elbow_features = data_std.drop(columns=["DESEMPENHO_BAIXO"])

distortions = []
K = range(2, 10)
for k in K:
    # Fixed seed so the curve is reproducible on Restart & Run All.
    kmeanModel = KMeans(n_clusters=k, random_state=42)
    kmeanModel.fit(elbow_features)
    distortions.append(kmeanModel.inertia_)

plt.figure(figsize=(15, 8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('Número de Clusters')
plt.ylabel('Distorção')
plt.show()

In [None]:
# Fit the final K-Means model on the standardized features (target excluded).
# The seed is fixed because cluster ids are arbitrary: without it, the id of
# the cluster that a later cell removes by number would change between runs.
# NOTE(review): after fixing the seed, re-check which id is the cluster you
# intend to remove.
n = 3
kmeans = KMeans(n_clusters=n, random_state=42).fit(
    data_std.drop(columns=["DESEMPENHO_BAIXO"])
)
labels = kmeans.labels_

In [None]:
# Project the standardized features onto two principal components.
# This is for plotting only; the clustering itself used every feature.
# The feature matrix is built once and reused for both fit and transform.
pca_features = data_std.drop(columns=["DESEMPENHO_BAIXO"]).to_numpy()

# Seeded so the projection is reproducible if scikit-learn picks its
# randomized SVD solver for this input size.
pca = PCA(n_components=2, random_state=42).fit(pca_features)
pca_2d = pca.transform(pca_features)
# Cluster centres expressed in the same 2-D space as the points.
centers = pca.transform(kmeans.cluster_centers_)
print("Explained variance: " + str(round(100*sum(pca.explained_variance_ratio_), 2)) + "%")

In [None]:
# Scatter of the 2-D PCA projection, coloured by K-Means cluster.
#
# Each point is given its "Cluster <id>" name up front, so the legend is
# derived from the data. Overriding the legend afterwards with
# plt.legend([...]) assigns labels to handles purely by position and can
# mislabel clusters. `palette` (not `cmap`) is the seaborn argument that
# controls hue colours.
cluster_names = np.char.add("Cluster ", labels.astype(str))
cluster_order = [f"Cluster {cluster_id}" for cluster_id in np.unique(labels)]

plt.figure(figsize=(10, 10))
sns.scatterplot(
    x=pca_2d[:, 0],
    y=pca_2d[:, 1],
    hue=cluster_names,
    hue_order=cluster_order,
    palette="mako",
)
plt.show()

In [None]:
# Scatter of the 2-D PCA projection, coloured by the performance flag.
#
# Each point is mapped to its legend text explicitly rather than relabelling
# the legend afterwards with plt.legend([...]), which assigns labels to
# handles purely by position.
# NOTE(review): assumes the higher value of DESEMPENHO_BAIXO ("low
# performance") marks below-average candidates -- confirm against the
# dataset description.
below_label = "Candidatos abaixo da média"
above_label = "Candidatos acima da média"

flag = data["DESEMPENHO_BAIXO"].to_numpy()
# Splitting on the mean works whether the flag is stored as raw 0/1 or as
# z-scores; a plain ndarray also avoids index alignment against `pca_2d`.
performance = np.where(flag > flag.mean(), below_label, above_label)

plt.figure(figsize=(10, 10))
sns.scatterplot(
    x=pca_2d[:, 0],
    y=pca_2d[:, 1],
    hue=performance,
    hue_order=[below_label, above_label],
)
plt.show()

### Removing the intermediate cluster

In [None]:
# Keep only the rows that K-Means did NOT assign to cluster 0, then split the
# remainder 70/30 into train and test sets.
#
# Filtering with a boolean mask built from `labels` gives the same rows and
# columns as adding a temporary "Cluster" column, filtering and dropping it,
# but without mutating the frame.
# NOTE(review): K-Means cluster ids are arbitrary; confirm that id 0 really is
# the cluster meant to be removed for the fitted model.
data = data[labels != 0]

# Seeded so the split (and everything trained on it) is reproducible.
train, test = train_test_split(data, test_size=0.30, random_state=42)

In [None]:
# Class-imbalance weight for the positive class (DESEMPENHO_BAIXO == 1).
# XGBoost's documented typical value for `scale_pos_weight` is
# sum(negative instances) / sum(positive instances), i.e. negatives over
# positives -- not the inverse.
target_train = train["DESEMPENHO_BAIXO"]
n_positive = int((target_train == 1).sum())
n_negative = int((target_train == 0).sum())

if n_positive == 0 or n_negative == 0:
    # Fail with a clear message instead of a bare ZeroDivisionError; this
    # also triggers when the target is no longer stored as exact 0/1 values.
    raise ValueError(
        "DESEMPENHO_BAIXO must contain both 0 and 1 in the training set "
        f"(found {n_negative} zeros and {n_positive} ones)"
    )

weight = n_negative / n_positive

In [None]:
# Randomized hyper-parameter search for a decision tree on the training set.
#
# Every value in the grid must be a list (or a distribution) of candidates.
# `scale_pos_weight` is an XGBoost parameter that DecisionTreeClassifier does
# not accept, and it was given as a bare scalar, so the search could not run.
# The scikit-learn way to compensate for class imbalance is `class_weight`;
# "balanced" reweights classes inversely to their frequency in the training
# data.
random_grid = {'max_depth': [5, 8, 10, 12, 15, 20],
               'min_samples_split': [5, 10, 20, 40, 100, 200, 500, 1000],
               'min_samples_leaf': [30, 40, 60, 80, 150, 300, 500, 1000],
               'class_weight': ['balanced'],
               }

# Seeded so tie-breaking between equally good splits is reproducible.
rf = DecisionTreeClassifier(random_state=42)
# 100 sampled combinations, 3-fold cross-validation, all available cores.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3,
                               random_state=42, n_jobs=-1)
rf_random.fit(train.drop(columns="DESEMPENHO_BAIXO"), train["DESEMPENHO_BAIXO"])
rf_random.best_params_