# Notebook - Análise da evasão

Dados do Instituto de Ciências Exatas (IE).

Importando bibliotecas:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.metrics
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

import pre_process
import utils

%matplotlib inline

 ### Leitura dos dados

In [2]:
import pre_process

# Location of the semicolon-separated input CSV, relative to this notebook.
DATA_FILE = '../data/ie_data.csv'
# low_memory=False makes pandas read the file in one pass instead of chunks.
data_0 = pd.read_csv(
    DATA_FILE,
    low_memory=False,
    sep=';',
)

## Pré-Processamento

In [3]:
# Pre-processing pipeline: format the raw frame once, then run every feature
# step in a fixed order. Each step receives the shared `columns` list
# (presumably appending the feature names it produces -- confirm in pre_process).
columns = ['cep']
data_0 = pre_process.format_data(data_0)
for step_name in (
    'public_school',
    'credits',
    'dropout',
    'course',
    'gender',
    'quota',
    'entry',
    # 'cep',                    # disabled step
    # 'ira',                    # disabled step
    # 'programming_subjects',   # disabled step
):
    data_0 = getattr(pre_process, step_name)(data_0, columns)

In [None]:
# Keep only the columns listed in `columns` and discard repeated rows.
data_1 = data_0.copy()[columns].drop_duplicates()

In [None]:
# Split the deduplicated frame with pre_process.divide_course
# (presumably one entry per course -- confirm in pre_process).
data_course = pre_process.divide_course(data_1)

## Testes Mecatrônica

In [4]:
# Split the full pre-processed frame by course; the result is indexed by
# course name in the next cell.
data_course = pre_process.divide_course(data_0)

In [5]:
# Select the 'engenharia mecatrônica' (Mechatronics Engineering) entry.
meca = data_course['engenharia mecatrônica']

In [6]:
# Run the subjects step on the Mechatronics subset.
# NOTE(review): the shape tuples shown below this cell are presumably printed
# by pre_process.subjects -- confirm.
meca = pre_process.subjects(meca, columns)

(15365, 39)
(13396, 39)
(456, 52)


In [None]:
# Keep only the columns listed in `columns` for the Mechatronics subset and
# discard repeated rows.
data_1 = meca.copy()[columns].drop_duplicates()

In [None]:
# Apply the `cep` step to the modelling frame.
# NOTE(review): presumably turns the CEP (postal code) into a numeric feature
# such as a distance -- confirm in pre_process.
data_1 = pre_process.cep(data_1, columns)

In [None]:
# Preview the first rows of the modelling frame.
data_1.head()

## Processamento

In [None]:
# Separate the target from the features, then hold out 30% of the rows for
# testing. Stratifying on the target keeps its class ratio in both splits.
attr = 'dropout'
y = data_1[attr]
X = data_1.drop(columns=[attr])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Train a CatBoost classifier with default hyper-parameters.
model = CatBoostClassifier()
# 'course' and 'entry' are passed as categorical features; plot=True asks
# CatBoost to draw its training chart in the notebook.
model.fit(X_train, y_train, cat_features=['course', 'entry'], plot=True)

In [None]:
# Evaluate the trained model on the held-out split.
predictions = model.predict(X_test)
# Convert the predicted labels to booleans by comparing with the string 'True'.
# NOTE(review): assumes predict() yields string labels here -- confirm.
predictions = [x == 'True' for x in predictions]
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))
print("Recall score:", sklearn.metrics.recall_score(y_test, predictions))
# Label fixed: it previously read "Precusion score:".
print("Precision score:", sklearn.metrics.precision_score(y_test, predictions))

In [None]:
def plot_feature_importance(importance, names, model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: Sequence of importance scores, one per feature.
        names: Sequence of feature names aligned with ``importance``.
        model_type: Label shown at the start of the plot title.
    """
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    })
    # Sort descending so the most important feature is drawn at the top.
    fi_df = fi_df.sort_values(by=['feature_importance'], ascending=False)
    plt.figure(figsize=(10, 8))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    # A separating space is required; without it the title rendered as
    # e.g. "CATBOOSTFEATURE IMPORTANCE".
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')


plot_feature_importance(model.get_feature_importance(), X.columns, 'CATBOOST')

## Using SHAP

In [None]:
import shap
# Build a tree explainer for the trained model.
explainer = shap.TreeExplainer(model)
# Bounds of the X_test row slice to explain.
start_index = 0
end_index = 1000
# SHAP values for the selected slice of the test set.
shap_values = explainer.shap_values(X_test[start_index:end_index])

In [None]:
# Load SHAP's JavaScript so the interactive force plots can render in the notebook.
shap.initjs()

In [None]:
# Explain one prediction with a force plot.
# The index is named `sample_idx` rather than `id` so the builtin id() is not
# shadowed for the rest of the notebook session.
# NOTE: shap_values is positional within the explained slice while X_test is
# sliced from its own start, so the two rows match only while the explained
# slice starts at row 0 (start_index == 0).
sample_idx = 4
shap.force_plot(explainer.expected_value,
                shap_values[sample_idx:sample_idx + 1],
                X_test[sample_idx:sample_idx + 1])

In [None]:
# Force plot over every explained row of the test slice.
shap.force_plot(explainer.expected_value,
                shap_values,
                X_test[start_index:end_index])

In [None]:
# Large summary plot. The feature rows must be the ones the SHAP values were
# computed on (the X_test slice); passing rows of the full X paired the values
# with the wrong samples and could mismatch in shape.
shap.summary_plot(shap_values, X_test[start_index:end_index], plot_size=(20,20))

In [None]:
# Dependence plot of 'programming_subject', coloured by its interaction with 'distance'.
# NOTE(review): requires both columns to exist in X_test; the
# pre_process.programming_subjects step is commented out earlier in this
# notebook -- confirm the column is actually produced.
shap.dependence_plot('programming_subject', shap_values, X_test[start_index:end_index], interaction_index='distance')

In [None]:
# Summary plot of the SHAP values for the explained test slice (default size).
shap.summary_plot(shap_values, X_test[start_index:end_index])

## Plot Students' Home Coordinates

In [None]:
# One row per distinct (student, CEP, dropout, course) combination.
attr = ['aluno', 'cep', 'dropout', 'course']
data_map = data_0.copy()[attr].drop_duplicates()
data_map_course = pre_process.divide_course(data_map)

# Plot every course together, then each course on its own.
utils.plot_coordinates(data_map, 'all courses')
# The loop variable is `course_map`, not `data_course`, so it does not overwrite
# the notebook-level `data_course` mapping created in an earlier cell.
for course, course_map in data_map_course.items():
    utils.plot_coordinates(course_map, course)

In [None]:
# Build the coordinate/colour lists used by the density plots below and drop
# rows whose latitude is below -17 from `data_map`.
coordinate_json = utils.read_json('../data/coordinate.json')
attr = 'cep'

# Resolve every row's coordinates up front. A CEP missing from the JSON still
# raises KeyError, as before.
coords = [coordinate_json[str(cep)] for cep in data_map[attr]]
# Same filter as before (rows with lat < -17 are discarded), computed as a mask
# so the frame is never mutated while it is being iterated.
keep = [not (info['lat'] < -17) for info in coords]

# Filter once; x, y, c and data_map stay row-aligned.
data_map = data_map[np.asarray(keep, dtype=bool)]
x = [info['lat'] for info, kept in zip(coords, keep) if kept]
y = [-info['lng'] for info, kept in zip(coords, keep) if kept]
c = ['red' if dropout else 'blue' for dropout in data_map['dropout']]

In [None]:
# Bivariate distribution of the kept coordinates, coloured by the 'dropout' column.
# x and y are the lists built in the previous cell and must match data_map's rows.
sns.displot(data_map, x=x, y=y, hue='dropout', height=6)

In [None]:
# Joint KDE plot of the kept coordinates; the returned grid object is kept in `p`.
p = sns.jointplot(data=data_map,x=x, y=y,kind='kde')

In [None]:
# Filled 2-D KDE of the coordinates, split by the colour labels in `c`.
# `fill=True` replaces the deprecated `shade=True`; `thresh=0` is the numeric
# form of the previous `thresh=False` (no density threshold).
sns.kdeplot(x=x, y=y, hue=c,
            fill=True,
            thresh=0
)