# Notebook - Análise da evasão

Dados do Instituto de Ciências Exatas (IE).

### Importando bibliotecas:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

import utils
import pre_process

%matplotlib inline

In [None]:
# Development helper: re-import the `utils` module imported in the first cell
# so that edits made to its source are picked up without restarting the kernel.
import importlib
importlib.reload(utils)

### Leitura dos dados

In [2]:
# Location of the raw IE dataset, given relative to the notebook's working
# directory.
DATA_FILE = '../data/ie_data.csv'
# The file is semicolon-separated.  low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-dtype columns.
data_pre = pd.read_csv(DATA_FILE, sep=';', low_memory=False)

## Pré-Processamento

In [3]:
# `attrs` is the shared list handed to every attribute-building step below.
# NOTE(review): the steps presumably append the names of the columns they
# create to this list -- confirm in pre_process.
attrs = []

# Frame-only clean-up, then discard every row that has a missing value.
data_pre = pre_process.erase_attr(pre_process.format_data(data_pre)).dropna()

# Attribute builders, applied in this exact order; each one receives the
# current frame together with the shared `attrs` list.
attribute_steps = (
    pre_process.public_school,
    pre_process.dropout,
    pre_process.course,
    pre_process.gender,
    pre_process.quota,
    pre_process.entry,
)
for build_attribute in attribute_steps:
    data_pre = build_attribute(data_pre, attrs)

# Final frame-only steps.  Judging by their names they restrict the data to the
# CIC courses and remove internal-transfer students -- confirm in pre_process.
data_pre = pre_process.erase_interal_transfer_students(
    pre_process.cic_courses(data_pre)
)

Divide dataframe by course

In [4]:
# Split the pre-processed frame per course; the result is used below as a
# mapping from course name to that course's data.
data_course = pre_process.divide_course(data_pre)
# Additionally keep the complete (undivided) frame under the key 'all'.
data_course['all'] = data_pre

### Frequência das disciplinas

In [None]:
# for course, data in data_course.items():
#     print('\n')
#     print(course)
#     print(data['nome_disciplina'].value_counts()[:20])

### Process Subjects and CEP

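Subjects are processed for each course individually. The CEP is handled separately from the other attributes: the rows are first de-duplicated (keeping the attribute columns plus `cep` and `aluno`), the CEP step runs on that reduced frame, and only then is the final `drop_duplicates` applied; otherwise the CEP step takes too long to process.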

In [5]:
# Courses to process.  To restrict the run to a single course, use e.g.
# keys = ['ciência da computação'] instead.
keys = data_course.keys()
data_analysis = {}
for course in keys:
    print(course)

    # Work on copies: the per-course subject step gets its own attribute list
    # and its own frame, so the shared `attrs` list is left untouched.
    course_attrs = list(attrs)
    data = pre_process.subjects(data_course[course].copy(), course_attrs)

    # Keep one row per distinct combination of the attribute columns plus
    # 'cep' and 'aluno', then run the CEP step on that reduced frame.
    # NOTE(review): the original note said CEP must be handled "before
    # drop_duplicates" because it is slow; in this code it runs after this
    # first de-duplication and before the final one below.
    cep_columns = course_attrs + ['cep', 'aluno']
    data = pre_process.cep(data[cep_columns].drop_duplicates(), course_attrs)

    # Frame as returned by the CEP step, kept per course for later analysis.
    data_analysis[course] = data.copy()

    # Model input: attribute columns only, de-duplicated.  This overwrites the
    # raw per-course frame stored in data_course.
    data = data[course_attrs].drop_duplicates()
    data_course[course] = data.copy()

ciência da computação
computação
engenharia de computação
engenharia mecatrônica
all


In [6]:
# Report the (rows, columns) shape of every processed per-course frame.
for course, frame in data_course.items():
    print(f'{course} = {frame.shape}')

ciência da computação = (1536, 32)
computação = (1228, 32)
engenharia de computação = (512, 32)
engenharia mecatrônica = (1026, 32)
all = (4235, 32)


In [None]:
# Copy of the course -> frame mapping used as modelling input.
# NOTE(review): assuming data_course is a plain dict, .copy() is shallow --
# the mapping is new but the frames themselves are shared with data_course.
data_process = data_course.copy()

Desbalanceamento da base:

In [None]:
# data_process['ciência da computação']['dropout'].value_counts()

## Processamento

In [None]:
def process(data, test_size=0.3, random_state=42):
    """Train a CatBoost dropout classifier on ``data`` and report hold-out metrics.

    The frame is split into stratified train/test parts, a default
    ``CatBoostClassifier`` is fitted on the training part, the accuracy,
    recall and precision obtained on the test part are printed, and the
    feature importances are plotted with ``utils.plot_feature_importance``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the target column ``'dropout'`` and the categorical
        columns ``'course'`` and ``'entry'``; every other column is used as a
        feature.  NOTE(review): the target is assumed to be boolean -- confirm
        against ``pre_process.dropout``.
    test_size : float, default 0.3
        Fraction of the rows held out for evaluation.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(model, X_test)``: the fitted classifier and the held-out features.
    """
    output_attr = 'dropout'
    cat_attr = ['course', 'entry']

    X = data.drop(columns=[output_attr])
    y = data[output_attr]
    # Stratify on the target so both parts keep the same dropout ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    model = CatBoostClassifier()
    model.fit(X_train, y_train, cat_features=cat_attr, plot=False, silent=True)

    # Decode the predicted labels into booleans for the sklearn metrics.
    # The labels may come back as the strings 'True'/'False' or as booleans;
    # comparing the string form handles both, whereas `x == 'True'` would
    # silently turn every boolean prediction into False.
    raw_predictions = model.predict(X_test)
    predicts = [str(label) == 'True' for label in raw_predictions]

    print('Accuracy score:', accuracy_score(y_test, predicts))
    print('Recall score:', recall_score(y_test, predicts))
    print('Precision score:', precision_score(y_test, predicts))

    utils.plot_feature_importance(model.get_feature_importance(), X.columns, 'Catboost')
    return model, X_test

Process all dataframes:

In [None]:
# Train and evaluate one model per entry of data_process.  After the loop,
# `model` and `X_test` hold the results of the last entry processed.
for course in data_process:
    print(f'\nCurso = {course}')
    data = data_process[course]
    model, X_test = process(data)

Process a single dataframe:

In [None]:
# Train on one course only.  The resulting `model` and `X_test` are the ones
# explained in the SHAP section below.
course = 'computação'
model, X_test = process(data_process[course])

## Using SHAP

In [None]:
# SHAP is imported here rather than in the first cell, so it is only required
# when this section is run.
import shap
# def shap_plot(model, X_test):
# Tree-model explainer for the classifier returned by the last process() call.
explainer = shap.TreeExplainer(model)
# Only the rows start_index..end_index-1 of X_test are explained; the plotting
# cells below must slice X_test with the same bounds to stay aligned with
# shap_values.
start_index = 0
end_index = 500
shap_values = explainer.shap_values(X_test[start_index:end_index])

# Load the JavaScript needed to render the interactive force plots.
shap.initjs()

In [None]:
# Explain a single prediction.  `sample_idx` is a position inside the explained
# slice, i.e. a row of `shap_values`.  (Named `sample_idx` rather than `id` so
# the builtin `id` is not shadowed for the rest of the session.)
sample_idx = 10
# shap_values was computed on X_test[start_index:end_index], so the matching
# feature row sits start_index rows further down in X_test.
row_start = start_index + sample_idx
shap.force_plot(explainer.expected_value,
                shap_values[sample_idx:sample_idx + 1],
                X_test[row_start:row_start + 1])

In [None]:
# Force plot for all explained rows at once; X_test is sliced with the same
# bounds that were used to compute shap_values.
shap.force_plot(explainer.expected_value,
                shap_values,
                X_test[start_index:end_index])

In [None]:
# Large summary plot of the SHAP values.  The features must come from X_test
# (the frame shap_values was computed on); `X` only exists inside process()
# and is undefined at notebook level.
shap.summary_plot(shap_values, X_test[start_index:end_index], plot_size=(20,20))

In [None]:
# SHAP dependence plot for the 'programming_subject' feature, with 'distance'
# selected as the interaction feature.
shap.dependence_plot('programming_subject', shap_values, X_test[start_index:end_index], interaction_index='distance')

In [None]:
# Summary plot of the SHAP values at the default figure size.
shap.summary_plot(shap_values, X_test[start_index:end_index])

## Plot Student's House Coordinates

In [None]:
# Columns handed to the coordinate plots.
attr = ['aluno', 'cep', 'dropout', 'course']

# Only one course is plotted here; use keys = data_analysis.keys() to plot
# every course.
keys = ['ciência da computação']
for course in keys:
    # One row per distinct (aluno, cep, dropout, course) combination.
    data = data_analysis[course].copy()
    data = data[attr].drop_duplicates()
    utils.plot_coordinates(data, course)
    utils.plot_coordinates_density(data, course)