# Notebook - Análise da evasão

Dados do Instituto de Ciências Exatas (IE).

### Importando bibliotecas:

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv

import utils
import process
import pre_process

%matplotlib inline

In [2]:
# Re-import the local `process` module so that edits made to it on disk are
# picked up without restarting the kernel (development convenience only).

import importlib
importlib.reload(process)

<module 'process' from '/home/tiago/git/trj-academica/pipeline/process.py'>

### Leitura dos dados

In [3]:
# SIGRA_BEFORE_2004 = '../data/new/sigra_ate_2004.csv'
# SIGRA = '../data/new/sigra_a_partir_de_2005.csv'
# SIGAA     = '../data/new/sigaa.csv'

# data_sigra_before_2004 = pd.read_csv(SIGRA_BEFORE_2004, sep=';', low_memory=False)
# data_sigra = pd.read_csv(SIGRA, sep=';', low_memory=False)
# data_sigaa = pd.read_csv(SIGAA, sep=';', low_memory=False)

# data_sigra_before_2004 = pre_process.map_columns(data_sigra_before_2004)
# data_sigra = pre_process.map_columns(data_sigra)
# data_sigaa = pre_process.map_columns(data_sigaa)

# data_pre = pd.concat([data_sigaa, data_sigra, data_sigra_before_2004])

In [4]:
# Path of the consolidated IE dataset (semicolon-separated CSV).
OLD = '../data/old/ie_data.csv'

# low_memory=False makes pandas read the file in one pass instead of in
# chunks, so column dtypes are inferred from the whole column.
data_pre = pd.read_csv(
    OLD,
    sep=';',
    low_memory=False,
)

## Pré-Processamento

In [5]:
# Pre-processing pipeline. Every step takes the current frame and returns the
# next one, so the order of the calls below matters.
#
# `attrs` and `attrs_cat` start empty and are handed to several steps;
# presumably each step appends the names of the (categorical) features it
# creates -- confirm in pre_process.
attrs = []
attrs_cat = []
data_pre = pre_process.format_data(data_pre)

# Generic clean-up steps (behaviour defined in pre_process).
data_pre = pre_process.dataframe_specific_adjustments(data_pre)
data_pre = pre_process.erase_attr(data_pre)
data_pre = pre_process.remove_nan(data_pre)

# Restrict the data to the given range of years.
# NOTE(review): whether the bounds are inclusive depends on time_frame -- confirm.
year_range = [2004, 2019]
data_pre = pre_process.time_frame(data_pre, year_range)

# Observation horizon shared by the horizon-dependent steps below.
horizon = 1  # 1 year = 2 semesters
data_pre = pre_process.beyond_horizon(data_pre, horizon)

# Target and feature construction; each call receives `attrs` (and, for some,
# `attrs_cat`) alongside the frame.
data_pre = pre_process.dropout(data_pre, attrs)
data_pre = pre_process.dropout_before_horizon(data_pre, horizon)
data_pre = pre_process.course(data_pre, attrs)
data_pre = pre_process.public_school(data_pre, attrs)
data_pre = pre_process.gender(data_pre, attrs)
data_pre = pre_process.entry(data_pre, attrs, attrs_cat)
data_pre = pre_process.age(data_pre, attrs)
data_pre = pre_process.quota(data_pre, attrs, attrs_cat)

# Course / student filters (behaviour defined in pre_process).
data_pre = pre_process.cic_courses(data_pre)
data_pre = pre_process.erase_internal_transfer_students(data_pre)

# One-hot encoding is disabled; the categorical columns are passed to the
# model through `attrs_cat` instead.
# data_pre = pre_process.one_hot_encoding(data_pre, ['quota_type', 'entry'], attrs, attrs_cat)
# Drop every row that still has a missing value in any column.
data_pre = data_pre.dropna()

# Built once from the full frame and reused later when the subjects of each
# course are processed.
credits_dict = pre_process.subject_credits(data_pre)

In [6]:

# NOTE(review): this repeats the `horizon` assignment and the
# `pre_process.beyond_horizon` call already made in the pre-processing cell
# above, now applied to the fully filtered frame. Confirm whether this second
# pass is intentional or a leftover.
horizon = 1  # 1 year = 2 semesters
data_pre = pre_process.beyond_horizon(data_pre, horizon)

Divide dataframe by course:

In [7]:
# Split the pre-processed frame into one entry per course; the attribute lists
# are split the same way.
data_course, attrs_course, attrs_cat_course = pre_process.divide_course(
    data_pre,
    attrs,
    attrs_cat,
)

# Extra 'all' entry holding every course together. Here the course itself
# becomes a categorical feature, so 'course' is added to its categorical list.
data_course['all'] = data_pre.copy()
attrs_course['all'] = list(attrs)
attrs_cat_course['all'] = [*attrs_cat, 'course']

### Process Subjects and CEP

The subjects are processed for each course individually. CEP is processed last, after the rows have been reduced to the selected attributes and de-duplicated with `drop_duplicates`; processing CEP on the full, duplicated frame takes too long.

In [8]:
# Process subjects and CEP for every course frame (including 'all').
#
# Loop-local names (`course_data`, `course_attrs`) are used on purpose: the
# previous version rebound the notebook-level `attrs` and `data` here, so
# re-running the "divide by course" cell afterwards would copy an `attrs` list
# that already contained the subject columns of the last course processed.
keys = data_course.keys()
for course in keys:
    print(course)
    course_data = data_course[course].copy()
    # Same list object as attrs_course[course]; it is handed to the
    # pre_process steps below, so any in-place additions they make stay
    # visible through attrs_course.
    course_attrs = attrs_course[course]

    # Process subjects
    course_data = pre_process.add_semester_prefix(course_data)
    course_data = pre_process.subjects(course_data, course_attrs, horizon, credits_dict)

    course_data = pre_process.remove_anomalies(course_data)

    # Keep only the model attributes plus 'cep' and 'aluno', then de-duplicate.
    # CEP is processed afterwards, on the reduced frame: running it before
    # drop_duplicates takes too long.
    course_data = course_data[course_attrs + ['cep', 'aluno']].drop_duplicates()
    course_data = pre_process.cep(course_data, course_attrs)

    data_course[course] = course_data.copy()

ciência_da_computação
computação
engenharia_de_computação
engenharia_mecatrônica
all


In [9]:
# Snapshot of the per-course container used as model input below.
# NOTE(review): assuming `data_course` is a plain dict, this is a shallow copy,
# so both names share the same underlying frames -- confirm.
data_process = data_course.copy()

## Analysis

Cotas:

In [10]:
# for course in data_course.keys():
#     print(f'{course} = {data_course[course].shape}')
#     print(f'quota \n{data_course[course]["quota"].value_counts()}')
#     print('\n')

Desbalanceamento da base:

In [11]:
# Show, for every course, how many rows fall in each `dropout` class.
for course in data_course.keys():
    freq = data_course[course]['dropout'].value_counts()
    # One print call emitting the header, the counts and the trailing blank lines.
    print(f'{course}:', freq, '\n', sep='\n')

ciência_da_computação:
True     589
False    339
Name: dropout, dtype: int64


computação:
True     677
False    218
Name: dropout, dtype: int64


engenharia_de_computação:
True     345
False    191
Name: dropout, dtype: int64


engenharia_mecatrônica:
True     359
False    324
Name: dropout, dtype: int64


all:
True     1970
False    1072
Name: dropout, dtype: int64




In [12]:
# import importlib
# importlib.reload(utils)
# utils.double_bar_graph(data_course['all'], 'age', 'dropout', size=(15, 20))
# utils.double_bar_graph(data_course['all'], 'quota', 'dropout')
# utils.double_bar_graph(data_course['all'], 'public_school', 'dropout')
# utils.double_bar_graph(data_course['all'], 'entry', 'dropout')
# utils.double_bar_graph(data_course['all'], '2_total_credits', 'dropout', ratio=False, size=(10, 13))
# utils.double_bar_graph(data_course['all'], '1_algoritmos_e_programacao_de_computadores', 'dropout', size=(10, 13))

In [13]:
# Inspect the columns of the combined ('all') frame after processing.
data_course['all'].columns

Index(['dropout', 'course', 'public_school', 'female', 'entry', 'age', 'quota',
       'quota_type', '1_relative_credits', '1_absolute_credits',
       '2_relative_credits', '2_absolute_credits', '1_ira', '2_ira',
       '1_algoritmos_e_programacao_de_computadores', '1_calculo_1',
       '1_fisica_1', '1_fisica_1_experimental', '2_calculo_2',
       '2_probabilidade_e_estatistica', '1_ingles_instrumental_1',
       '2_algoritmos_e_programacao_de_computadores', '2_fisica_2',
       '2_fisica_2_experimental', '2_estruturas_de_dados',
       '2_introducao_a_algebra_linear', '1_organizacao_da_educacao_brasileira',
       '2_calculo_1', '1_leitura_e_producao_de_textos',
       '1_introducao_a_engenharia_mecatronica',
       '1_introducao_a_microinformatica', '1_logica_computacional_1',
       '1_quimica_geral_teorica', '2_fisica_1', '1_quimica_geral_experimental',
       '1_portugues_instrumental_1', '1_introducao_a_engenharia_de_computacao',
       '2_desenho_mecanico_assistido_por_computa

## Testes

In [14]:
# Alias (not a copy) of the combined frame, kept for ad-hoc experiments.
data_test = data_course['all']

## Processamento

In [35]:
def _save_shap_summary(shap_values, X_test, path, plot_type=None):
    """Draw a SHAP summary plot for ``X_test`` and save it to ``path``.

    ``plot_type=None`` keeps shap's default plot; ``'bar'`` draws the bar
    variant. The figure is closed after saving so that repeated calls do not
    accumulate open figures (and the notebook does not display empty ones).
    """
    shap.summary_plot(
        shap_values, X_test,
        plot_type=plot_type,
        plot_size=(15, 15),
        max_display=30,
        show=False
    )
    plt.savefig(path, dpi=200, bbox_inches='tight', facecolor='#ffffff')
    plt.close()


def catboost(data, attrs, attrs_cat, course):
    """Tune and evaluate a CatBoost dropout classifier for one course.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns in ``attrs``, including ``'dropout'``.
    attrs : list of str
        Columns to use; ``'dropout'`` is the target and the rest are features.
    attrs_cat : list of str
        Feature columns that CatBoost should treat as categorical.
    course : str
        Course label, used for logging and in the output file names.

    Returns
    -------
    tuple
        ``(model, X_test)``: the classifier after the randomized search and
        the held-out feature frame (30% stratified split, ``random_state=42``).

    Side effects
    ------------
    Calls ``process.log_metrics`` / ``process.log_params``, writes
    ``results/summary_plot_<course>.png`` and ``results/bar_plot_<course>.png``
    and appends an image link to ``results/README.md``.
    """
    output_attr = 'dropout'

    X = data[attrs].drop(columns=[output_attr])
    y = data[output_attr]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    train_data = Pool(data=X_train, label=y_train, cat_features=attrs_cat)

    # Class imbalance: scale_pos_weight is the weight of the positive class
    # (dropout == True) and is meant to be negative / positive.
    # The counts are taken from the labels explicitly. The previous
    # `value_counts()[0] / value_counts()[1]` used integer keys on a
    # boolean-labelled index; with positional fallback that is
    # majority / minority, which is not the intended ratio whenever dropouts
    # are the majority class.
    # The stratified split above already fails if a class is missing, so
    # n_positive cannot be zero here.
    is_dropout = y.astype(bool)
    n_positive = int(is_dropout.sum())
    n_negative = int(len(is_dropout)) - n_positive
    scale = n_negative / n_positive

    model = CatBoostClassifier(scale_pos_weight=scale, logging_level='Silent')

    grid = {
        'learning_rate': [0.01],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7]
    }

    # 8 random combinations from the grid, each scored with 5-fold stratified
    # CV. NOTE(review): relies on randomized_search's default refit behaviour
    # to leave `model` trained with the best parameters -- confirm against the
    # installed CatBoost version.
    gs = model.randomized_search(
        grid,
        X=train_data,
        plot=False,
        cv=5,
        verbose=False,
        n_iter=8,
        shuffle=True,
        stratified=True,
    )

    process.log_metrics(model, X_train, y_train, X_test, y_test, course)
    process.log_params(gs['params'])
    # utils.precision_recall_graph(model, X_test, y_test)

    # SHAP importance on the held-out set; the last column is dropped before
    # plotting (presumably the expected-value column -- confirm in the
    # CatBoost docs) so that one column per feature remains.
    shap_values = model.get_feature_importance(
        Pool(X_test, label=y_test, cat_features=attrs_cat),
        type="ShapValues"
    )
    shap_values = shap_values[:, :-1]

    # Make sure the output directory exists before writing anything to it.
    os.makedirs('results', exist_ok=True)

    # Summary plot, also linked from the results README.
    _save_shap_summary(shap_values, X_test, f'results/summary_plot_{course}.png')
    with open('results/README.md', 'a+') as f:
        f.write(f'![summary_plot_{course}](summary_plot_{course}.png)\n')

    # Bar plot of the same SHAP values.
    _save_shap_summary(
        shap_values, X_test, f'results/bar_plot_{course}.png', plot_type='bar'
    )

    return model, X_test

In [36]:
# Clear the temporary results directory
# Start from an empty results directory.
# The directory is created when missing, so a fresh clone / first run does not
# fail. Only files (and symlinks) are removed: os.remove() raises on a
# sub-directory, so those are left in place.
os.makedirs('results', exist_ok=True)
for f in os.listdir('results'):
    path = os.path.join('results', f)
    if os.path.isfile(path) or os.path.islink(path):
        os.remove(path)

Process all dataframes:

In [37]:
# Train, log and plot one model per course frame. After the loop `model` and
# `X_test` hold the results of the last course processed.
for course, data in data_process.items():
    print(f'Curso = {course}')
    model, X_test = catboost(
        data=data,
        attrs=attrs_course[course],
        attrs_cat=attrs_cat_course[course],
        course=course,
    )

Curso = ciência_da_computação
Curso = computação
Curso = engenharia_de_computação
Curso = engenharia_mecatrônica
Curso = all


<Figure size 1080x1080 with 0 Axes>

Process a single dataframe:

In [34]:
# Train, log and plot the model for one chosen course only.
course = 'all'
model, X_test = catboost(
    data=data_process[course],
    attrs=attrs_course[course],
    attrs_cat=attrs_cat_course[course],
    course=course,
)

<Figure size 1080x1080 with 0 Axes>

## Using SHAP

In [19]:
# explainer = shap.TreeExplainer(model)
# start_index = 0
# end_index = len(X_test)
# shap_values = explainer.shap_values(X_test[start_index:end_index])

# shap.initjs()

In [20]:
# id = 15
# shap.force_plot(explainer.expected_value,
#                 shap_values[id:id+1],
#                 X_test[id:id+1],
#                 matplotlib = True,
#                 show = False)

# plt.savefig(f'results/explainer{id}.png', dpi=200, bbox_inches='tight', facecolor='#ffffff')

In [21]:
# shap.force_plot(explainer.expected_value,
#                 shap_values,
#                 X_test[start_index:end_index])

In [22]:
# shap.summary_plot(shap_values, X_test[start_index:end_index], plot_size=(20,20), max_display=50)

In [23]:
# shap.summary_plot(shap_values, X_test, plot_type='bar', max_display=50)

## Plot Student's House Coordinates

In [24]:
# attr = ['aluno', 'cep', 'dropout']
# # keys = data_course.keys()
# keys = ['all']
# for course in keys:
#     print(course)
#     data = data_course[course].copy()[attr].drop_duplicates()
#     utils.plot_coordinates(data, course)
# #     utils.plot_coordinates_density(data, course)