# Notebook - Análise da evasão

Dados do Instituto de Ciências Exatas (IE).

### Importando bibliotecas:

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv

import utils
import process
import pre_process

%matplotlib inline

In [2]:
# Reload module

# import importlib
# importlib.reload(utils)

 ### Leitura dos dados

In [3]:
# DATA_FILE = '../data/ie_data.csv'
DATA_FILE = '../data/test_data.csv'
data_pre = pd.read_csv(DATA_FILE, sep=';', low_memory=False)

## Pré-Processamento

In [4]:
attrs = []
attrs_cat = []
data_pre = pre_process.format_data(data_pre)

data_pre = pre_process.dataframe_specific_adjustments(data_pre)
data_pre = pre_process.erase_attr(data_pre)
data_pre = pre_process.remove_nan(data_pre)

year_range = [1990, 2019]
data_pre = pre_process.time_frame(data_pre, year_range)

horizon = 1  # 1 year = 2 semesters
data_pre = pre_process.beyond_horizon(data_pre, horizon)

data_pre = pre_process.dropout(data_pre, attrs)
data_pre = pre_process.dropout_before_horizon(data_pre, horizon)
data_pre = pre_process.course(data_pre, attrs)
data_pre = pre_process.public_school(data_pre, attrs)
data_pre = pre_process.gender(data_pre, attrs)
data_pre = pre_process.entry(data_pre, attrs, attrs_cat)
data_pre = pre_process.age(data_pre, attrs)
data_pre = pre_process.quota(data_pre, attrs, attrs_cat)

data_pre = pre_process.cic_courses(data_pre)
data_pre = pre_process.erase_internal_transfer_students(data_pre)

# data_pre = pre_process.one_hot_encoding(data_pre, ['quota_type', 'entry'], attrs, attrs_cat)
data_pre = data_pre.dropna()

credits_dict = pre_process.subject_credits(data_pre)

Divide dataframe by course:

In [5]:
data_course, attrs_course, attrs_cat_course = pre_process.divide_course(
    data_pre, attrs, attrs_cat
)

data_course['all'] = data_pre.copy()
attrs_course['all'] = attrs.copy()
attrs_cat_course['all'] = attrs_cat.copy()
attrs_cat_course['all'].append('course')

### Process Subjects and CEP

The Subjects are processed for each course individually, and Cep needs to be processed before all other attributes and drop_duplicates, otherwise it takes too long to process.

In [6]:
keys = data_course.keys()
for course in keys:
    print(course)
    data = data_course[course].copy()
    attrs = attrs_course[course]

    # Process subjects
    data = pre_process.add_semester_prefix(data)
    data = pre_process.subjects(data, attrs, horizon, credits_dict)

    data = pre_process.remove_anomalies(data)

    # Process CEP, it needs to be processed before drop_duplicates, or it takes too long to process.
    # data = data[attrs+['cep', 'aluno']].drop_duplicates()
    # data = pre_process.cep(data, attrs)

    data_course[course] = data.copy()

engenharia_mecatrônica
all


In [7]:
data_process = data_course.copy()

## Analysis

Cotas:

In [8]:
# for course in data_course.keys():
#     print(f'{course} = {data_course[course].shape}')
#     print(f'quota \n{data_course[course]["quota"].value_counts()}')
#     print('\n')

Desbalanceamento da base:

In [9]:
for course in data_course.keys():
    print(f'{course}:')
    freq = data_course[course]['dropout'].value_counts()
    print(freq)
    print('\n')

engenharia_mecatrônica:
True     1
False    1
Name: dropout, dtype: int64


all:
True     1
False    1
Name: dropout, dtype: int64




In [10]:
# utils.double_bar_graph(data_course['all'], 'age', 'dropout')

In [11]:
data_course['all'].columns

Index(['age', 'aluno', 'cep', 'course', 'dropout', 'entry', 'female',
       'periodo_ingresso_curso', 'periodo_ingresso_unb', 'periodo_saida_curso',
       'public_school', 'quota', 'quota_type',
       '1_algoritmos_e_programacao_de_computadores', '1_calculo_1',
       '1_fisica_1', '1_fisica_1_experimental', '1_geometria_descritiva',
       '1_introducao_a_engenharia_mecatronica', '1_quimica_geral_experimental',
       '1_quimica_geral_teorica', '2_calculo_2',
       '2_desenho_mecanico_assistido_por_computador_1', '2_desenho_tecnico',
       '2_estruturas_de_dados', '2_fisica_2', '2_fisica_2_experimental',
       '2_ingles_instrumental_1', '2_ingles_instrumental_2',
       '2_introducao_a_algebra_linear', '2_probabilidade_e_estatistica',
       '1_relative_credits', '1_absolute_credits', '1_total_credits',
       '2_relative_credits', '2_absolute_credits', '2_total_credits', '1_ira',
       '1_total', '2_ira', '2_total'],
      dtype='object', name='nome_disciplina')

## Testes

In [12]:
data_test = data_course['all']

## Processamento

In [13]:
def catboost(data, attrs, attrs_cat, course):
    output_attr = 'dropout'

    X = data[attrs].drop(columns=[output_attr])
    y = data[output_attr]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )   

    # dropout imbalance
    output_freq = data[output_attr].value_counts()
    scale = output_freq[0] / output_freq[1]  # negative / positive

    model = CatBoostClassifier(scale_pos_weight=scale, depth=5, one_hot_max_size=4)
    model.fit(
        X_train, y_train,
        cat_features=attrs_cat, plot=False, silent=True
    )

#     cv_dataset = Pool(
#         data=X,
#         label=y,
#         cat_features=attrs_cat
#     )

#     params = {
#         "iterations": 100,
#         "depth": 2,
#         "loss_function": "Logloss",
#         "verbose": False,
#         "depth": 5,
#         "scale_pos_weight": scale
#     }

#     scores = cv(
#         cv_dataset,
#         params,
#         fold_count=5, 
#         plot="True",
#         return_models="True"
#     )[4]
    
#     print(scores)


    process.log_metrics(model, X_train, y_train, X_test, y_test, course)
    # utils.precision_recall_graph(model, X_test, y_test)

    # shap importance
    shap_values = model.get_feature_importance(
        Pool(X_test, label=y_test, cat_features=attrs_cat), 
        type="ShapValues"
    )
    shap_values = shap_values[:,:-1]
    shap.summary_plot(
        shap_values, X_test, 
        # plot_type='bar', 
        plot_size=(15,15),
        max_display=30,
        show=False
    )

    # Log summary_plot
    plt.savefig(f'results/summary_plot_{course}.png', dpi=200, bbox_inches='tight', facecolor='#ffffff')
    with open('results/README.md', 'a+') as f:
        f.write(f'![summary_plot_{course}](summary_plot_{course}.png)\n')
    plt.clf()

    return model, X_test

In [14]:
# Clear the temporary results directory
for f in os.listdir('results'):
    os.remove(os.path.join('results', f))

Process all dataframes:

In [15]:
for course, data in data_process.items():
    print(f'Curso = {course}')
    model, X_test = catboost(
        data,
        attrs_course[course],
        attrs_cat_course[course],
        course
    )

Curso = engenharia_mecatrônica


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

Process a single dataframe:

In [None]:
# course = 'all'
# model, X_test = catboost(
#         data_process[course],
#         attrs_course[course],
#         attrs_cat_course[course],
#         course
#     )

## Using SHAP

In [None]:
# explainer = shap.TreeExplainer(model)
# start_index = 0
# end_index = len(X_test)
# shap_values = explainer.shap_values(X_test[start_index:end_index])

# shap.initjs()

In [None]:
# id = 15
# shap.force_plot(explainer.expected_value,
#                 shap_values[id:id+1],
#                 X_test[id:id+1],
#                 matplotlib = True,
#                 show = False)

# plt.savefig(f'results/explainer{id}.png', dpi=200, bbox_inches='tight', facecolor='#ffffff')

In [None]:
# shap.force_plot(explainer.expected_value,
#                 shap_values,
#                 X_test[start_index:end_index])

In [None]:
# shap.summary_plot(shap_values, X_test[start_index:end_index], plot_size=(20,20), max_display=50)

In [None]:
# shap.summary_plot(shap_values, X_test, plot_type='bar', max_display=50)

## Plot Student's House Coordinates

In [None]:
# attr = ['aluno', 'cep', 'dropout']
# # keys = data_course.keys()
# keys = ['all']
# for course in keys:
#     print(course)
#     data = data_course[course].copy()[attr].drop_duplicates()
#     utils.plot_coordinates(data, course)
# #     utils.plot_coordinates_density(data, course)