# Importação das bibliotecas

In [1]:
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from statstests.process import stepwise
from sklearn.metrics import confusion_matrix, accuracy_score,\
    ConfusionMatrixDisplay, recall_score
# Remove the row cap on displayed output so frames and series print in full.
pd.set_option("display.max_rows", None)

# Leitura do DataSet e visualização dos dados

In [None]:
# Kaggle-style path to the same file, kept for reference (presumably used
# when the notebook runs on Kaggle):
#'/kaggle/input/student-depression-dataset/Student Depression Dataset.csv'
dataset_path = 'Student Depression Dataset.csv'
# Load the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(dataset_path)
# Preview the first rows.
df.head()

## Verificando o tipo das variáveis

In [None]:
# Print each column's name, non-null count and dtype.
df.info()

## Removendo valores nulos

In [434]:
# Drop every row containing at least one missing value (modifies df in place).
df.dropna( inplace=True)

## Renomeando as colunas

In [433]:
# Short aliases for the long survey column names; columns not listed here
# keep their original names.
short_names = {
    'Academic Pressure': 'AP',
    'Work Pressure': 'WP',
    'Study Satisfaction': 'SS',
    'Job Satisfaction': 'JS',
    'Sleep Duration': 'SD',
    'Dietary Habits': 'DH',
    'Have you ever had suicidal thoughts ?': 'ST',
    'Work/Study Hours': 'WSH',
    'Financial Stress': 'FS',
    'Family History of Mental Illness': 'FHMI',
}
df.rename(columns=short_names, inplace=True)

## Removendo a coluna ID

In [435]:
# Remove the 'id' column from df in place.
df.drop(columns='id',inplace=True)

# Analisando as variáveis quantitativas

## Age

In [None]:
# Number of rows per age, ordered by age value.
df['Age'].value_counts().sort_index()

In [440]:
# Keep only rows with Age below 35; the filter is applied to df in place.
df.query('Age < 35',inplace=True)

In [None]:
# Summary statistics of the Age column.
df['Age'].describe()

## CGPA

# Box plot of the CGPA column.
sns.boxplot(data=df['CGPA'])

# Summary statistics of CGPA.
df['CGPA'].describe()

# Number of rows per distinct CGPA value.
df['CGPA'].value_counts()

In [None]:
# Keep only rows whose CGPA value occurs more than 9 times in the data.
cgpa_counts = df['CGPA'].value_counts()
filtered_cgpa = cgpa_counts[cgpa_counts > 9].index.tolist()
df.query('CGPA in @filtered_cgpa', inplace=True)

# Variáveis Categóricas

## Gender

In [None]:
# Number of rows per gender label.
df['Gender'].value_counts().sort_index()

In [None]:
# Encode gender as 0 (Female) / 1 (Male). Series.map is used, like the other
# categorical encodings in this notebook, instead of replace(): swapping
# strings for ints on an object column relies on pandas' silent downcasting,
# which is deprecated (FutureWarning since pandas 2.2).
# A label missing from the mapping becomes NaN, so the int64 cast fails loudly
# instead of letting an unexpected category through. Run once per freshly
# loaded df: mapping already-encoded values would also produce NaN.
df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1}).astype('int64')

## City

In [None]:
# Number of rows per city (value_counts lists the most frequent first).
df['City'].value_counts()

In [443]:
# Keep only rows whose city appears more than twice in the data.
city_counts = df['City'].value_counts()
filtered_cities = city_counts[city_counts > 2].index.tolist()
df.query('City in @filtered_cities', inplace=True)

In [None]:
# Re-check the per-city counts now that rare cities have been filtered out.
df['City'].value_counts()

## Profession

In [None]:
# Number of rows per profession.
df['Profession'].value_counts()

In [446]:
# Keep only rows whose Profession is "Student". After that filter the column
# holds a single value for every remaining row, so it is dropped.
df.query('Profession == "Student"',inplace=True)
df.drop(columns=['Profession'],axis=1,inplace=True)

## Academic Pressure

In [None]:
# Number of rows per academic-pressure score (AP), ordered by score.
df['AP'].value_counts().sort_index()

In [448]:
# Keep only rows whose academic-pressure score (AP) is strictly positive.
df.query('AP > 0.0 ', inplace=True)

## Work Pressure 

In [None]:
# Number of rows per work-pressure score (WP), ordered by score.
df['WP'].value_counts().sort_index()

In [450]:
# Remove the WP (Work Pressure) column from df.
# NOTE(review): presumably uninformative once only students remain — confirm
# against the counts shown by the previous cell.
df.drop(columns=['WP'],inplace=True)

## Study Satisfaction

In [None]:
# Number of rows per study-satisfaction score (SS), ordered by score.
df['SS'].value_counts().sort_index()

In [400]:
# Keep only rows whose study-satisfaction score (SS) is strictly positive.
df.query('SS > 0.0 ', inplace=True)

## Job Satisfaction 

In [None]:
# Number of rows per job-satisfaction score (JS), ordered by score.
df['JS'].value_counts().sort_index()

In [402]:
# Remove the JS (Job Satisfaction) column from df.
# NOTE(review): presumably uninformative once only students remain — confirm
# against the counts shown by the previous cell.
df.drop(columns=['JS'],inplace=True)

## Sleep Duration

In [None]:
# Number of rows per sleep-duration label (SD), ordered by label.
df['SD'].value_counts().sort_index()

In [404]:
# Discard rows whose sleep duration is the "Others" label, which has no code
# in the ordinal mapping applied next.
df.query('SD != "Others" ', inplace=True)

In [405]:
# Encode sleep duration as an ordinal code: 1 for the shortest band up to 4
# for the longest. Labels absent from the mapping would become NaN.
sleep_bands = (
    'Less than 5 hours',
    '5-6 hours',
    '7-8 hours',
    'More than 8 hours',
)
mapping = {band: code for code, band in enumerate(sleep_bands, start=1)}

df['SD'] = df['SD'].map(mapping)

## Dietary Habits

In [None]:
# Number of rows per dietary-habits label (DH), ordered by label.
df['DH'].value_counts().sort_index()

In [407]:
# Discard rows whose dietary habit is the "Others" label, which has no code
# in the ordinal mapping applied next.
df.query('DH != "Others" ', inplace=True)

In [408]:
mapping = {
    'Healthy': 0,
    'Moderate': 1,
    'Unhealthy': 3
}

df['DH'] = df['DH'].map(mapping)

## Degree

In [None]:
# Number of rows per degree, with the degree labels in descending order.
df['Degree'].value_counts().sort_index(ascending=False)

## Have you ever had suicidal thoughts ?

In [None]:
# Number of rows per answer to the suicidal-thoughts question (ST).
df['ST'].value_counts().sort_index()

In [411]:
# Encode the suicidal-thoughts answer as 0 ('No') / 1 ('Yes').
mapping = {answer: code for code, answer in enumerate(('No', 'Yes'))}

df['ST'] = df['ST'].map(mapping)

## Work/Study Hours

In [None]:
# Number of rows per work/study-hours value (WSH), ordered by value.
df['WSH'].value_counts().sort_index()


## Financial Stress

In [None]:
# Number of rows per financial-stress value (FS), ordered by value.
df['FS'].value_counts().sort_index()

## Family History of Mental Illness

In [None]:
# Number of rows per family-history-of-mental-illness answer (FHMI).
df['FHMI'].value_counts().sort_index()

In [None]:
# Encode family history of mental illness as 0 ('No') / 1 ('Yes'). Series.map
# is used, like the other categorical encodings in this notebook, instead of
# replace(): swapping strings for ints on an object column relies on pandas'
# silent downcasting, which is deprecated (FutureWarning since pandas 2.2).
# A label missing from the mapping becomes NaN, so the int64 cast fails loudly
# instead of letting an unexpected category through. Run once per freshly
# loaded df: mapping already-encoded values would also produce NaN.
df['FHMI'] = df['FHMI'].map({'No': 0, 'Yes': 1}).astype('int64')

## Depression

In [None]:
# Class balance of the response variable (Depression).
df['Depression'].value_counts().sort_index()

# Implementação dos Algoritmos

## Realizando o processo de dummies para a variável cidade

In [419]:
# One-hot encode City into integer indicator columns; drop_first=True leaves
# out the first category's column. The result is a new frame, df is untouched.
df_dummies = pd.get_dummies(df, columns=['City'], dtype=int, drop_first=True)

In [None]:
# (rows, columns) of the encoded frame.
df_dummies.shape

## Modelo Logístico

### Gerando a fórmula para o modelo

In [421]:
# Swap hyphens for underscores in every column name: inside a formula string
# a hyphen in a name would be read as the minus operator.
df_dummies.columns = df_dummies.columns.str.replace('-', '_', regex=True)

# Right-hand side of the formula: every column except the response,
# joined with ' + '; Depression is the response on the left.
list_colums_dummies = df_dummies.columns.drop('Depression').tolist()
formula_dummies_model = f"Depression ~ {' + '.join(list_colums_dummies)}"

### Realizando o treino do modelo:

In [None]:
# Fit a logistic regression (statsmodels Logit) from the formula string,
# using df_dummies as the data.
model = sm.Logit.from_formula(formula_dummies_model,df_dummies).fit()

# Show the fitted model's summary table.
model.summary()

### Executando o processo de stepwise para aprimorar o modelo

In [None]:
# Run the statstests stepwise procedure on the fitted model with a p-value
# limit of 0.05.
stepwise_model = stepwise(model,pvalue_limit=0.05)

In [None]:
# Show the summary table of the model returned by the stepwise procedure.
stepwise_model.summary()

### Analisando os resultados do modelo

In [426]:
def matriz_confusao(predicts, observado, cutoff):
    """Plot a confusion matrix for a cutoff and return summary metrics.

    Each predicted value is classified as 0 when it is below ``cutoff`` and
    as 1 otherwise. The matrix is plotted and shown, then sensitivity,
    specificity and accuracy are computed against the observed labels.

    Args:
        predicts: Object exposing ``.values`` with the predicted values
            (a DataFrame column where this function is called in this file).
        observado: Observed class labels; classes 0 and 1 are the ones
            scored for specificity and sensitivity respectively.
        cutoff: Threshold applied to each predicted value.

    Returns:
        A one-row DataFrame with the columns 'Sensitividade' (recall of
        class 1), 'Especificidade' (recall of class 0) and 'Acurácia'.
    """
    classificado = [0 if valor < cutoff else 1 for valor in predicts.values]

    # The classified labels are passed first, so the matrix rows follow the
    # classification and the columns follow the observed class, which is
    # what the axis labels below say.
    matriz = confusion_matrix(classificado, observado)
    ConfusionMatrixDisplay(confusion_matrix=matriz).plot()
    plt.xlabel('True')
    plt.ylabel('Classified')
    # Reverse the direction of both axes before showing the figure.
    eixos = plt.gca()
    eixos.invert_xaxis()
    eixos.invert_yaxis()
    plt.show()

    # Main indicators derived from this confusion matrix.
    return pd.DataFrame({
        'Sensitividade': [recall_score(observado, classificado, pos_label=1)],
        'Especificidade': [recall_score(observado, classificado, pos_label=0)],
        'Acurácia': [accuracy_score(observado, classificado)],
    })

In [427]:
# Store the stepwise model's predictions in a new 'phat' column.
# predict() is called without arguments — presumably it returns the fitted
# values for the rows the model was estimated on; confirm the length matches
# df_dummies.
df_dummies['phat'] = stepwise_model.predict()

In [None]:
# Plot the confusion matrix and compute the indicators for the stored
# predictions, using a 0.5 cutoff.
matriz_confusao(observado=df_dummies['Depression'],
                predicts=df_dummies['phat'],
                cutoff=0.5)