# Análise de Performance de Estudantes

#### Importações 

In [9]:
from repository.loader_datasource import load_csv_data
from services.dataframe_formatter import header_formatter

import plotly.graph_objs as go
import plotly.express as px
import pandas as pd

#### Carregamento e Tratativa dos Dados

In [10]:
# Define the path to the datasource
path = '../datasource/StudentsPerformance.csv'

# Call the function to load the data and format the header
df = header_formatter(load_csv_data(path))


In [11]:
df.isnull().sum()

gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

In [12]:
# Add a new column to the dataframe with the average of the three tests
df['final_score_avg'] = df[['math_score', 'reading_score', 'writing_score']].mean(axis=1).round()

df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,final_score_avg
0,female,group B,bachelor's degree,standard,none,72,72,74,73.0
1,female,group C,some college,standard,completed,69,90,88,82.0
2,female,group B,master's degree,standard,none,90,95,93,93.0
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.0
4,male,group C,some college,standard,none,76,78,75,76.0
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,94.0
996,male,group C,high school,free/reduced,none,62,55,55,57.0
997,female,group C,high school,free/reduced,completed,59,71,65,65.0
998,female,group D,some college,standard,completed,68,78,77,74.0


#### Analise Exploratória

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       1000 non-null   object 
 1   race_ethnicity               1000 non-null   object 
 2   parental_level_of_education  1000 non-null   object 
 3   lunch                        1000 non-null   object 
 4   test_preparation_course      1000 non-null   object 
 5   math_score                   1000 non-null   int64  
 6   reading_score                1000 non-null   int64  
 7   writing_score                1000 non-null   int64  
 8   final_score_avg              1000 non-null   float64
dtypes: float64(1), int64(3), object(5)
memory usage: 70.4+ KB


In [14]:
df.describe()

Unnamed: 0,math_score,reading_score,writing_score,final_score_avg
count,1000.0,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054,67.762
std,15.16308,14.600192,15.195657,14.258354
min,0.0,17.0,10.0,9.0
25%,57.0,59.0,57.75,58.0
50%,66.0,70.0,69.0,68.0
75%,77.0,79.0,79.0,78.0
max,100.0,100.0,100.0,100.0


#### Analise Gráfica

In [15]:
# Group the final_score_avg by gender
df_final_avg_grouped_by_gender = df.groupby(['gender', 'race_ethnicity'])['final_score_avg'].mean().reset_index()

# Create a plotly bar chart
fig = px.bar(df_final_avg_grouped_by_gender, 
             x='race_ethnicity', y='final_score_avg', 
             text='final_score_avg', 
             title='Comparação de Médias Finais por Gênero e Etnia',  
             labels={'final_score_avg': 'Média de Notas Finais', 'gender': 'Gênero'}, 
             color='gender', barmode='group')

fig.update_traces(texttemplate='%{text:.2s}', textposition='inside')

fig.show()

#### Média de Pontuação Final por Gênero e Grupo Étnico
Em uma analise inicial, é possível perceber que de uma maneira global para todas as etinias o genero feminino possui destaque em relação ao genero masculino.

| gender | race_ethnicity | final_score_avg |
|--------|----------------|-----------------|
| female | group A        | 65.083333       |
| female | group B        | 67.548077       |
| female | group C        | 68.588889       |
| female | group D        | 71.441860       |
| female | group E        | 74.014493       |
| male   | group A        | 61.509434       |
| male   | group B        | 62.988372       |
| male   | group C        | 65.223022       |
| male   | group D        | 66.984962       |
| male   | group E        | 71.450704       |




In [30]:
# Create a plot scatter with the score distribution
fig = px.bar(df, x='final_score_avg', y='math_score', color='gender', title='Distribuição de Notas - Matemática em relação a Média Final', labels={'final_score_avg': 'Média de Notas Finais', 'reading_score': 'Nota de Leitura'})

fig.show()

Também é possível perceber que a média para as mulheres é maior que a média para os homens quando comparamos os dois grupos e olhando para o resultado final apresentado na disciplina de matemática.

In [17]:
fig = px.box(df, x="race_ethnicity", y="final_score_avg", color="gender",
             title="Distribuição da Pontuação Final por Gênero e Grupo Étnico",
             labels={"race_ethnicity": "Grupo Étnico", "final_score_avg": "Média da Pontuação Final"},
             points="all")  # Adiciona pontos individuais ao box plot

fig.show()

In [31]:
df_gender_groupeb_test_preparation = df.groupby(['gender', 'test_preparation_course'])['final_score_avg'].median().reset_index()

fig = px.bar(df_gender_groupeb_test_preparation, 
             x='test_preparation_course', 
             y='final_score_avg', 
             color='gender', 
             title='Distribuição da Pontuação Final por Gênero e Preparação para o Teste', 
             labels={'final_score_avg': 'Média da Pontuação Final'}, 
             barmode='group',
             text='final_score_avg',
             category_orders={'test_preparation_course': ['none', 'completed']})

fig.show()

- Criando uma tabela com a listagem de outliers

In [32]:
df_gender_groupeb_test_preparation

Unnamed: 0,gender,test_preparation_course,final_score_avg
0,female,completed,75.0
1,female,none,67.5
2,male,completed,71.0
3,male,none,63.0


In [33]:
from scipy.stats import zscore
from numpy import abs

z = abs(zscore(df['final_score_avg']))

df_outliers = df[z >= 3.0]

df_outliers.sort_values(by='final_score_avg', ascending=False)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,final_score_avg
327,male,group A,some college,free/reduced,none,28,23,19,23.0
596,male,group B,high school,free/reduced,none,30,24,15,23.0
980,female,group B,high school,free/reduced,none,8,24,23,18.0
59,female,group C,some high school,free/reduced,none,0,17,10,9.0


In [20]:
df_grouped_by_parental_level_of_education = df.groupby(['parental_level_of_education'])[['math_score', 'reading_score', 'writing_score', 'final_score_avg']].mean().reset_index()

df_grouped_by_parental_level_of_education

Unnamed: 0,parental_level_of_education,math_score,reading_score,writing_score,final_score_avg
0,associate's degree,67.882883,70.927928,69.896396,69.558559
1,bachelor's degree,69.389831,73.0,73.381356,71.949153
2,high school,62.137755,64.704082,62.44898,63.122449
3,master's degree,69.745763,75.372881,75.677966,73.576271
4,some college,67.128319,69.460177,68.840708,68.446903
5,some high school,63.497207,66.938547,64.888268,65.072626


In [34]:
# Plot a bar chart with df_grouped_by_parental_level_of_education data

fig = px.bar(df_grouped_by_parental_level_of_education, 
             x='parental_level_of_education', 
             y=['math_score', 'reading_score', 'writing_score', 'final_score_avg'], 
             title='Comparação de Médias por Nível de Educação dos Pais', 
             labels={'value': 'Média de Notas', 'variable': 'Area', 'parental_level_of_education': 'Nível de Educação dos Pais'}, 
             text_auto=True,
             barmode='group')

fig.show()

## Treinamento do Modelo

### Organizando os dados de Treino e Testes

In [38]:
# Transform features in dummies
df_dummies = pd.get_dummies(df, columns=['gender', 'lunch', 'parental_level_of_education', 'race_ethnicity', 'test_preparation_course'])

In [37]:
# Split target and features
X = df_dummies.drop(columns=['final_score_avg'])
y = df_dummies['final_score_avg']


In [39]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Regressão Logistica

In [63]:
# Train a linear regression model
from sklearn.linear_model import LinearRegression

model = LinearRegression()

lr_predict_result = model.fit(X_train, y_train)


In [None]:
coef = lr_predict_result.coef_

print(f' O Resultado dos coeficientes foram: {coef}')

 O Resultado dos coeficientes foram: [ 0.3352442   0.33271072  0.33144849  0.02753976 -0.02753976 -0.00350332
  0.00350332  0.00124485  0.04110922  0.0417924   0.00959069 -0.04002378
 -0.05371338 -0.01200568  0.02719482 -0.00212165  0.02322398 -0.03629146
  0.01307538 -0.01307538]


In [None]:
from sklearn.metrics import r2_score

# Calculate the R2 score
r2 = r2_score(y_test, model.predict(X_test))

print(f' Verificando o resultado de R2: {r2}')


 Verificando o resultado de R2: 0.9996941579270728


In [None]:
# Calculo do Errp Quadrático Médio (MSE - Mean Squared Error)
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred=model.predict(X_test))

print(f' Verificando o resultado de MSE: {mse}')

 Verificando o resultado de MSE: 0.0689914844021844


In [68]:
# Raiz do Erro Quadrático Médio (RMSE - Root Mean Squared Error)
from sklearn.metrics import root_mean_squared_error
rmse = root_mean_squared_error(y_test, y_pred=model.predict(X_test))

print("Raiz do Erro Quadrático Médio (RMSE):", rmse)

Raiz do Erro Quadrático Médio (RMSE): 0.26266230106771016


In [69]:
# Erro Médio Absoluto (MAPE - Mean Absolute Percentage Error)
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape = mean_absolute_percentage_error(y_test, y_pred=model.predict(X_test))

print(f'O Resultado foi:', mape)

O Resultado foi: 0.3485412701296795


In [74]:
# Gáfico de Residuos plotly

residuals = y_test - model.predict(X_test)

fig = px.scatter(x=y_test, y=residuals, title='Resíduos do Modelo de Regressão Linear', labels={'x': 'Valor Real', 'y': 'Resíduos'})

fig.show()

In [80]:
# Score do modelo
score = model.score(X_test, y_test)

print(f' O Score do modelo foi: {score}')

 O Score do modelo foi: 0.9996941579270728


In [None]:
# Classification report
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)



Score: 0.9996941579270728
