<a href="https://colab.research.google.com/github/denistoo749/Academic-Success-Classification/blob/main/academic_success.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification with an Academic Success Dataset
**1. Problem**
- Predict each student's academic outcome (`Dropout`, `Enrolled`, or `Graduate`) in higher education, so that students at academic risk can be identified.

**2. Data**
- Files
```
train.csv - the training dataset; Target is the categorical target
test.csv - the test dataset; your objective is to predict the class of Target for each row
sample_submission.csv - a sample submission file in the correct format
```

>https://www.kaggle.com/competitions/playground-series-s4e6/data

**3. Evaluation**
- Submissions are evaluated using the accuracy score.


In [None]:
# # Unzip the file
# !unzip '/content/drive/MyDrive/Academic Success Classification/playground-series-s4e6.zip' -d '/content/drive/MyDrive/Academic Success Classification/data/'

Import necessary tools

In [None]:
# Import necessary tools
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Load the labelled training rows from Drive and preview the first five
train_csv = '/content/drive/MyDrive/Academic Success Classification/data/train.csv'
df = pd.read_csv(filepath_or_buffer=train_csv)
df.head(5)

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [None]:
# Show the column dtypes and non-null counts of the training data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [None]:
# Count the missing values in each training column
df.isna().sum()

id                                                0
Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance                        0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship 

In [None]:
# Separate the label column (y) from the feature columns (X)
# NOTE(review): only 'Target' is removed, so the 'id' column stays in X as a
# feature -- confirm this is intended.
y = df['Target']
X = df.drop(columns=['Target'])

# Preprocess data

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

# Display the shape of each of the four pieces
tuple(part.shape for part in split_parts)

((61214, 37), (15304, 37), (61214,), (15304,))

In [None]:
# Feature scaling: fit the scaler on the training split only, then apply the
# same fitted transform to both splits.
# Note: x_train and x_test are overwritten here, so re-run the split cell
# before re-running this one.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Modelling

In [None]:
# Seed NumPy's global random number generator
np.random.seed(42)

# Candidate models, keyed by name
models = {}
models['RandomForestClassifier'] = RandomForestClassifier()
models['LogisticRegression'] = LogisticRegression()

def fit_and_score(model):
  """Fit every estimator in a mapping and score it on the hold-out split.

  Args:
    model: dict mapping a display name to an estimator object that provides
      `fit(X, y)` and `score(X, y)`. The name is singular for backward
      compatibility; a dict of models is expected.

  Returns:
    dict mapping each name to the value returned by that estimator's
    `score(x_test, y_test)`.

  Reads the notebook-level variables x_train, y_train, x_test and y_test.
  Each estimator is fitted in place.
  """
  # Initialize the dictionary that keeps the model scores
  model_scores = {}

  # Iterate over the mapping that was passed in. Previously the argument was
  # ignored and the global `models` was used, with the loop variable shadowing
  # the parameter.
  for name, estimator in model.items():
    # Fit the model
    estimator.fit(x_train, y_train)

    # Evaluate the model
    model_scores[name] = estimator.score(x_test, y_test)

  return model_scores

In [None]:
# Fit both models on the training split and score them on the hold-out split
fit_and_score(models)

{'RandomForestClassifier': 0.8277574490329326,
 'LogisticRegression': 0.8190015682174595}

## Prediction

In [None]:
# Load the unlabelled competition test rows from Drive and preview the first five
test_csv = '/content/drive/MyDrive/Academic Success Classification/data/test.csv'
test_df = pd.read_csv(filepath_or_buffer=test_csv)
test_df.head(5)

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,76518,1,1,1,9500,1,1,141.0,1,3,...,0,0,8,0,0,0.0,0,13.9,-0.3,0.79
1,76519,1,1,1,9238,1,1,128.0,1,1,...,0,0,6,6,6,13.5,0,11.1,0.6,2.02
2,76520,1,1,1,9238,1,1,118.0,1,1,...,0,0,6,11,5,11.0,0,15.5,2.8,-4.06
3,76521,1,44,1,9147,1,39,130.0,1,1,...,0,3,8,14,5,11.0,0,8.9,1.4,3.51
4,76522,1,39,1,9670,1,1,110.0,1,1,...,0,0,6,9,4,10.666667,2,7.6,2.6,0.32


In [None]:
# Show the column dtypes and non-null counts of the test data
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51012 entries, 0 to 51011
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              51012 non-null  int64  
 1   Marital status                                  51012 non-null  int64  
 2   Application mode                                51012 non-null  int64  
 3   Application order                               51012 non-null  int64  
 4   Course                                          51012 non-null  int64  
 5   Daytime/evening attendance                      51012 non-null  int64  
 6   Previous qualification                          51012 non-null  int64  
 7   Previous qualification (grade)                  51012 non-null  float64
 8   Nacionality                                     51012 non-null  int64  
 9   Mother's qualification                 

In [None]:
# Count the missing values in each test column
test_df.isna().sum()

id                                                0
Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance                        0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship 

In [None]:
# The forest was fitted on features transformed by `scaler`, so the competition
# test set must go through the same fitted scaler before predicting. Passing the
# raw frame would compare unscaled values against splits learned on scaled ones.
y_preds = models['RandomForestClassifier'].predict(scaler.transform(test_df))



In [None]:
# Display the predicted labels
y_preds

array(['Dropout', 'Graduate', 'Graduate', ..., 'Dropout', 'Graduate',
       'Dropout'], dtype=object)

In [None]:
# Check the shape of the prediction array
y_preds.shape

(51012,)

In [None]:
# Pair each test id with its predicted label and write the submission file
submission_columns = {'id': test_df['id'], 'Target': y_preds}
submission = pd.DataFrame(submission_columns)

submission_path = '/content/drive/MyDrive/Academic Success Classification/data/submission.csv'
submission.to_csv(submission_path, index=False)

In [None]:
# Read the saved submission back from disk and preview its first five rows
saved_submission_csv = '/content/drive/MyDrive/Academic Success Classification/data/submission.csv'
data = pd.read_csv(filepath_or_buffer=saved_submission_csv)
data.head(5)

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Graduate
4,76522,Graduate


# Hyperparameter Tuning using RandomizedSearchCV

In [None]:
np.random.seed(42)

# Search space for the random forest.
# 'auto' is intentionally not offered for max_features: for classifiers it was
# only an alias of 'sqrt' (so it duplicated a candidate), and it was deprecated
# in scikit-learn 1.1 and removed in 1.3, where it makes the fit fail.
grid = {
    'n_estimators': [100, 200],  # Number of trees
    'max_features': ['sqrt', 'log2'],  # Number of features to consider at each split
    'max_depth': [10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 4],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2],  # Minimum number of samples required at each leaf node
    'bootstrap': [True]  # Whether bootstrap samples are used when building trees
}

# Setup RandomizedSearchCV
rs_rf = RandomizedSearchCV(estimator=models['RandomForestClassifier'],
                           param_distributions=grid,
                           n_iter=10, # number of models to try
                           cv=5,
                           verbose=True,
                           n_jobs=2,
                           random_state=42) # candidate sampling no longer depends on the global NumPy RNG state

# Fit the RandomizedSearchCV version
rs_rf.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Hyperparameter combination selected by the randomized search
rs_rf.best_params_

{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': True}

In [None]:
# The search was fitted on features transformed by `scaler`, so the competition
# test set must go through the same fitted scaler before predicting.
rs_y_preds = rs_rf.predict(scaler.transform(test_df))



In [None]:
# Display the labels predicted by the randomized-search model
rs_y_preds

array(['Dropout', 'Graduate', 'Graduate', ..., 'Dropout', 'Graduate',
       'Dropout'], dtype=object)

In [None]:
# Pair each test id with the randomized-search prediction and write the submission file.
# Note: this writes to the same path as the earlier submission and replaces it.
submission_columns = {'id': test_df['id'], 'Target': rs_y_preds}
submission = pd.DataFrame(submission_columns)

submission_path = '/content/drive/MyDrive/Academic Success Classification/data/submission.csv'
submission.to_csv(submission_path, index=False)

# Hyperparameter Tuning using GridSearchCV

In [None]:
np.random.seed(42)

# Search space for the exhaustive grid search.
# 'auto' is intentionally not offered for max_features: for classifiers it was
# only an alias of 'sqrt' (so it duplicated candidates), and it was deprecated
# in scikit-learn 1.1 and removed in 1.3, where it makes the fit fail.
grid = {
    'n_estimators': [200, 300],  # Number of trees
    'max_features': ['sqrt', 'log2'],  # Number of features to consider at each split
    'max_depth': [20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2],  # Minimum number of samples required at each leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Setup GridSearchCV (tries every combination in `grid`)
gs_rf = GridSearchCV(estimator=models['RandomForestClassifier'],
                     param_grid=grid,
                     n_jobs=-1,
                     cv=5,
                     verbose=True)

# Fit the GridSearchCV version of the classifier
gs_rf.fit(x_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits




In [None]:
# Hyperparameter combination selected by the grid search
gs_rf.best_params_

In [None]:
# The search was fitted on features transformed by `scaler`, so the competition
# test set must go through the same fitted scaler before predicting.
gs_y_preds = gs_rf.predict(scaler.transform(test_df))

In [None]:
# Display the labels predicted by the grid-search model
gs_y_preds

In [None]:
# submission = pd.DataFrame({'id': test_df['id'], 'Target': gs_y_preds})
# submission.to_csv('/content/drive/MyDrive/Academic Success Classification/data/submission.csv', index=False)