Useful sources:
- [YouTube video](https://youtu.be/GP-2634exqA?si=UBOKqQcrudjINcL4)
- [Working with tabular data in Python](https://wandb.ai/mostafaibrahim17/ml-articles/reports/Working-with-tabular-data-in-Python--Vmlldzo4MTU4OTgx)

# Step 1: Setting up the Python environment

In [None]:
!pip install basemap
!pip install scikeras
!pip install tensorflow

Collecting basemap
  Downloading basemap-1.4.1-cp311-cp311-manylinux1_x86_64.whl.metadata (9.1 kB)
Collecting basemap-data<1.4,>=1.3.2 (from basemap)
  Downloading basemap_data-1.3.2-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting matplotlib<3.9,>=1.5 (from basemap)
  Downloading matplotlib-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Collecting pyproj<3.7.0,>=1.9.3 (from basemap)
  Downloading pyproj-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)
Collecting packaging<24.0,>=16.0 (from basemap)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Downloading basemap-1.4.1-cp311-cp311-manylinux1_x86_64.whl (942 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m942.4/942.4 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading basemap_data-1.3.2-py2.py3-none-any.whl (30.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.5/30.5 MB[0m [31m57.4 MB/s[0m eta 

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Step 2: Data collection and preprocessing

In [77]:
# Load the dataset from the CSV file next to the notebook
csv_path = "database.csv"
data = pd.read_csv(csv_path)

# Print the column names, non-null counts and dtypes ...
data.info()
# ... and show the (rows, columns) shape as the cell's result
data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2392 non-null   int64  
 1   Age                2392 non-null   int64  
 2   Gender             2392 non-null   int64  
 3   Ethnicity          2392 non-null   int64  
 4   ParentalEducation  2392 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   float64
 6   Absences           2392 non-null   int64  
 7   Tutoring           2392 non-null   int64  
 8   ParentalSupport    2392 non-null   int64  
 9   Extracurricular    2392 non-null   int64  
 10  Sports             2392 non-null   int64  
 11  Music              2392 non-null   int64  
 12  Volunteering       2392 non-null   int64  
 13  GPA                2392 non-null   float64
 14  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(12)
memory usage: 280.4 KB


(2392, 15)

In [78]:
# Percentage of null values in each column
null_counts = data.isna().sum()
null_counts / len(data) * 100

Unnamed: 0,0
StudentID,0.0
Age,0.0
Gender,0.0
Ethnicity,0.0
ParentalEducation,0.0
StudyTimeWeekly,0.0
Absences,0.0
Tutoring,0.0
ParentalSupport,0.0
Extracurricular,0.0


In [79]:
# Handling missing values
# Any missing (NaN) entries are filled by linear interpolation in the forward direction.
# Median/mean imputation or a KNNImputer would be alternatives; since the dataset
# has no missing values, this step is simply kept as it is.
interpolation_options = {'method': 'linear', 'limit_direction': 'forward'}
data = data.interpolate(**interpolation_options)

In [None]:
# Check for duplicates
# Report the number of fully duplicated rows explicitly: a bare expression that is
# not the last line of a cell is evaluated but never displayed.
duplicate_count = int(data.duplicated().sum())
print(f"Duplicate rows: {duplicate_count}")

# Re-assign instead of using inplace=True so the cell yields a new frame
# and can be re-run safely.
data = data.drop_duplicates()

In [80]:
# Check for garbage values
# Since there are no object-dtype columns, there are no garbage values to check for

# Step 3: Exploratory data analysis (EDA)

In [None]:
# Summary statistics of the dataset
summary_statistics = data.describe()
summary_statistics

In [None]:
#code for EDA

In [None]:
# to identify outliers

In [None]:
# scatter plots to understand relationships

In [None]:
#correlation heatmaps

In [None]:
# outlier treatment: decide based on [35] whether it is needed or not

# Step 4: Feature engineering

In [81]:
# Key features taken from the dataset
key_columns = [
    'StudentID',
    'Age',
    'Gender',
    'Ethnicity',
    'ParentalEducation',
    'StudyTimeWeekly',
    'Absences',
    'Tutoring',
    'ParentalSupport',
    'Extracurricular',
    'Sports',
    'Music',
    'Volunteering',
    'GPA',
    'GradeClass',
]
data = data[key_columns]

# Preview the first rows of the selected columns
data.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [None]:
# do label and one-hot encoding
# Age, ParentalSupport, ParentalEducation, Ethnicity and grade (GradeClass) are already stored as integer codes (numerically encoded, not one-hot columns)

In [82]:
# Creating new features

# ParentalInfluence: ParentalEducation multiplied by ParentalSupport,
# combining the two into a single metric
parental_education = data['ParentalEducation']
parental_support = data['ParentalSupport']
data['ParentalInfluence'] = parental_education.mul(parental_support)

# To be checked later against the model whether this is a good idea:
# categorizing absences into 4 levels
#data['AbsenceLevel'] = pd.cut(data['Absences'], bins=[0,3,7,14, 21], labels=['Ver Low','Low','Moderate', 'High'])

# TutoringEffect: Tutoring multiplied by StudyTimeWeekly,
# combining the two into a single metric
tutoring = data['Tutoring']
weekly_study_time = data['StudyTimeWeekly']
data['TutoringEffect'] = tutoring.mul(weekly_study_time)

# To be checked later against the model whether this is a good idea:
# categorizing StudyTimeWeekly into 3 levels
#data['StudyTimeLevel'] = pd.cut(data['StudyTimeWeekly'], bins=[0,5,10,20], labels=['Low','Medium','High'])



In [None]:
# features scaling

# Standardizing these columns keeps features with larger ranges (like StudyTimeWeekly)
# from dominating the model's learning process and making smaller-scale features (like GPA) seem less important.

# Not necessary for XGBoost or random forest.
# scaler = StandardScaler()
# num_cols = ['StudyTimeWeekly', 'Absences', 'GPA']
# data[num_cols] = scaler.fit_transform(data[num_cols])

In [92]:
from sklearn.ensemble import RandomForestClassifier

# Feature selection:
# rank the features by how much they contribute to a random forest's
# predictions of GradeClass.
X = data.drop(columns=['GradeClass'])
y = data['GradeClass']

# NOTE(review): X still contains StudentID (looks like a row identifier) and GPA
# (GradeClass is presumably derived from it) -- confirm whether either should be
# used as a feature before trusting this ranking.

# A fixed seed makes the importance ranking reproducible between runs; 42 matches
# the seed used for the train/test split.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Print feature importances, most important first
importance = (
    pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)
print(importance)


              Feature  Importance
13                GPA    0.521874
6            Absences    0.214892
0           StudentID    0.085139
5     StudyTimeWeekly    0.043836
15     TutoringEffect    0.018895
14  ParentalInfluence    0.018887
8     ParentalSupport    0.017186
1                 Age    0.014233
3           Ethnicity    0.014096
4   ParentalEducation    0.012188
10             Sports    0.007733
9     Extracurricular    0.007524
2              Gender    0.006403
11              Music    0.006350
7            Tutoring    0.005865
12       Volunteering    0.004900


# Step 5: Building the machine learning model

In [None]:
# feature selection

In [None]:
# Each import appears once (the original cell repeated all three lines).
from sklearn.model_selection import train_test_split
import wandb
from wandb.integration.keras import WandbEvalCallback, WandbMetricsLogger

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: the scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 6: Evaluating the model using Weights & Biases