In [13]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Pull dataset id 320 from the UCI ML repository and unpack its two frames:
# X holds the feature columns, y holds the target column(s).
# NOTE(review): the output recorded for this cell begins with
# "Fetching dataset...", which nothing here prints — the saved output looks
# stale; re-run the notebook top to bottom to confirm.
student_performance = fetch_ucirepo(id=320)
dataset_tables = student_performance.data
X, y = dataset_tables.features, dataset_tables.targets

# Preview the raw (un-encoded) features.
print("\nFirst 5 rows of features:", X.head(), sep="\n")

Fetching dataset...

First 5 rows of features:
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  higher internet  romantic  famrel  freetime goout Dalc Walc health absences  
0    yes       no        no       4         3     4    1    1      3        4  
1    yes      yes        no       5         3     3    1    1      3        2  
2    yes      yes        no       4         3     2    2    3      3        6  
3    yes      yes       yes       3         2     2    1    1      5        0  
4    yes       no        no       4         3     2    1    2     

In [10]:
# Object- and category-dtype columns are the ones that get one-hot encoded.
categorical_frame = X.select_dtypes(include=['object', 'category'])
categorical_cols = list(categorical_frame.columns)

# drop_first=True removes the first level of every encoded column, so a
# k-level column becomes k-1 indicator columns.
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Remember the exact encoded column order alongside the encoded frame.
model_columns = X_encoded.columns.tolist()

print("New column names:", X_encoded.columns.tolist(), sep="\n")
print("\nFirst 5 rows of encoded data:", X_encoded.head(), sep="\n")

# Use 'G3' as the regression target when the targets frame has it;
# otherwise fall back to whatever the first target column is.
if 'G3' in y.columns:
    target_col = 'G3'
else:
    target_col = y.columns[0]
y_target = y[target_col]

New column names:
['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'school_MS', 'sex_M', 'address_U', 'famsize_LE3', 'Pstatus_T', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_mother', 'guardian_other', 'schoolsup_yes', 'famsup_yes', 'paid_yes', 'activities_yes', 'nursery_yes', 'higher_yes', 'internet_yes', 'romantic_yes']

First 5 rows of encoded data:
   age  Medu  Fedu  traveltime  studytime  failures  famrel  freetime  goout  \
0   18     4     4           2          2         0       4         3      4   
1   17     1     1           1          2         0       5         3      3   
2   15     1     1           1          2         0       4         3      2   
3   15     4     2           1          3         0       3         2      2   
4   16     3     3   

In [11]:
# Hold out 20% of the rows for evaluation. The fixed random_state keeps both
# the split and the forest reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_target, test_size=0.2, random_state=42
)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score on the held-out rows so the test split is actually used and the
# model that gets saved has a recorded quality measure. For regressors,
# .score() returns the coefficient of determination (R^2).
print(f"Hold-out R^2: {rf_model.score(X_test, y_test):.3f}")

In [12]:
# Bundle the fitted model with the encoded column list and pickle both into
# a single file. Presumably the column list is stored so a consumer can
# rebuild the same feature layout before predicting — verify against the
# loading code.
model_bundle = dict(model=rf_model, columns=model_columns)

with open('model.pkl', mode='wb') as bundle_file:
    pickle.dump(model_bundle, bundle_file)