In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import numpy as np

# Path to the StudentsPerformance CSV, relative to the working directory.
file_path = 'data/StudentsPerformance.csv'

# Parse the CSV into a DataFrame with pandas' default options.
df = pd.read_csv(filepath_or_buffer=file_path)

# Bare trailing expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
963,female,group C,some high school,free/reduced,completed,65,76,75
964,male,group D,some college,standard,none,72,57,58
965,female,group D,some college,standard,none,62,70,72
966,male,group A,some high school,standard,completed,66,68,64


In [3]:
# Column to predict; every other column of `df` is used as a feature.
TARGET = "math score"

X = df.drop(columns=[TARGET])
y = df[TARGET]

# Split features by numeric-ness rather than by `object` dtype, so text columns
# are treated as categorical however pandas stores them (object, string or
# category dtype) and never reach the scaler.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

# Standardize the numeric features and one-hot encode the remaining ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        # Categories not seen during fit are encoded as all zeros instead of
        # raising, so a fold whose training split lacks a category is still scored.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Preprocessing and the regressor are bundled so each CV fold fits the scaler
# and encoder on its own training split only.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# 10-fold cross-validation. The scorer returns negated MAE (higher is better),
# so the sign is flipped for reporting.
cv_scores = cross_val_score(model_pipeline, X, y, cv=10, scoring='neg_mean_absolute_error')

cv_mae_mean = -cv_scores.mean()
# Standard error of the mean across folds; ddof=1 uses the sample standard
# deviation (numpy's default ddof=0 understates it for a handful of folds).
cv_mae_sem = cv_scores.std(ddof=1) / np.sqrt(len(cv_scores))
print(f"CV Mean MAE: {cv_mae_mean:.2f} +- {cv_mae_sem:.2f}")

CV Mean MAE: 4.31 +- 0.11


In [None]:
import pickle

# Fit the pipeline on the full dataset (X, y) before persisting it.
model_pipeline.fit(X, y)

# The file name is derived from the target column name.
model_path = f"{TARGET}_model.pkl"

# Serialize the fitted pipeline to disk.
with open(model_path, 'wb') as file:
    pickle.dump(model_pipeline, file)

# Reload the pipeline from the file just written, rebinding `model_pipeline`.
# NOTE: pickle.load can execute arbitrary code; only unpickle files you trust.
with open(model_path, 'rb') as file:
    model_pipeline = pickle.load(file)