In [1]:
import kagglehub
# Fetch the latest published version of the dataset through kagglehub and
# report the local directory it was placed in.
dataset_handle = "spscientist/students-performance-in-exams"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/spscientist/students-performance-in-exams/versions/1


In [3]:
import os
import pandas as pd
# Build the CSV location inside the downloaded dataset directory, then load it.
csv_file = os.path.join(path, "StudentsPerformance.csv")
df = pd.read_csv(csv_file)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# List the column labels of the loaded frame.
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [6]:
# Count missing values per column (`isna` is the method `isnull` aliases).
df.isna().sum()

Unnamed: 0,0
gender,0
race/ethnicity,0
parental level of education,0
lunch,0
test preparation course,0
math score,0
reading score,0
writing score,0


In [21]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [11]:
# The math score is the regression target; every other column is a feature.
target_col = 'math score'
y = df[target_col]
X = df.drop(columns=[target_col])

In [12]:
# Show the dtype of each column; used below to separate categorical from
# numeric features.
df.dtypes

Unnamed: 0,0
gender,object
race/ethnicity,object
parental level of education,object
lunch,object
test preparation course,object
math score,int64
reading score,int64
writing score,int64


In [14]:
# Split the feature columns by kind so each group gets its own transformer.
# Selecting on "number" (rather than on "object") keeps the split independent
# of whether pandas stores text columns as object or as a dedicated string
# dtype; for this dataset both approaches yield the same two groups.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

In [15]:
# Column-wise preprocessing: one-hot encode the categorical features and
# standardize the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore' encodes a category that was not present in
        # the training data as all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ])

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Random-forest pipeline: preprocessing followed by the regressor.
# random_state is fixed so that refitting reproduces the same forest (and
# therefore the same metrics and saved artifact); it uses the same seed value
# as the train/test split.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(random_state=42))])

In [18]:
# Fit the preprocessing + random-forest pipeline on the training split.
model.fit(X_train,y_train)

In [25]:
# Score the fitted random-forest pipeline on the held-out split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
for label, score in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(label, score)

Mean Squared Error: 35.1013161301729
R-squared: 0.855751019443816


In [26]:
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

# Linear-regression baseline. The ColumnTransformer is cloned so this pipeline
# owns its own unfitted copy with the same parameters; passing the same
# instance would make fitting `model1` refit the preprocessor object that the
# `model` pipeline already holds.
model1 = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                         ('regressor', LinearRegression())])

In [27]:
# Fit the preprocessing + linear-regression pipeline on the training split.
model1.fit(X_train,y_train)

In [28]:
# Score the linear-regression pipeline on the same held-out split.
# Note: `mse` and `r2` are rebound here, replacing any earlier values.
y_pred_1 = model1.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred_1), r2_score(y_test, y_pred_1)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 29.095169866715523
R-squared: 0.8804332983749563


In [29]:
import joblib
# Persist the fitted random-forest pipeline (preprocessing included) to disk.
# NOTE(review): in the recorded run the linear pipeline (`model1`) scored
# better on the test split than `model`; confirm `model` is the one intended
# for export.
joblib.dump(value=model, filename='student_performance.pkl')

['student_performance.pkl']

In [30]:
# Shell out to confirm the pickle was written and check its size on disk.
!ls -lh student_performance.pkl

-rw-r--r-- 1 root root 6.4M Feb  6 14:37 student_performance.pkl


In [31]:
# Download the saved model file to the local machine through the browser.
# This relies on the `google.colab` package, so the cell only works when the
# notebook is running inside Google Colab.
from google.colab import files
files.download('student_performance.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [32]:
# NOTE(review): 'your-username/your-repo' looks like a template placeholder,
# and the recorded output shows this clone failing. Replace it with the real
# repository URL or remove this cell before sharing the notebook.
!git clone https://github.com/your-username/your-repo.git

Cloning into 'your-repo'...
fatal: could not read Username for 'https://github.com': No such device or address
