In [2]:
# Step 1: Import Libraries

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from sklearn.metrics import mean_squared_error, accuracy_score

from sklearn.preprocessing import LabelEncoder

# Step 2: Load Dataset
# The CSV location lives in a named constant so it is easy to find and change.
DATA_PATH = "StudentsPerformance.csv"
df = pd.read_csv(DATA_PATH)

In [4]:

# Step 3: Explore Data
# Quick look at the freshly loaded frame: the first rows, the per-column
# dtype / non-null report, and the describe() summary table.
print("First 5 rows:")
print(df.head())

print("\nInfo: ")
# DataFrame.info() writes its report directly to stdout and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()

print("\nSummary Statistics:")
print(df.describe())



First 5 rows:
   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  

Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                     

In [6]:
# Step 4: Preprocessing - convert categorical (object-dtype) columns to numeric codes.
#
# Each column gets its own LabelEncoder, stored in `encoders` keyed by column
# name. Re-fitting one shared encoder would keep only the last column's
# classes_, making it impossible to map the integer codes of the other
# columns back to their original labels (encoders[col].inverse_transform).
#
# NOTE(review): the scikit-learn docs describe LabelEncoder as a target (y)
# encoder; the integer codes impose an arbitrary order on the categories.
# Consider OrdinalEncoder / OneHotEncoder for feature columns - confirm
# whether that matters for the models trained on this frame.
le = LabelEncoder()  # keeps `le` bound even when no object columns are found
encoders = {}
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [8]:
# Step 5: Choose Target and Features
# Example: predict the math score from every other column in the frame.
TARGET_COL = 'math score'
y = df[TARGET_COL]
x = df.drop(columns=[TARGET_COL])

In [9]:
# Step 6: Split Train/Test
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Step 7: Train Linear Regression
# fit() returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [12]:
# Step 8: Evaluate (Regression)
# RMSE is the square root of the mean squared error on the held-out test set.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("\nLinear Regression RMSE:", rmse)


Linear Regression RMSE: 5.317450940660133


In [13]:

# Step 9: Try Decision Tree & Random Forest
# Both models are seeded so their fits are repeatable; each is built and
# fitted in one chained expression (fit() returns the estimator).
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions on the held-out test set.
y_pred_dt = dt.predict(X_test)
y_pred_rf = rf.predict(X_test)

# Test-set RMSE for each model.
rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("Decision Tree RMSE:", rmse_dt)
print("Random Forest RMSE:", rmse_rf)

Decision Tree RMSE: 7.801281945936834
Random Forest RMSE: 6.065824068029948


In [16]:
# Step 10: Cross-validation for reliability
# The fold count lives in one constant so the printed label cannot drift
# away from the value actually passed to cross_val_score.
N_FOLDS = 5
scores = cross_val_score(rf, x, y, cv=N_FOLDS, scoring='neg_root_mean_squared_error')
# The 'neg_root_mean_squared_error' scorer returns negated RMSE values,
# so the sign is flipped to report a positive RMSE.
print("\nRandom Forest {}-fold CV RMSE:".format(N_FOLDS), -scores.mean())

/n Random Forest S-fold CV RMSE: 6.116430011483926
