In [5]:
# Step 1: Import libraries
import pandas as pd

# Step 2: Fetch the Students Performance CSV straight from GitHub into a DataFrame
url = "https://raw.githubusercontent.com/rashida048/Datasets/refs/heads/master/StudentsPerformance.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Step 3: Report (rows, columns), then leave the leading rows as the cell's
# last expression so the notebook renders them as a table
print("Dataset shape:", df.shape)
df.head(n=5)


Dataset shape: (1000, 8)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
# Step 1: Count missing entries per column
print(df.isna().sum())

# Step 2: Inspect the dtype of every column
print("\nData Types:\n", df.dtypes)

# Step 3: One-hot encode the categorical columns.
# drop_first=True drops one dummy level per category, so each categorical
# column is represented by (levels - 1) indicator columns.
df_encoded = pd.get_dummies(df, drop_first=True)

# Step 4: List the columns produced by the encoding
print("\nEncoded Columns:\n", df_encoded.columns)

# Step 5: Split into features (X) and target (y).
# The regression target is the math score; every other encoded column is a feature.
target_col = 'math score'
y = df_encoded[target_col]
X = df_encoded.drop(columns=target_col)

# Sanity check on the resulting dimensions
print("\nFeatures shape:", X.shape)
print("Target shape:", y.shape)


gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

Data Types:
 gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

Encoded Columns:
 Index(['math score', 'reading score', 'writing score', 'gender_male',
       'race/ethnicity_group B', 'race/ethnicity_group C',
       'race/ethnicity_group D', 'race/ethnicity_group E',
       'parental level of education_bachelor's degree',
       'parental level of education_high school',
       'parental level of education_master's degree',
       'parental level o

In [7]:
# Step 1: Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Step 2: Hold out 20% of the rows for evaluation; the fixed random_state keeps
# the split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: Fit an ordinary least-squares model on the training portion
# (fit() returns the estimator itself, so construction and fitting can be chained)
model = LinearRegression().fit(X_train, y_train)

# Step 4: Generate predictions for the held-out rows
y_pred = model.predict(X_test)

# Step 5: Score the predictions against the true held-out targets
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R² Score:", r2)


Mean Squared Error: 29.095169866715487
R² Score: 0.8804332983749565


In [8]:
# Collect each feature's fitted linear-regression coefficient in one table
coeff_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_
})

# Rank by magnitude rather than signed value: a strongly negative coefficient
# moves the prediction as much as a strongly positive one, so sorting on the raw
# value would push the most negative features to the bottom of the table.
# NOTE: the features are not standardized (0/1 dummy columns sit next to raw
# score columns), so magnitudes are per-unit effects, not a scale-free importance.
coeff_df.sort_values(by='Coefficient', key=abs, ascending=False)


Unnamed: 0,Feature,Coefficient
2,gender_male,13.064884
6,race/ethnicity_group E,4.892649
12,lunch_standard,3.510075
13,test preparation course_none,3.289642
10,parental level of education_some college,0.998856
8,parental level of education_high school,0.929312
11,parental level of education_some high school,0.75647
1,writing score,0.724148
3,race/ethnicity_group B,0.359323
0,reading score,0.236023


# 🎓 Predictive Analysis using Machine Learning
## Student Math Score Prediction using Linear Regression


### 📌 Objective
Build a regression model to predict students' math scores based on demographic and academic factors.

### 🧪 Dataset
- Source: [GitHub - Students Performance](https://raw.githubusercontent.com/rashida048/Datasets/refs/heads/master/StudentsPerformance.csv)
- 1,000 records and 8 columns: gender, race/ethnicity, parental level of education, lunch type, test preparation course, and math, reading, and writing scores.

### 🔍 Model Used
- Linear Regression (Scikit-learn)

### 📈 Model Performance
- R² Score: 0.88
- Mean Squared Error: 29.10

### 🔎 Key Insights
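- In the fitted model, **reading and writing scores**, **test preparation**, and **parental education** are all associated with math scores. The notebook reports regression coefficients only, so these are associations, not tested or causal effects.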
