# ETL Part

In [1]:

import os

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the raw CSV. Set the STUDENT_PERF_INPUT_CSV environment variable to
# run the notebook on another machine; the default is the original hard-coded path.
input_path = os.environ.get(
    'STUDENT_PERF_INPUT_CSV',
    'C:/Users/prana/OneDrive/Desktop/New folder/Zaalima Dev/project1/kafka-student-performance/StudentPerformanceFactors.csv',
)
df_raw = pd.read_csv(input_path)

## === 📥 EXTRACT ===

In [3]:
# Confirm the load worked: report the frame's (rows, columns) shape
# and preview its first five rows.
print("✅ Data Extracted")
print("✅Shape of Raw Data:", df_raw.shape)
display(df_raw.head(n=5))

✅ Data Extracted
✅Shape of Raw Data: (6607, 20)


Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


## === 🔁 TRANSFORM ===

In [4]:
# Collect the labels of the int64 / float64 columns; these are the columns
# that are later imputed with the mean and standardized.
num_cols = (
    df_raw
    .select_dtypes(include=['int64', 'float64'])
    .columns
)
print(" Numerical Columns:", num_cols.tolist())


 Numerical Columns: ['Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores', 'Tutoring_Sessions', 'Physical_Activity', 'Exam_Score']


In [5]:
# Collect the labels of the object-dtype columns; these are treated as the
# categorical columns in the imputation and encoding steps.
cat_cols = (
    df_raw
    .select_dtypes(include='object')
    .columns
)
print("Categorical Columns:", cat_cols.tolist())

Categorical Columns: ['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 'Gender']


In [6]:
# Fill in missing values, in place on df_raw, one column group at a time.

# One imputer per group: column means for the numeric columns,
# the most frequent value for the categorical columns.
imputer_num = SimpleImputer(strategy='mean')
imputer_cat = SimpleImputer(strategy='most_frequent')

# The numeric result comes back as floats, so integer columns become
# float columns (e.g. 23 is stored as 23.0 afterwards).
df_raw[num_cols] = imputer_num.fit_transform(df_raw[num_cols])
df_raw[cat_cols] = imputer_cat.fit_transform(df_raw[cat_cols])

# Show the per-column count of remaining nulls.
print("✅ Missing values handled")
print(df_raw.isna().sum())


✅ Missing values handled
Hours_Studied                 0
Attendance                    0
Parental_Involvement          0
Access_to_Resources           0
Extracurricular_Activities    0
Sleep_Hours                   0
Previous_Scores               0
Motivation_Level              0
Internet_Access               0
Tutoring_Sessions             0
Family_Income                 0
Teacher_Quality               0
School_Type                   0
Peer_Influence                0
Physical_Activity             0
Learning_Disabilities         0
Parental_Education_Level      0
Distance_from_Home            0
Gender                        0
Exam_Score                    0
dtype: int64


In [7]:
# Encode categorical columns using label encoding

# Keep one fitted encoder per column so every integer code can be mapped back to
# its category later (label_encoders[col].classes_ / .inverse_transform).
# Re-fitting one shared encoder would keep only the last column's mapping.
label_encoders = {}

# NOTE(review): LabelEncoder numbers categories in sorted (alphabetical) order,
# so ordered levels are not encoded in rank order (High=0, Low=1, Medium=2).
# Confirm this is acceptable for downstream use.
le = LabelEncoder()  # keeps `le` defined even when cat_cols is empty
for col in cat_cols:
    le = LabelEncoder()  # fresh encoder per column
    df_raw[col] = le.fit_transform(df_raw[col])
    label_encoders[col] = le

print("✅ Categorical columns encoded")
display(df_raw.head())


✅ Categorical columns encoded


Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23.0,84.0,1,0,0,7.0,73.0,1,1,0.0,1,2,1,2,3.0,0,1,2,1,67.0
1,19.0,64.0,1,2,0,8.0,59.0,1,1,2.0,2,2,1,0,4.0,0,0,1,0,61.0
2,24.0,98.0,2,2,1,7.0,91.0,2,1,2.0,2,2,1,1,4.0,0,2,2,1,74.0
3,29.0,89.0,1,2,1,8.0,98.0,2,1,1.0,2,2,1,0,4.0,0,1,1,1,71.0
4,19.0,92.0,2,2,1,6.0,65.0,2,1,3.0,2,0,1,1,4.0,0,0,2,0,70.0


In [8]:
# Standardize the numerical features (zero mean, unit variance per column)

# The fitted scaler is kept so the same transform can be reused or inverted later.
scaler = StandardScaler()

# NOTE(review): num_cols includes Exam_Score, so the target is standardized
# together with the features. Confirm that this is intended.
df_raw[num_cols] = scaler.fit_transform(df_raw[num_cols])

print("✅ Numerical features scaled")
display(df_raw[num_cols].head())


✅ Numerical features scaled


Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Exam_Score
0,0.504942,0.348375,-0.019796,-0.1438,-1.213934,0.031411,-0.060578
1,-0.162822,-1.383736,0.661399,-1.11611,0.411451,1.001199,-1.602931
2,0.671882,1.560853,-0.019796,1.106313,0.411451,1.001199,1.738833
3,1.506587,0.781403,0.661399,1.592469,-0.401242,1.001199,0.967657
4,-0.162822,1.04122,-0.70099,-0.699406,1.224144,1.001199,0.710598


## === 💾 LOAD ===

In [9]:


# Save the cleaned/transformed data to a new CSV file.
# Set the STUDENT_PERF_OUTPUT_CSV environment variable to write somewhere else;
# the default is the original hard-coded destination.
output_path = os.environ.get(
    'STUDENT_PERF_OUTPUT_CSV',
    'C:/Users/prana/OneDrive/Desktop/New folder/Zaalima Dev/project1/kafka-student-performance/Transformed_StudentPerformance.csv',
)
# index=False keeps the DataFrame's row index out of the written file.
df_raw.to_csv(output_path, index=False)

print(f"✅ Transformed data saved to '{output_path}'")


✅ Transformed data saved to 'C:/Users/prana/OneDrive/Desktop/New folder/Zaalima Dev/project1/kafka-student-performance/Transformed_StudentPerformance.csv'
