In [1]:
pip install pandas



In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("/content/StudentsPerformance.csv")

In [4]:
print(df)

     gender race/ethnicity parental level of education         lunch  \
0    female        group B           bachelor's degree      standard   
1    female        group C                some college      standard   
2    female        group B             master's degree      standard   
3      male        group A          associate's degree  free/reduced   
4      male        group C                some college      standard   
..      ...            ...                         ...           ...   
995  female        group E             master's degree      standard   
996    male        group C                 high school  free/reduced   
997  female        group C                 high school  free/reduced   
998  female        group D                some college      standard   
999  female        group D                some college  free/reduced   

    test preparation course  math score  reading score  writing score  
0                      none          72             72         

In [5]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,
1,female,group C,some college,standard,completed,69,90,88.0
2,female,group B,master's degree,standard,none,90,95,93.0
3,male,group A,associate's degree,free/reduced,none,47,57,44.0
4,male,group C,some college,standard,none,76,78,75.0


In [6]:
df.isnull().sum()

Unnamed: 0,0
gender,0
race/ethnicity,0
parental level of education,0
lunch,0
test preparation course,0
math score,0
reading score,0
writing score,1


In [7]:
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,999.0
mean,66.089,69.169,68.048048
std,15.16308,14.600192,15.202102
min,0.0,17.0,10.0
25%,57.0,59.0,57.5
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [8]:
var_desp = {
    "gender": "Gender of student",
    "race/ethnicity":"Ethnic group of student",
    "parental level of education": "Highest education level of the student's parents",
    "lunch": "Type of lunch student receives",
    "test preparation course": "Whether the student completed a test prep course",
    "math score": "Score achieved in maths",
    "reading score": "Score achieved in reading",
    "writing score": "Score achieved in writing"
}

In [9]:
print("Variable Description: ")
for var,desc in var_desp.items():
    print(f"{var},{desc}")

Variable Description: 
gender,Gender of student
race/ethnicity,Ethnic group of student
parental level of education,Highest education level of the student's parents
lunch,Type of lunch student receives
test preparation course,Whether the student completed a test prep course
math score,Score achieved in maths
reading score,Score achieved in reading
writing score,Score achieved in writing


In [10]:
df.dtypes

Unnamed: 0,0
gender,object
race/ethnicity,object
parental level of education,object
lunch,object
test preparation course,object
math score,int64
reading score,int64
writing score,float64


In [11]:
df.shape

(1000, 8)

In [12]:
# Convert object columns to category type
categorical_columns = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
for col in categorical_columns:
    df[col] = df[col].astype('category')

In [13]:
# Ensure scores are numeric
numeric_columns = ['math score', 'reading score', 'writing score']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col])
#📌 This ensures all columns are in correct format.

In [14]:
#If normalization is needed:

# Min-Max Normalization
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[['math score', 'reading score', 'writing score']] = scaler.fit_transform(df[['math score', 'reading score', 'writing score']])

In [15]:
#Using one-hot encoding:

df_encoded = pd.get_dummies(df, drop_first=True)
#📌 This converts categories (like gender: male/female) into 0/1 numeric form.

In [16]:
print(df_encoded)

     math score  reading score  writing score  gender_male  \
0          0.72       0.662651            NaN        False   
1          0.69       0.879518       0.866667        False   
2          0.90       0.939759       0.922222        False   
3          0.47       0.481928       0.377778         True   
4          0.76       0.734940       0.722222         True   
..          ...            ...            ...          ...   
995        0.88       0.987952       0.944444        False   
996        0.62       0.457831       0.500000         True   
997        0.59       0.650602       0.611111        False   
998        0.68       0.734940       0.744444        False   
999        0.77       0.831325       0.844444        False   

     race/ethnicity_group B  race/ethnicity_group C  race/ethnicity_group D  \
0                      True                   False                   False   
1                     False                    True                   False   
2                 

In [17]:
df.dropna(axis=1)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score
0,female,group B,bachelor's degree,standard,none,0.72,0.662651
1,female,group C,some college,standard,completed,0.69,0.879518
2,female,group B,master's degree,standard,none,0.90,0.939759
3,male,group A,associate's degree,free/reduced,none,0.47,0.481928
4,male,group C,some college,standard,none,0.76,0.734940
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,0.88,0.987952
996,male,group C,high school,free/reduced,none,0.62,0.457831
997,female,group C,high school,free/reduced,completed,0.59,0.650602
998,female,group D,some college,standard,completed,0.68,0.734940
