In [2]:
import pandas as pd

In [3]:
# Sample Dataset
data = {
    'Name': ['Ali', 'Sara', 'John', None, 'Mary'],
    'Age': [25, None, 30, 22, None],
    'City': ['KL', 'Penang', None, 'Johor', 'KL']
    }

In [4]:
df=pd.DataFrame(data)
print('Original DataFrame: \t ')
print()
df

Original DataFrame: 	 



Unnamed: 0,Name,Age,City
0,Ali,25.0,KL
1,Sara,,Penang
2,John,30.0,
3,,22.0,Johor
4,Mary,,KL


##### **Check The Missing Values Row by Row**

In [5]:
print('Rows with Missing Values:\n')
missing_rows = df[df.isnull().any(axis=1)]
print(missing_rows)

Rows with Missing Values:

   Name   Age    City
1  Sara   NaN  Penang
2  John  30.0    None
3  None  22.0   Johor
4  Mary   NaN      KL


In [6]:
for index, row in df.iterrows():
    print(f"Row {index}: Missing = {row.isnull().any()}, Details = {row.isnull().to_dict()}")

Row 0: Missing = False, Details = {'Name': False, 'Age': False, 'City': False}
Row 1: Missing = True, Details = {'Name': False, 'Age': True, 'City': False}
Row 2: Missing = True, Details = {'Name': False, 'Age': False, 'City': True}
Row 3: Missing = True, Details = {'Name': True, 'Age': False, 'City': False}
Row 4: Missing = True, Details = {'Name': False, 'Age': True, 'City': False}


In [7]:
# Show Only Rows with Missing Data
missing_rows=df[df.isnull().any(axis=1)]
print('Rows with missing data:')
print()
print(missing_rows)

Rows with missing data:

   Name   Age    City
1  Sara   NaN  Penang
2  John  30.0    None
3  None  22.0   Johor
4  Mary   NaN      KL


In [9]:
# Stimulate Rows that Would be Dropped Using dropna()
to_drop = df[df.isnull().any(axis=1)]
print('These rows would be dropped using dropna():')
print()
print(to_drop)

These rows would be dropped using dropna():

   Name   Age    City
1  Sara   NaN  Penang
2  John  30.0    None
3  None  22.0   Johor
4  Mary   NaN      KL


In [11]:
df_cleaned = df.dropna()
print('After dropna():')
print()
print(df_cleaned)

After dropna():

  Name   Age City
0  Ali  25.0   KL


In [12]:
print('Before fillna():')
print(df)

Before fillna():
   Name   Age    City
0   Ali  25.0      KL
1  Sara   NaN  Penang
2  John  30.0    None
3  None  22.0   Johor
4  Mary   NaN      KL


In [13]:
df_filled = df.fillna({
    'Name': 'Unknown',
    'Age': df['Age'].mean(),
    'City': 'Not Available'
})

In [15]:
print('After fillna():')
print()
print(df_filled)

After fillna():

      Name        Age           City
0      Ali  25.000000             KL
1     Sara  25.666667         Penang
2     John  30.000000  Not Available
3  Unknown  22.000000          Johor
4     Mary  25.666667             KL


In [16]:
df_sfill = df.fillna({
    'Name': 'Unknown',
    'Age': round(df['Age'].mean(),2),
    'City': 'Not Available'
})

In [17]:
print('After fillna(): age upto to decimal digit')
print()
print(df_sfill)

After fillna(): age upto to decimal digit

      Name    Age           City
0      Ali  25.00             KL
1     Sara  25.67         Penang
2     John  30.00  Not Available
3  Unknown  22.00          Johor
4     Mary  25.67             KL


In [18]:
import math

In [19]:
df_new = df.fillna({
    'Name': 'Unknown',
    'Age': math.floor((df['Age'].mean()).astype(int)),
    'City': 'Not Available'
})

In [20]:
print('After fillna(): floor without decimal')
print()
print(df_new)

After fillna(): floor without decimal

      Name   Age           City
0      Ali  25.0             KL
1     Sara  25.0         Penang
2     John  30.0  Not Available
3  Unknown  22.0          Johor
4     Mary  25.0             KL


In [21]:
df_new = df.fillna({
    'Name': 'Unknown',
    'Age': math.ceil((df['Age'].mean()).astype(int)),
    'City': 'Not Available'
})

In [22]:
print('\nAfter fillna(): floor without decimal')
print(df_new)


After fillna(): floor without decimal
      Name   Age           City
0      Ali  25.0             KL
1     Sara  25.0         Penang
2     John  30.0  Not Available
3  Unknown  22.0          Johor
4     Mary  25.0             KL


##### **Data Cleaning Exercise**
##### 1. Use students_performance_dirty.csv
##### 2. Get Basic Information about Dataset
##### 3. print('Missing Values per Column:')
##### 4. Check Missing Data Line by Line
##### 5. Drop Missing Rows (if any)
##### 6. Compare Bfr and Aftr
##### 7. Fill Missing Values
##### 8. Compare Bfr and Aftr 

In [24]:
df=pd.read_csv('students_performance_dirty.csv')
print('Dataset Loaded Successfully!')
df

Dataset Loaded Successfully!


Unnamed: 0,gender,study_hours,attendance_pct,math_score,reading_score,final_score
0,male,1.7,83.6,62.0,91.0,38.9
1,,2.6,91.2,82.0,60.0,37.5
2,female,2.9,97.5,69.0,57.0,35.0
3,female,4.8,85.7,78.0,62.0,36.5
4,male,3.9,-10.0,64.0,95.0,30.9
5,male,1.4,93.7,77.0,72.0,39.0
6,male,,87.1,95.0,78.0,43.4
7,femle,,70.6,59.0,70.0,32.8
8,female,2.7,74.7,68.0,71.0,35.3
9,female,4.2,92.4,63.0,66.0,35.0


In [25]:
print('--- Dataset Info ---')
df.info

--- Dataset Info ---


<bound method DataFrame.info of     gender  study_hours  attendance_pct  math_score  reading_score  \
0     male          1.7            83.6        62.0           91.0   
1      NaN          2.6            91.2        82.0           60.0   
2   female          2.9            97.5        69.0           57.0   
3   female          4.8            85.7        78.0           62.0   
4     male          3.9           -10.0        64.0           95.0   
5     male          1.4            93.7        77.0           72.0   
6     male          NaN            87.1        95.0           78.0   
7    femle          NaN            70.6        59.0           70.0   
8   female          2.7            74.7        68.0           71.0   
9   female          4.2            92.4        63.0           66.0   
10    male          4.7            67.9        94.0           73.0   
11     NaN          4.6            86.2        63.0           78.0   
12   femle          NaN            95.0        59.0       

In [26]:
print('Missing Values per Column:')
print(df.isnull().sum())

Missing Values per Column:
gender            8
study_hours       8
attendance_pct    0
math_score        0
reading_score     0
final_score       0
dtype: int64


In [27]:
# Missing Data Line by Line
for index, row in df.iterrows():
    if row.isnull().any():
        print(f'Row {index}: Missing Data -> {row.isnull().to_dict()}')

Row 1: Missing Data -> {'gender': True, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 6: Missing Data -> {'gender': False, 'study_hours': True, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 7: Missing Data -> {'gender': False, 'study_hours': True, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 11: Missing Data -> {'gender': True, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 12: Missing Data -> {'gender': False, 'study_hours': True, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 13: Missing Data -> {'gender': False, 'study_hours': True, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 20: Missing Data -> {'gender': False, 'study_hours': True, 'attendanc

In [28]:
df_dropped = df.dropna()

In [29]:
print('Shape Before dropna:', df.shape)
print('Shape After dropna:', df_dropped.shape)

Shape Before dropna: (60, 6)
Shape After dropna: (45, 6)


In [30]:
df

Unnamed: 0,gender,study_hours,attendance_pct,math_score,reading_score,final_score
0,male,1.7,83.6,62.0,91.0,38.9
1,,2.6,91.2,82.0,60.0,37.5
2,female,2.9,97.5,69.0,57.0,35.0
3,female,4.8,85.7,78.0,62.0,36.5
4,male,3.9,-10.0,64.0,95.0,30.9
5,male,1.4,93.7,77.0,72.0,39.0
6,male,,87.1,95.0,78.0,43.4
7,femle,,70.6,59.0,70.0,32.8
8,female,2.7,74.7,68.0,71.0,35.3
9,female,4.2,92.4,63.0,66.0,35.0


In [31]:
df_filled = df.fillna({
    'gender': 'Unknown',
    'study_hours': df['study_hours'].mean()
   })

In [32]:
df_filled

Unnamed: 0,gender,study_hours,attendance_pct,math_score,reading_score,final_score
0,male,1.7,83.6,62.0,91.0,38.9
1,Unknown,2.6,91.2,82.0,60.0,37.5
2,female,2.9,97.5,69.0,57.0,35.0
3,female,4.8,85.7,78.0,62.0,36.5
4,male,3.9,-10.0,64.0,95.0,30.9
5,male,1.4,93.7,77.0,72.0,39.0
6,male,3.413462,87.1,95.0,78.0,43.4
7,femle,3.413462,70.6,59.0,70.0,32.8
8,female,2.7,74.7,68.0,71.0,35.3
9,female,4.2,92.4,63.0,66.0,35.0


In [33]:
for i in range(len(df)):
    print(f"\nRow {i} Before: {df.iloc[i].to_dict()}")
    print(f"Row {i} After : {df_filled.iloc[i].to_dict()}")


Row 0 Before: {'gender': 'male', 'study_hours': 1.7, 'attendance_pct': 83.6, 'math_score': 62.0, 'reading_score': 91.0, 'final_score': 38.9}
Row 0 After : {'gender': 'male', 'study_hours': 1.7, 'attendance_pct': 83.6, 'math_score': 62.0, 'reading_score': 91.0, 'final_score': 38.9}

Row 1 Before: {'gender': nan, 'study_hours': 2.6, 'attendance_pct': 91.2, 'math_score': 82.0, 'reading_score': 60.0, 'final_score': 37.5}
Row 1 After : {'gender': 'Unknown', 'study_hours': 2.6, 'attendance_pct': 91.2, 'math_score': 82.0, 'reading_score': 60.0, 'final_score': 37.5}

Row 2 Before: {'gender': 'female', 'study_hours': 2.9, 'attendance_pct': 97.5, 'math_score': 69.0, 'reading_score': 57.0, 'final_score': 35.0}
Row 2 After : {'gender': 'female', 'study_hours': 2.9, 'attendance_pct': 97.5, 'math_score': 69.0, 'reading_score': 57.0, 'final_score': 35.0}

Row 3 Before: {'gender': 'female', 'study_hours': 4.8, 'attendance_pct': 85.7, 'math_score': 78.0, 'reading_score': 62.0, 'final_score': 36.5}
Row