In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Banner fragment printed on both sides of each section title
separator = "_" * 20

# Raw student records. The columns contain missing entries (None / NaN) and
# one non-numeric age ("twenty"), which the later cells convert and fill.
data = {
    "Student_ID": [
        "S1", "S2", "S3",
        "S4", "S5", "S6",
        "S7", "S8", "S9",
    ],
    "Age": [
        20, 21, None,
        19, 22, "twenty",
        23, 24, None,
    ],
    "Score": [
        85, None, 78,
        90, 88, 92,
        np.nan, 87, 80,
    ],
    "Hours_Student": [
        10, 15, 7,
        None, 12, 9,
        14, 11, 8,
    ],
}

# Build the working frame, then show the banner followed by the full table.
df = pd.DataFrame(data)
print(f"{separator} DataFrame: {separator}", df, sep="\n")

____________________ DataFrame: ____________________
  Student_ID     Age  Score  Hours_Student
0         S1      20   85.0           10.0
1         S2      21    NaN           15.0
2         S3    None   78.0            7.0
3         S4      19   90.0            NaN
4         S5      22   88.0           12.0
5         S6  twenty   92.0            9.0
6         S7      23    NaN           14.0
7         S8      24   87.0           11.0
8         S9    None   80.0            8.0


In [2]:
# Dimensions of the frame, shown as the (rows, columns) tuple under its banner
print(f"{separator} Shape: {separator}", df.shape, sep="\n")

____________________ Shape: ____________________
(9, 4)


In [3]:
# Structural summary of the frame: index range, per-column non-null counts,
# dtypes and memory usage.
print(f"{separator} Dataframe Summary: {separator}")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() appends a stray "None" line.
df.info()

____________________ Dataframe Summary: ____________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Student_ID     9 non-null      object 
 1   Age            7 non-null      object 
 2   Score          7 non-null      float64
 3   Hours_Student  8 non-null      float64
dtypes: float64(2), object(2)
memory usage: 420.0+ bytes
None


In [4]:
# Count the missing entries in each column and show the tally under its banner
missing_per_column = df.isna().sum()
print(f"{separator} Missing Values: {separator}")
print(missing_per_column)

____________________ Missing Values: ____________________
Student_ID       0
Age              2
Score            2
Hours_Student    1
dtype: int64


In [5]:
# Storage dtype of every column, shown under its banner
print(f"{separator} Data types: {separator}", df.dtypes, sep="\n")

____________________ Data types: ____________________
Student_ID        object
Age               object
Score            float64
Hours_Student    float64
dtype: object


In [6]:
# Data type conversion
# errors='coerce' replaces any value that cannot be parsed as a number with NaN
# instead of raising; for Age this turns the string "twenty" into a missing
# value, so the column's non-null count drops by one after this cell.
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Score'] = pd.to_numeric(df['Score'], errors='coerce')
print(f"{separator} Converting Age and Score Columns to numeric type: {separator}")
# DataFrame.info() prints its own report and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" line to the output.
df.info()

____________________ Converting Age and Score Columns to numeric type: ____________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Student_ID     9 non-null      object 
 1   Age            6 non-null      float64
 2   Score          7 non-null      float64
 3   Hours_Student  8 non-null      float64
dtypes: float64(3), object(1)
memory usage: 420.0+ bytes
None


In [7]:
# Mean imputation: each gap in a numeric column is replaced by the average of
# that same column's non-missing values. Columns are handled one at a time, in
# the order Age, Score, Hours_Student.
for column in ("Age", "Score", "Hours_Student"):
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)

# Confirm that no column has missing entries left.
remaining_missing = df.isnull().sum()
print(f"{separator} Missing Values after treating missing values: {separator}")
print(remaining_missing)

____________________ Missing Values after treating missing values: ____________________
Student_ID       0
Age              0
Score            0
Hours_Student    0
dtype: int64
