In [1]:
import pandas as pd

In [2]:
# Load the dataset:
# Read the Trestle Academy CSV export into a DataFrame, then show the first
# five rows as a quick look at the columns and the values they hold.

df = pd.read_csv(filepath_or_buffer='trestle_academy_dataset.csv')

df.head(n=5)

Unnamed: 0,student_id,name,age,gender,course,enrollment_date,final_grade,is_intern
0,S0001,Student_1,30,Female,Data Science,2023-03-22,97,No
1,S0002,Student_2,33,Female,Data Science,2023-01-29,64,No
2,S0003,Student_3,39,Female,Data Engineering,2023-12-24,97,No
3,S0004,Student_4,18,Male,Data Engineering,2023-01-09,89,Yes
4,S0005,Student_5,21,Female,Cloud Computing,2023-05-26,65,No


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,final_grade
count,1000.0,1000.0
mean,30.731,75.766
std,7.857887,15.195004
min,18.0,50.0
25%,24.0,62.0
50%,31.0,76.0
75%,37.0,89.0
max,44.0,100.0


In [4]:
# Identify and handle missing values:
# Drop every row that contains at least one missing value.
#
# The explicit .copy() makes df_cleaned an independent frame. Without it,
# dropna() may hand back an object that pandas tracks as derived from df, and
# later column edits on df_cleaned can then raise SettingWithCopyWarning.

df_cleaned = df.dropna().copy()

# If the counts match those of df.describe(), no rows were dropped.
df_cleaned.describe()

Unnamed: 0,age,final_grade
count,1000.0,1000.0
mean,30.731,75.766
std,7.857887,15.195004
min,18.0,50.0
25%,24.0,62.0
50%,31.0,76.0
75%,37.0,89.0
max,44.0,100.0


In [5]:
# Standardize data types:
# Ensure every column has the correct dtype: age and final_grade as integers,
# enrollment_date as datetime, gender and course as categoricals.
#
# The conversions are chained into a new frame that is bound back to
# df_cleaned instead of being assigned column by column. Nothing is written
# into a frame that pandas may consider a slice of df (so no
# SettingWithCopyWarning), and the cell gives the same result when re-run.

df_cleaned = (
    df_cleaned
    # Parse the date strings first; assign() keeps the column in place.
    .assign(enrollment_date=lambda frame: pd.to_datetime(frame['enrollment_date']))
    # One astype call with a column -> dtype mapping covers the rest.
    .astype({
        'age': int,
        'final_grade': int,
        'gender': 'category',
        'course': 'category',
    })
)

df_cleaned.head()

Unnamed: 0,student_id,name,age,gender,course,enrollment_date,final_grade,is_intern
0,S0001,Student_1,30,Female,Data Science,2023-03-22,97,No
1,S0002,Student_2,33,Female,Data Science,2023-01-29,64,No
2,S0003,Student_3,39,Female,Data Engineering,2023-12-24,97,No
3,S0004,Student_4,18,Male,Data Engineering,2023-01-09,89,Yes
4,S0005,Student_5,21,Female,Cloud Computing,2023-05-26,65,No


In [6]:
# Normalize text data:
# Standardize text columns for consistency: course names are stripped and
# lower-cased (use .str.title() instead for title case), name and gender are
# stripped and title-cased.
#
# The .str methods return plain object columns even when the input is
# categorical, which would silently undo the earlier category conversion of
# course and gender. Both are therefore cast back to 'category' after
# normalization, so labels that differed only by case or whitespace collapse
# into a single category.

df_cleaned = df_cleaned.assign(
    course=lambda frame: frame['course'].str.strip().str.lower().astype('category'),
    name=lambda frame: frame['name'].str.strip().str.title(),
    gender=lambda frame: frame['gender'].str.strip().str.title().astype('category'),
)

df_cleaned.head()

Unnamed: 0,student_id,name,age,gender,course,enrollment_date,final_grade,is_intern
0,S0001,Student_1,30,Female,data science,2023-03-22,97,No
1,S0002,Student_2,33,Female,data science,2023-01-29,64,No
2,S0003,Student_3,39,Female,data engineering,2023-12-24,97,No
3,S0004,Student_4,18,Male,data engineering,2023-01-09,89,Yes
4,S0005,Student_5,21,Female,cloud computing,2023-05-26,65,No


In [7]:
# Filter unwanted data:
# Keep only the rows whose age lies in the realistic student range 18-45
# (both bounds included).

age_in_range = df_cleaned['age'].between(18, 45)
df_filtered = df_cleaned[age_in_range]

df_filtered.head()

Unnamed: 0,student_id,name,age,gender,course,enrollment_date,final_grade,is_intern
0,S0001,Student_1,30,Female,data science,2023-03-22,97,No
1,S0002,Student_2,33,Female,data science,2023-01-29,64,No
2,S0003,Student_3,39,Female,data engineering,2023-12-24,97,No
3,S0004,Student_4,18,Male,data engineering,2023-01-09,89,Yes
4,S0005,Student_5,21,Female,cloud computing,2023-05-26,65,No


In [8]:
# Correct inconsistent entries:
# A binary column such as is_intern should contain only standardized labels
# (e.g. "Yes" or "No"); list its distinct values to verify that.

df_filtered.loc[:, 'is_intern'].unique()

array(['No', 'Yes'], dtype=object)

In [9]:
# Count the null entries in the is_intern column.
missing_values = df_filtered['is_intern'].isna().sum()

print(missing_values)

0


In [11]:
# Save cleaned data:
# Write the filtered frame to a new CSV file, ready for ingestion. The
# DataFrame index is left out of the file.

df_filtered.to_csv(
    path_or_buf='cleaned_data_trestle_academy_dataset.csv',
    index=False,
)