<a href="https://colab.research.google.com/github/cheonghf/ML-P4-03/blob/main/V2_Project_SourceCode_P4_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import requests
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Dataset ID
dataset_id = "d_3c55210de27fcccda2ed0c63fdd2b352"

# API URL for the dataset
dataset_url = f"https://data.gov.sg/api/action/datastore_search?resource_id={dataset_id}&limit=5000"

# Request data from the API
response = requests.get(dataset_url)
response.raise_for_status()  # Ensure the request was successful

# Convert response to JSON
data = response.json()

# Extract records
if 'result' in data and 'records' in data['result']:
    df_ges1 = pd.DataFrame(data['result']['records'])
    print(df_ges1.shape)  # Print shape to verify
else:
    print("No records found in the dataset.")

# dataset_1 = 'https://raw.githubusercontent.com/cheonghf/ML-P4-03/refs/heads/main/GraduateEmploymentSurveyNTUNUSSITSMUSUSSSUTD.csv'
# df_ges1 = pd.read_csv(dataset_1)

In [20]:
# Define clusters based on degree names
def classify_degree(degree):
    engineering_keywords = ['Engineering', 'Material Science', 'Mechanical', 'Electrical', 'Civil', 'Aerospace', 'Bioengineering']
    it_keywords = ['Computing', 'Computer Science', 'Information Systems', 'Software', 'Cybersecurity', 'Data Science']
    science_keywords = ['Science', 'Physics', 'Mathematics', 'Chemistry', 'Biological', 'Pharmacy', 'Life Sciences']
    business_keywords = ['Business', 'Finance', 'Accountancy', 'Economics', 'Management', 'Marketing']

    degree_lower = degree.lower()

    if any(keyword.lower() in degree_lower for keyword in engineering_keywords):
        return 'Engineering'
    elif any(keyword.lower() in degree_lower for keyword in it_keywords):
        return 'Information Technology'
    elif any(keyword.lower() in degree_lower for keyword in science_keywords):
        return 'Science'
    elif any(keyword.lower() in degree_lower for keyword in business_keywords):
        return 'Business'
    else:
        return 'Others'

# Apply classification to the dataset
df_ges1['degree_cluster'] = df_ges1['degree'].apply(classify_degree)

#AFTER

In [None]:
# Replace the string 'na' with actual NaN (np.nan)
df_ges1.replace('na', np.nan, inplace=True)

# Count NaN values in each column
na_counts = df_ges1.isna().sum()

# Print the result
print(na_counts)

In [16]:
# Remove rows with any NaN values
df_ges1.dropna()

# Copy ges1 DataFrame into another DataFrame
df_clean_ges = df_ges1[:]

In [None]:
columns_to_convert = [
    "employment_rate_overall", "employment_rate_ft_perm",
    "basic_monthly_mean", "basic_monthly_median",
    "gross_monthly_mean", "gross_monthly_median",
    "gross_mthly_25_percentile", "gross_mthly_75_percentile"
]

for column in columns_to_convert:
    # Replace commas or other problematic characters if needed
    df_clean_ges[column] = df_clean_ges[column].astype(str).str.replace(',', '')

    # Remove leading/trailing whitespace
    df_clean_ges[column] = df_clean_ges[column].str.strip()

    # Convert to numeric
    df_clean_ges[column] = pd.to_numeric(df_clean_ges[column], errors='coerce')

# Check if conversion worked
print(df_clean_ges.dtypes)

In [21]:
#Extract NTU and NUS DataFrames respectively
df_clean_ges_ntu = df_clean_ges.loc[df_clean_ges['university'] == 'Nanyang Technological University']
df_clean_ges_nus = df_clean_ges.loc[df_clean_ges['university'] == 'National University of Singapore']

# Group by year and school cluster (college level)
df_ntu_grouped = df_clean_ges_ntu.groupby(['year', 'university', 'school']).mean(numeric_only=True).reset_index()
df_nus_grouped = df_clean_ges_nus.groupby(['year', 'university', 'school']).mean(numeric_only=True).reset_index()

In [19]:
df_ntu_grouped.school.value_counts()

Unnamed: 0_level_0,count
school,Unnamed: 1_level_1
College of Business (Nanyang Business School),10
College of Engineering,10
National Institute of Education (NIE),10
"College of Humanities, Arts & Social Sciences",9
College of Sciences,7
Sports Science and Management,6
Lee Kong Chian School of Medicine,5
College of Science,3


In [20]:
df_nus_grouped.school.value_counts()

Unnamed: 0_level_0,count
school,Unnamed: 1_level_1
Faculty of Arts & Social Sciences,10
Faculty of Law,10
Faculty of Science,10
NUS Business School,10
School of Computing,10
School of Design & Environment,9
Faculty of Dentistry,8
Faculty of Engineering,7
Yale-NUS College,6
Yong Loo Lin School (Medicine),6


In [7]:
df_nus_grouped.dtypes

Unnamed: 0,0
year,int64
university,object
school,object
employment_rate_overall,float64
employment_rate_ft_perm,float64
basic_monthly_mean,float64
basic_monthly_median,float64
gross_monthly_mean,float64
gross_monthly_median,float64
gross_mthly_25_percentile,float64
