In [1]:
# Step 0: imports + file path
import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, accuracy_score, f1_score,
                             mean_absolute_error, mean_squared_error, r2_score)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression

FILE_PATH = "C:/Users/User/Desktop/Capstone Project/Dataset/cs_students.csv"   


In [2]:
# Step 1: load & inspect
df = pd.read_csv(FILE_PATH)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head(8))


Shape: (180, 12)
Columns: ['Student ID', 'Name', 'Gender', 'Age', 'GPA', 'Major', 'Interested Domain', 'Projects', 'Future Career', 'Python', 'SQL', 'Java']


Unnamed: 0,Student ID,Name,Gender,Age,GPA,Major,Interested Domain,Projects,Future Career,Python,SQL,Java
0,1,John Smith,Male,21,3.5,Computer Science,Artificial Intelligence,Chatbot Development,Machine Learning Researcher,Strong,Strong,Weak
1,2,Alice Johnson,Female,20,3.2,Computer Science,Data Science,Data Analytics,Data Scientist,Average,Strong,Weak
2,3,Robert Davis,Male,22,3.8,Computer Science,Software Development,E-commerce Website,Software Engineer,Strong,Strong,Average
3,4,Emily Wilson,Female,21,3.7,Computer Science,Web Development,Full-Stack Web App,Web Developer,Weak,Strong,Strong
4,5,Michael Brown,Male,23,3.4,Computer Science,Cybersecurity,Network Security,Information Security Analyst,Average,Weak,Strong
5,6,Laura Lee,Female,22,3.9,Computer Science,Machine Learning,Image Recognition,Machine Learning Engineer,Strong,Average,Weak
6,7,William Johnson,Male,20,3.6,Computer Science,Database Management,SQL Query Optimization,Database Administrator,Average,Strong,Weak
7,8,Sarah Miller,Female,21,3.7,Computer Science,Cloud Computing,AWS Deployment,Cloud Solutions Architect,Weak,Strong,Average


In [3]:
print(df.columns.tolist())

['Student ID', 'Name', 'Gender', 'Age', 'GPA', 'Major', 'Interested Domain', 'Projects', 'Future Career', 'Python', 'SQL', 'Java']


In [4]:
# Step 2: drop id/name/constant columns
to_drop = [c for c in ['Student ID', 'Name', 'Major'] if c in df.columns]
if to_drop:
    df = df.drop(columns=to_drop)
    print("Dropped:", to_drop)
print("Columns now:", df.columns.tolist())


Dropped: ['Student ID', 'Name', 'Major']
Columns now: ['Gender', 'Age', 'GPA', 'Interested Domain', 'Projects', 'Future Career', 'Python', 'SQL', 'Java']


In [5]:
# Step 3: map Strong/Average/Weak -> 2/1/0
level_map = {'Strong': 2, 'Average': 1, 'Weak': 0, 'strong':2, 'average':1, 'weak':0}
for col in ['Python','SQL','Java']:
    if col in df.columns:
        df[col + '_num'] = df[col].map(level_map).fillna(0).astype(int)
        print(f"Created {col+'_num'}")


Created Python_num
Created SQL_num
Created Java_num


In [6]:
# Step 4: top project binaries
if 'Projects' in df.columns:
    topN = 8
    top_projects = df['Projects'].value_counts().nlargest(topN).index.tolist()
    for p in top_projects:
        col = 'proj_' + p.replace(' ', '_').replace('/', '_')[:40]
        df[col] = (df['Projects'] == p).astype(int)
    df['proj_other'] = (~df['Projects'].isin(top_projects)).astype(int)
    print("Top projects (binary) created:", top_projects)


Top projects (binary) created: ['Network Security', 'Natural Language Processing', 'E-commerce Website', 'SQL Query Optimization', 'AWS Deployment', 'Machine Learning', 'Full-Stack Web App', 'Social Media Platform']


In [7]:
# Step 5: one-hot encode Gender and top Interested Domain
features = []

# Age
if 'Age' in df.columns:
    features.append('Age')

# skill numeric columns
for c in ['Python_num','SQL_num','Java_num']:
    if c in df.columns:
        features.append(c)

# project binary columns (from step 4)
proj_cols = [c for c in df.columns if c.startswith('proj_')]
features += proj_cols

# Gender one-hot
if 'Gender' in df.columns:
    gender_dummies = pd.get_dummies(df['Gender'], prefix='gender')
    df = pd.concat([df, gender_dummies], axis=1)
    features += list(gender_dummies.columns)

# Interested Domain -> top 12 one-hot + other
if 'Interested Domain' in df.columns:
    topK = 12
    top_domains = df['Interested Domain'].value_counts().nlargest(topK).index.tolist()
    dom_keep = pd.get_dummies(df['Interested Domain']).loc[:, top_domains]
    dom_keep.columns = ['domain_' + str(c).replace(' ', '_') for c in dom_keep.columns]
    df = pd.concat([df, dom_keep], axis=1)
    df['domain_other'] = (~df['Interested Domain'].isin(top_domains)).astype(int)
    features += list(dom_keep.columns) + ['domain_other']

print("Feature candidates count:", len(features))
print("Some feature names:", features[:30])


Feature candidates count: 28
Some feature names: ['Age', 'Python_num', 'SQL_num', 'Java_num', 'proj_Network_Security', 'proj_Natural_Language_Processing', 'proj_E-commerce_Website', 'proj_SQL_Query_Optimization', 'proj_AWS_Deployment', 'proj_Machine_Learning', 'proj_Full-Stack_Web_App', 'proj_Social_Media_Platform', 'proj_other', 'gender_Female', 'gender_Male', 'domain_Artificial_Intelligence', 'domain_Web_Development', 'domain_Cybersecurity', 'domain_Mobile_App_Development', 'domain_Database_Management', 'domain_Cloud_Computing', 'domain_Data_Science', 'domain_Computer_Graphics', 'domain_Machine_Learning', 'domain_Software_Development', 'domain_Software_Engineering', 'domain_Bioinformatics', 'domain_other']


In [8]:
# Step 6: define targets
target_clf = 'Future Career'   # classification target
target_reg = 'GPA'             # regression target

# build X from features list (fill missing with 0)
X = df[features].fillna(0)
print("X shape:", X.shape)

y_clf = df[target_clf] if target_clf in df.columns else None
y_reg = df[target_reg] if target_reg in df.columns else None

print("Have classification target?", y_clf is not None)
print("Have regression target?", y_reg is not None)


X shape: (180, 28)
Have classification target? True
Have regression target? True


In [9]:
# Step 7: classification split
if y_clf is not None:
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_clf, test_size=0.2, random_state=42, stratify=y_clf
        )
    except Exception:
        X_train, X_test, y_train, y_test = train_test_split(X, y_clf, test_size=0.2, random_state=42)
    print("Train/test shapes:", X_train.shape, X_test.shape)
else:
    print("No classification target found.")


Train/test shapes: (144, 28) (36, 28)


In [10]:
# Step 8: train classifier(s)
if y_clf is not None:
    # Decision Tree
    dt_clf = DecisionTreeClassifier(max_depth=6, random_state=42)
    dt_clf.fit(X_train, y_train)
    dt_pred = dt_clf.predict(X_test)

    print("Decision Tree — Accuracy:", accuracy_score(y_test, dt_pred))
    print("Decision Tree — F1 (macro):", f1_score(y_test, dt_pred, average='macro'))
    print("\nDecision Tree classification report:\n", classification_report(y_test, dt_pred, zero_division=0))

    # Random Forest baseline
    rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_clf.fit(X_train, y_train)
    rf_pred = rf_clf.predict(X_test)

    print("Random Forest — Accuracy:", accuracy_score(y_test, rf_pred))
    print("Random Forest — F1 (macro):", f1_score(y_test, rf_pred, average='macro'))
    print("\nRandom Forest classification report:\n", classification_report(y_test, rf_pred, zero_division=0))


Decision Tree — Accuracy: 0.4444444444444444
Decision Tree — F1 (macro): 0.2823529411764706

Decision Tree classification report:
                               precision    recall  f1-score   support

               AI Researcher       0.00      0.00      0.00         1
            Bioinformatician       0.00      0.00      0.00         1
   Cloud Solutions Architect       1.00      1.00      1.00         5
                Data Analyst       0.00      0.00      0.00         1
              Data Scientist       0.00      0.00      0.00         1
      Database Administrator       1.00      1.00      1.00         2
Distributed Systems Engineer       0.00      0.00      0.00         1
              Ethical Hacker       0.00      0.00      0.00         1
              Game Developer       0.00      0.00      0.00         1
         Graphics Programmer       0.00      0.00      0.00         1
    Healthcare IT Specialist       0.00      0.00      0.00         1
Information Security Analyst

In [11]:
# Step 9: regression split
if y_reg is not None:
    Xr_train, Xr_test, yr_train, yr_test = train_test_split(X, y_reg, test_size=0.2, random_state=42)
    print("Regression train/test shapes:", Xr_train.shape, Xr_test.shape)
else:
    print("No regression target found.")


Regression train/test shapes: (144, 28) (36, 28)


In [13]:
# Step 10: regressors
if y_reg is not None:
    # Decision Tree Regressor
    dt_reg = DecisionTreeRegressor(max_depth=6, random_state=42)
    dt_reg.fit(Xr_train, yr_train)
    pred_dt_r = dt_reg.predict(Xr_test)
    print("Decision Tree Regressor MAE:", mean_absolute_error(yr_test, pred_dt_r))
    # Calculate RMSE by taking square root of MSE
    print("Decision Tree Regressor RMSE:", np.sqrt(mean_squared_error(yr_test, pred_dt_r)))
    print("Decision Tree Regressor R2:", r2_score(yr_test, pred_dt_r))

    # Random Forest Regressor
    rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_reg.fit(Xr_train, yr_train)
    pred_rf_r = rf_reg.predict(Xr_test)
    print("Random Forest Regressor MAE:", mean_absolute_error(yr_test, pred_rf_r))
    # Calculate RMSE by taking square root of MSE
    print("Random Forest Regressor RMSE:", np.sqrt(mean_squared_error(yr_test, pred_rf_r)))
    print("Random Forest Regressor R2:", r2_score(yr_test, pred_rf_r))

Decision Tree Regressor MAE: 0.11827160493827164
Decision Tree Regressor RMSE: 0.1622405493841833
Decision Tree Regressor R2: 0.09152312542295238
Random Forest Regressor MAE: 0.09864623015873118
Random Forest Regressor RMSE: 0.1497156317867752
Random Forest Regressor R2: 0.22637703222098715


In [14]:
# Step 11: save models for later use
joblib.dump(dt_clf, "dt_classifier.joblib")
joblib.dump(rf_clf, "rf_classifier.joblib")
joblib.dump(dt_reg, "dt_regressor.joblib")
joblib.dump(rf_reg, "rf_regressor.joblib")
print("Saved model files to current notebook folder.")


Saved model files to current notebook folder.


In [None]:
knn

In [15]:
from sklearn.neighbors import KNeighborsClassifier

# KNN Classifier
knn_clf = KNeighborsClassifier(n_neighbors=5)  # you can tune k=3,5,7,...
knn_clf.fit(X_train, y_train)
pred_knn = knn_clf.predict(X_test)

print("KNN Classifier — Accuracy:", accuracy_score(y_test, pred_knn))
print("KNN Classifier — F1 (macro):", f1_score(y_test, pred_knn, average='macro'))
print("\nKNN classification report:\n", classification_report(y_test, pred_knn, zero_division=0))


KNN Classifier — Accuracy: 0.6666666666666666
KNN Classifier — F1 (macro): 0.43006535947712415

KNN classification report:
                               precision    recall  f1-score   support

               AI Researcher       0.00      0.00      0.00         1
            Bioinformatician       0.00      0.00      0.00         1
   Cloud Solutions Architect       0.57      0.80      0.67         5
                Data Analyst       0.00      0.00      0.00         1
              Data Scientist       1.00      1.00      1.00         1
      Database Administrator       1.00      0.50      0.67         2
Digital Forensics Specialist       0.00      0.00      0.00         0
Distributed Systems Engineer       0.00      0.00      0.00         1
              Ethical Hacker       0.00      0.00      0.00         1
              Game Developer       1.00      1.00      1.00         1
         Graphics Programmer       0.25      1.00      0.40         1
    Healthcare IT Specialist       

In [17]:
# Import necessary libraries
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
# Use California housing dataset instead of Boston
from sklearn.datasets import fetch_california_housing  # Alternative dataset

# Load the California housing dataset instead of Boston
# This dataset is ethically sound and currently supported by scikit-learn
housing = fetch_california_housing()
X = housing.data  # Features
y = housing.target  # Target variable

# Create train and test sets
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN Regressor
knn_reg = KNeighborsRegressor(n_neighbors=5)
knn_reg.fit(Xr_train, yr_train)
pred_knn_r = knn_reg.predict(Xr_test)

# Evaluate the model
print("KNN Regressor — MAE:", mean_absolute_error(yr_test, pred_knn_r))
# Calculate RMSE manually by taking the square root of MSE
print("KNN Regressor — RMSE:", np.sqrt(mean_squared_error(yr_test, pred_knn_r)))
print("KNN Regressor — R2:", r2_score(yr_test, pred_knn_r))

KNN Regressor — MAE: 0.8127975600775195
KNN Regressor — RMSE: 1.0576778270706204
KNN Regressor — R2: 0.14631049965900345
