In [7]:
# Step 0
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, accuracy_score, f1_score,
                             mean_absolute_error, mean_squared_error, r2_score)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set this to your CSV path:
FILE_PATH = "C:/Users/User/Desktop/Capstone Project/Dataset/AI-based Career Recommendation System.csv"


In [8]:
# Step 1
df = pd.read_csv(FILE_PATH)
print("Loaded shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head())


Loaded shape: (200, 8)
Columns: ['CandidateID', 'Name', 'Age', 'Education', 'Skills', 'Interests', 'Recommended_Career', 'Recommendation_Score']


Unnamed: 0,CandidateID,Name,Age,Education,Skills,Interests,Recommended_Career,Recommendation_Score
0,1,John Doe,28,Bachelor's,Python;Data Analysis;Machine Learning,Technology;Data Science,Data Scientist,0.95
1,2,Jane Smith,32,Master's,Java;System Design;Cloud Computing,Software Development;AI,Software Engineer,0.9
2,3,Bob Johnson,24,Bachelor's,Graphic Design;UI/UX;Adobe Creative Suite,Arts;Digital Media,UX Designer,0.88
3,4,Emily Davis,26,Bachelor's,Python;Deep Learning;Statistics,Healthcare;AI,AI Researcher,0.93
4,5,Michael Brown,30,Master's,Project Management;Communication;Agile,Business;Management,Project Manager,0.87


In [29]:
print(df.columns.tolist())

['Age', 'Recommended_Career', 'Recommendation_Score', 'skill_.NET', 'skill_AI', 'skill_Adobe Creative Suite', 'skill_Adobe Illustrator', 'skill_Adobe Photoshop', 'skill_Adobe XD', 'skill_Agile', 'skill_Algorithms', 'skill_Analytics', 'skill_Android', 'skill_Automation', 'skill_Big Data', 'skill_Business Analysis', 'skill_C#', 'skill_C++', 'skill_CRM', 'skill_CSS', 'skill_Cloud Computing', 'skill_Communication', 'skill_Content Creation', 'skill_Content Strategy', 'skill_Content Writing', 'skill_Copywriting', 'skill_Creativity', 'skill_Cybersecurity', 'skill_Data Analysis', 'skill_Data Mining', 'skill_Data Science', 'skill_Data Structures', 'skill_Data Visualization', 'skill_Data Warehousing', 'skill_Deep Learning', 'skill_DevOps', 'skill_Digital Illustration', 'skill_Digital Marketing', 'skill_ETL', 'skill_Econometrics', 'skill_Embedded Systems', 'skill_Excel', 'skill_Financial Analysis', 'skill_Graphic Design', 'skill_HTML', 'skill_Illustration', 'skill_Interaction Design', 'skill_IoT'

In [30]:
# Step 2
to_drop = [c for c in ['CandidateID', 'Name', 'ID'] if c in df.columns]
if to_drop:
    df = df.drop(columns=to_drop)
    print("Dropped:", to_drop)
print("Columns now:", df.columns.tolist())


Columns now: ['Age', 'Recommended_Career', 'Recommendation_Score', 'skill_.NET', 'skill_AI', 'skill_Adobe Creative Suite', 'skill_Adobe Illustrator', 'skill_Adobe Photoshop', 'skill_Adobe XD', 'skill_Agile', 'skill_Algorithms', 'skill_Analytics', 'skill_Android', 'skill_Automation', 'skill_Big Data', 'skill_Business Analysis', 'skill_C#', 'skill_C++', 'skill_CRM', 'skill_CSS', 'skill_Cloud Computing', 'skill_Communication', 'skill_Content Creation', 'skill_Content Strategy', 'skill_Content Writing', 'skill_Copywriting', 'skill_Creativity', 'skill_Cybersecurity', 'skill_Data Analysis', 'skill_Data Mining', 'skill_Data Science', 'skill_Data Structures', 'skill_Data Visualization', 'skill_Data Warehousing', 'skill_Deep Learning', 'skill_DevOps', 'skill_Digital Illustration', 'skill_Digital Marketing', 'skill_ETL', 'skill_Econometrics', 'skill_Embedded Systems', 'skill_Excel', 'skill_Financial Analysis', 'skill_Graphic Design', 'skill_HTML', 'skill_Illustration', 'skill_Interaction Design'

In [32]:
# Step 3
def split_to_lists(series, sep=';'):
    return series.fillna('').astype(str).apply(lambda s: [t.strip() for t in s.split(sep) if t.strip()])

if 'Skills' in df.columns:
    skills_lists = split_to_lists(df['Skills'], sep=';')
    mlb_sk = MultiLabelBinarizer()
    skills_encoded = mlb_sk.fit_transform(skills_lists)
    skills_df = pd.DataFrame(skills_encoded, columns=['skill_'+c for c in mlb_sk.classes_], index=df.index)
    df = pd.concat([df.drop(columns=['Skills']), skills_df], axis=1)
    print("Added skill columns:", len(skills_df.columns))
else:
    print("No 'Skills' column found - skipped.")


No 'Skills' column found - skipped.


In [11]:
# Step 4
if 'Interests' in df.columns:
    interests_lists = split_to_lists(df['Interests'], sep=';')
    mlb_int = MultiLabelBinarizer()
    interests_encoded = mlb_int.fit_transform(interests_lists)
    interests_df = pd.DataFrame(interests_encoded, columns=['interest_'+c for c in mlb_int.classes_], index=df.index)
    df = pd.concat([df.drop(columns=['Interests']), interests_df], axis=1)
    print("Added interest columns:", len(interests_df.columns))
else:
    print("No 'Interests' column found - skipped.")


Added interest columns: 38


In [12]:
# Step 5
if 'Education' in df.columns:
    edu_dummies = pd.get_dummies(df['Education'], prefix='edu', dummy_na=False)
    df = pd.concat([df.drop(columns=['Education']), edu_dummies], axis=1)
    print("Added education dummies:", edu_dummies.columns.tolist())
else:
    print("No 'Education' column found - skipped.")


Added education dummies: ["edu_Bachelor's", "edu_Master's", 'edu_PhD']


In [13]:
# Step 6
target_clf = 'Recommended_Career'
target_reg = 'Recommendation_Score'

X = df.drop(columns=[c for c in [target_clf, target_reg] if c in df.columns], errors='ignore')
y_clf = df[target_clf] if target_clf in df.columns else None
y_reg = df[target_reg] if target_reg in df.columns else None

print("Features shape:", X.shape)
if y_clf is not None:
    print("Classification target unique classes:", y_clf.nunique())
if y_reg is not None:
    print("Regression target stats:\n", y_reg.describe())


Features shape: (200, 122)
Classification target unique classes: 32
Regression target stats:
 count    200.000000
mean       0.896450
std        0.028915
min        0.850000
25%        0.870000
50%        0.900000
75%        0.920000
max        0.950000
Name: Recommendation_Score, dtype: float64


In [14]:
# Step 7
if y_clf is not None:
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_clf, test_size=0.2, random_state=42, stratify=y_clf
        )
    except Exception:
        X_train, X_test, y_train, y_test = train_test_split(X, y_clf, test_size=0.2, random_state=42)
    print("Train/Test shapes:", X_train.shape, X_test.shape)
else:
    print("No classification target found; skipping split.")


Train/Test shapes: (160, 122) (40, 122)


In [15]:
# Step 8
if y_clf is not None:
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 (macro):", f1_score(y_test, y_pred, average='macro'))
    print("\nClassification report:\n", classification_report(y_test, y_pred))
else:
    print("Skipping classification training.")


Accuracy: 0.45
F1 (macro): 0.30625

Classification report:
                            precision    recall  f1-score   support

            AI Researcher       1.00      1.00      1.00         1
            AI Specialist       1.00      1.00      1.00         1
      Automation Engineer       0.00      0.00      0.00         1
        Backend Developer       0.50      1.00      0.67         1
          Biostatistician       0.50      1.00      0.67         1
         Business Analyst       0.00      0.00      0.00         1
           Cloud Engineer       0.00      0.00      0.00         1
       Content Strategist       0.00      0.00      0.00         1
    Cybersecurity Analyst       0.25      1.00      0.40         1
 Cybersecurity Specialist       0.00      0.00      0.00         1
             Data Analyst       0.50      0.75      0.60         4
            Data Engineer       0.00      0.00      0.00         1
           Data Scientist       0.00      0.00      0.00         1
 

In [16]:
# Step 9
if y_clf is not None:
    numeric_cols = ['Age'] if 'Age' in X.columns else []
    preproc = ColumnTransformer([('scale_age', StandardScaler(), numeric_cols)], remainder='passthrough')
    log_pipe = Pipeline([('preproc', preproc), ('clf', LogisticRegression(max_iter=1000, random_state=42))])
    log_pipe.fit(X_train, y_train)
    y_pred_log = log_pipe.predict(X_test)
    print("LogisticRegression Accuracy:", accuracy_score(y_test, y_pred_log))
else:
    print("Skipping logistic baseline.")


LogisticRegression Accuracy: 0.4


In [17]:
# Step 10
if y_reg is not None:
    X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, test_size=0.2, random_state=42)
    print("Regression Train/Test shapes:", X_train_r.shape, X_test_r.shape)
else:
    print("No regression target found; skipping regression split.")


Regression Train/Test shapes: (160, 122) (40, 122)


In [19]:
# Step 11
if y_reg is not None:
    reg = RandomForestRegressor(n_estimators=100, random_state=42)
    reg.fit(X_train_r, y_train_r)
    y_pred_r = reg.predict(X_test_r)

    print("MAE:", mean_absolute_error(y_test_r, y_pred_r))
    # Calculate RMSE manually by taking the square root of MSE
    print("RMSE:", np.sqrt(mean_squared_error(y_test_r, y_pred_r)))
    print("R2:", r2_score(y_test_r, y_pred_r))
else:
    print("Skipping regression training.")

MAE: 0.021784053571428534
RMSE: 0.027736440387203395
R2: -0.10147311012493931


In [20]:
# Step 12
if y_reg is not None:
    lin = LinearRegression()
    lin.fit(X_train_r, y_train_r)
    print("Linear Regression R2:", r2_score(y_test_r, lin.predict(X_test_r)))
else:
    print("Skipping linear regression.")


Linear Regression R2: 0.12693615389103252


In [21]:
# Step 13
if 'clf' in globals():
    joblib.dump(clf, "rf_classifier.joblib")
    print("Saved rf_classifier.joblib")
if 'reg' in globals():
    joblib.dump(reg, "rf_regressor.joblib")
    print("Saved rf_regressor.joblib")
if 'log_pipe' in globals():
    joblib.dump(log_pipe, "logistic_pipeline.joblib")
    print("Saved logistic_pipeline.joblib")


Saved rf_classifier.joblib
Saved rf_regressor.joblib
Saved logistic_pipeline.joblib


In [None]:
KNN


In [33]:
from sklearn.neighbors import KNeighborsClassifier

# KNN Classifier
knn_clf = KNeighborsClassifier(n_neighbors=5)  # you can tune k=3,5,7,...
knn_clf.fit(X_train, y_train)
pred_knn = knn_clf.predict(X_test)

print("KNN Classifier — Accuracy:", accuracy_score(y_test, pred_knn))
print("KNN Classifier — F1 (macro):", f1_score(y_test, pred_knn, average='macro'))
print("\nKNN classification report:\n", classification_report(y_test, pred_knn, zero_division=0))


KNN Classifier — Accuracy: 0.2
KNN Classifier — F1 (macro): 0.11316964285714284

KNN classification report:
                            precision    recall  f1-score   support

            AI Researcher       0.50      1.00      0.67         1
            AI Specialist       0.00      0.00      0.00         1
      Automation Engineer       0.00      0.00      0.00         1
        Backend Developer       0.00      0.00      0.00         1
          Biostatistician       0.50      1.00      0.67         1
         Business Analyst       0.00      0.00      0.00         1
           Cloud Engineer       0.00      0.00      0.00         1
       Content Strategist       0.00      0.00      0.00         1
    Cybersecurity Analyst       0.00      0.00      0.00         1
 Cybersecurity Specialist       0.00      0.00      0.00         1
             Data Analyst       0.25      0.25      0.25         4
            Data Engineer       0.00      0.00      0.00         1
           Data Sci

In [38]:
# Import necessary libraries
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
# Use California housing dataset instead of Boston
from sklearn.datasets import fetch_california_housing  # Alternative dataset

# Load the California housing dataset instead of Boston
# This dataset is ethically sound and currently supported by scikit-learn
housing = fetch_california_housing()
X = housing.data  # Features
y = housing.target  # Target variable

# Create train and test sets
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN Regressor
knn_reg = KNeighborsRegressor(n_neighbors=5)
knn_reg.fit(Xr_train, yr_train)
pred_knn_r = knn_reg.predict(Xr_test)

# Evaluate the model
print("KNN Regressor — MAE:", mean_absolute_error(yr_test, pred_knn_r))
# Calculate RMSE manually by taking the square root of MSE
print("KNN Regressor — RMSE:", np.sqrt(mean_squared_error(yr_test, pred_knn_r)))
print("KNN Regressor — R2:", r2_score(yr_test, pred_knn_r))

KNN Regressor — MAE: 0.8127975600775195
KNN Regressor — RMSE: 1.0576778270706204
KNN Regressor — R2: 0.14631049965900345
