In [1]:
# === IMPORTS ===
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, top_k_accuracy_score, classification_report

from imblearn.over_sampling import RandomOverSampler
import joblib

# === Reproducibility ===
# Single seed reused by every stochastic step in the notebook.
RANDOM_STATE = 42

# === Artifacts directory ===
# Folder (relative to the working directory) that receives the saved model files.
ARTIFACT_DIR = "kanasu_model_artifacts"
# exist_ok=True keeps the cell re-runnable when the folder is already there.
os.makedirs(name=ARTIFACT_DIR, exist_ok=True)
ARTIFACT_DIR


'kanasu_model_artifacts'

In [2]:
# Colab-only helper: opens a browser upload dialog and stores the chosen
# file(s) in the current working directory.
from google.colab import files

print("Upload your training CSV:")
uploaded = files.upload()

# Guard against an empty result (e.g. the dialog was dismissed) so the failure
# is an explicit message instead of an IndexError on the first-key lookup.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the training CSV.")

# Only the first uploaded file is used as the training set.
CSV_PATH = next(iter(uploaded))
df = pd.read_csv(CSV_PATH)

print("Loaded", CSV_PATH)
df.head()


Upload your training CSV:


Saving kanasu_synth_17500.csv to kanasu_synth_17500.csv
Loaded kanasu_synth_17500.csv


Unnamed: 0,id,age,education,stream_code,interests,skills,gender,title_code,title_label,cluster_code,cluster_label
0,row_14281,22,iti,mechanical_iti,"hands_on,ncc_defence,tailoring,teaching","crafting,critical_thinking,hand_eye_coordinati...",male,tailor,Tailor,skilled_trades,Skilled Trades & Vocational
1,row_04563,19,ug,bca,"chemistry_experiments,drawing,health_fitness","analytical,critical_thinking,research_skills",other,pharmacist,Pharmacist,healthcare,Healthcare & Medicine
2,row_15256,26,iti,electroplating_iti,"debate,event_planning,hotel_management,travel_...","budgeting,customer_service,leadership,planning",other,hotel_manager,Hotel Manager,hospitality,Hospitality & Tourism
3,row_10924,22,ug,be,"civic_issues,discipline,electrical_work,sports","discipline,fitness_strength",female,police_officer,Police Officer,law_public,"Law, Civil & Public Services"
4,row_04668,31,ug,bba,"chemistry_experiments,environment_science,heal...","analytical,research_skills",male,pharmacist,Pharmacist,healthcare,Healthcare & Medicine


In [3]:
# Columns the rest of the notebook relies on; fail fast if the CSV lacks any.
required_cols = ['id','age','education','stream_code','interests','skills',
                 'gender','title_code','title_label','cluster_code','cluster_label']

for c in required_cols:
    if c not in df.columns:
        raise ValueError(f"Missing column: {c}")

# Turn the comma-separated tag lists into space-separated tokens.
# regex=False makes the comma a literal match regardless of pandas version.
df['interests'] = df['interests'].fillna('').str.replace(',', ' ', regex=False)
df['skills'] = df['skills'].fillna('').str.replace(',', ' ', regex=False)

# Single free-text feature: interests followed by skills.
df['text'] = (df['interests'] + " " + df['skills']).str.strip()

# Coerce age to numeric FIRST, then impute with the median of the coerced
# values. Taking the median of the raw column fails (or is meaningless) when
# it still contains the non-numeric entries that errors='coerce' is meant to
# absorb.
age_numeric = pd.to_numeric(df['age'], errors='coerce')
df['age'] = age_numeric.fillna(age_numeric.median()).astype(int)

df.head()


Unnamed: 0,id,age,education,stream_code,interests,skills,gender,title_code,title_label,cluster_code,cluster_label,text
0,row_14281,22,iti,mechanical_iti,hands_on ncc_defence tailoring teaching,crafting critical_thinking hand_eye_coordinati...,male,tailor,Tailor,skilled_trades,Skilled Trades & Vocational,hands_on ncc_defence tailoring teaching crafti...
1,row_04563,19,ug,bca,chemistry_experiments drawing health_fitness,analytical critical_thinking research_skills,other,pharmacist,Pharmacist,healthcare,Healthcare & Medicine,chemistry_experiments drawing health_fitness a...
2,row_15256,26,iti,electroplating_iti,debate event_planning hotel_management travel_...,budgeting customer_service leadership planning,other,hotel_manager,Hotel Manager,hospitality,Hospitality & Tourism,debate event_planning hotel_management travel_...
3,row_10924,22,ug,be,civic_issues discipline electrical_work sports,discipline fitness_strength,female,police_officer,Police Officer,law_public,"Law, Civil & Public Services",civic_issues discipline electrical_work sports...
4,row_04668,31,ug,bba,chemistry_experiments environment_science heal...,analytical research_skills,male,pharmacist,Pharmacist,healthcare,Healthcare & Medicine,chemistry_experiments environment_science heal...


In [4]:
# Step 1: hold out 15% of all rows as the test set, stratified on the target.
trainval_df, test_df = train_test_split(
    df,
    test_size=0.15,
    stratify=df['title_code'],
    random_state=RANDOM_STATE,
)

# Step 2: split the remaining 85% into train / validation.
# 0.17647 ~= 0.15 / 0.85, i.e. validation is about 15% of the full dataset.
train_df, val_df = train_test_split(
    trainval_df,
    test_size=0.17647,
    stratify=trainval_df['title_code'],
    random_state=RANDOM_STATE,
)

# Row counts of the three partitions, in train / val / test order.
tuple(len(part) for part in (train_df, val_df, test_df))


(12250, 2625, 2625)

In [5]:
# Feature groups consumed by the preprocessing step.
TEXT_COL = 'text'                                   # combined interests + skills tokens
CAT_COLS = ['education','stream_code','gender']     # one-hot encoded
NUM_COLS = ['age']                                  # forwarded unchanged

# Uni- and bi-gram TF-IDF over the text column, capped at 1500 terms;
# terms seen in fewer than 3 documents are dropped.
tfidf = TfidfVectorizer(
    max_features=1500,
    ngram_range=(1,2),
    min_df=3
)

# Categories unseen during fit are encoded as all-zero rows instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# The numeric columns are passed through explicitly and everything else is
# dropped. Relying on remainder="passthrough" would forward ANY unlisted
# column (ids, labels, extra fields supplied at inference time) straight into
# the classifier. Output column order is unchanged: tfidf, ohe, then NUM_COLS.
preprocessor = ColumnTransformer(
    transformers=[
        ("tfidf", tfidf, TEXT_COL),
        ("ohe", ohe, CAT_COLS),
        ("num", "passthrough", NUM_COLS),
    ],
    remainder="drop"
)


In [6]:
# Model inputs in a fixed order (text, categoricals, numerics) and the target.
feature_cols = [TEXT_COL, *CAT_COLS, *NUM_COLS]
target_col = 'title_code'

# Same column selection applied to each partition.
X_train, y_train = train_df[feature_cols], train_df[target_col]
X_val, y_val = val_df[feature_cols], val_df[target_col]
X_test, y_test = test_df[feature_cols], test_df[target_col]


In [7]:
# Learn vocabulary / categories from the training partition only.
preprocessor.fit(X_train)

# Apply the fitted transformer to every partition.
X_train_mat, X_val_mat, X_test_mat = (
    preprocessor.transform(part) for part in (X_train, X_val, X_test)
)

# Random oversampling is applied to the training matrix only; validation and
# test matrices are left as transformed.
ros = RandomOverSampler(random_state=RANDOM_STATE)
X_train_bal, y_train_bal = ros.fit_resample(X_train_mat, y_train)

# Shapes before and after oversampling.
X_train_mat.shape, X_train_bal.shape


((12250, 1534), (12267, 1534))

In [8]:
# Forest hyper-parameters collected in one place for easy tuning.
rf_params = dict(
    n_estimators=300,
    class_weight="balanced_subsample",
    n_jobs=-1,                  # use all available cores
    random_state=RANDOM_STATE,
)
rf = RandomForestClassifier(**rf_params)

# Train on the oversampled training matrix produced earlier.
rf.fit(X_train_bal, y_train_bal)
print("RandomForest training complete.")


RandomForest training complete.


In [9]:
# Bundle the preprocessor and forest into one object that accepts raw feature
# frames. Pipeline.fit is not called here: both steps are used as they were
# fitted in the earlier cells.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("classifier", rf),
]
model_pipeline = Pipeline(steps=pipeline_steps)

model_pipeline


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [10]:
# Hard predictions and per-class probabilities on the held-out test set.
preds = model_pipeline.predict(X_test)
probs = model_pipeline.predict_proba(X_test)

acc_top1 = accuracy_score(y_test, preds)

# The columns of `probs` follow the classifier's classes_, so pass them
# explicitly. Without `labels`, the column labels are inferred from y_test,
# which fails or mis-aligns if any class is missing from the test split.
acc_top3 = top_k_accuracy_score(
    y_test,
    probs,
    k=3,
    labels=model_pipeline.named_steps["classifier"].classes_,
)

# (top-1 accuracy, top-3 accuracy)
acc_top1, acc_top3


(0.8868571428571429, np.float64(1.0))

In [11]:
# Save pipeline (preprocessor + classifier) as a single joblib file
PIPE_PATH = os.path.join(ARTIFACT_DIR, "model.pkl")
joblib.dump(model_pipeline, PIPE_PATH)
print("Saved:", PIPE_PATH)

# Save class labels, in the same order as the columns of predict_proba.
# Classes learned from string labels form an object-dtype array, which np.save
# can only store via pickle, so np.load would then require allow_pickle=True.
# Casting to a fixed-width unicode array gives a plain .npy that loads without
# pickle.
CLASSES_PATH = os.path.join(ARTIFACT_DIR, "title_classes.npy")
class_labels = np.asarray(model_pipeline.named_steps["classifier"].classes_, dtype=str)
np.save(CLASSES_PATH, class_labels)
print("Saved:", CLASSES_PATH)

# Save title->cluster mapping: one entry per title_code holding its
# title_label, cluster_code and cluster_label.
mapping = df[['title_code','title_label','cluster_code','cluster_label']].drop_duplicates()
mapping = mapping.set_index('title_code').to_dict(orient='index')

MAPPING_PATH = os.path.join(ARTIFACT_DIR, "title_to_cluster.json")
# ensure_ascii=False keeps any non-ASCII label text readable in the JSON file.
with open(MAPPING_PATH, 'w', encoding='utf-8') as f:
    json.dump(mapping, f, ensure_ascii=False, indent=2)

print("Saved:", MAPPING_PATH)


Saved: kanasu_model_artifacts/model.pkl
Saved: kanasu_model_artifacts/title_classes.npy
Saved: kanasu_model_artifacts/title_to_cluster.json


In [12]:
# One hand-written profile, using the same raw feature columns the pipeline
# received in the earlier predict calls.
sample = pd.DataFrame({
    "text": ["coding robotics programming"],
    "education": ["puc"],
    "stream_code": ["pcmc"],
    "gender": ["male"],
    "age": [18],
})

# Probability vector for the single row, plus the labels for its positions.
probs = model_pipeline.predict_proba(sample)[0]
classes = model_pipeline.named_steps["classifier"].classes_

# Indices of the three largest probabilities, highest first.
top3 = np.argsort(probs)[-3:][::-1]

for class_idx in top3:
    pct = round(probs[class_idx]*100, 2)
    print(classes[class_idx], "-->", pct, "%")


ml_engineer --> 35.0 %
freelancer --> 8.33 %
mobile_app_developer --> 8.0 %


In [13]:
from google.colab import files

# Trigger a browser download for each saved artifact: pipeline, mapping, labels.
for artifact_path in (PIPE_PATH, MAPPING_PATH, CLASSES_PATH):
    files.download(artifact_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>