In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import joblib

In [3]:
df = pd.read_csv("student_dataset_400plus.csv")
df.head()

Unnamed: 0,ID,Name,Age,Gender,Math,Science,English,Programming,Communication,Creativity,Interest Area,Extra Activities,Preferred Subject,Career Goal
0,1,Aashish,17,M,89,92,78,8.5,6.0,7.0,Engineering,Robotics Club,Physics,Engineer
1,2,Sarita,16,F,85,88,90,7.8,8.5,8.0,Medicine,Debate Team,Biology,Doctor
2,3,Rohit,18,M,92,87,82,9.0,7.0,6.5,Computer Science,Chess Club,Programming,Software Developer
3,4,Anjali,17,F,78,80,85,6.5,7.5,9.0,Art,Drama Club,English,Graphic Designer
4,5,Kiran,16,M,90,85,79,8.0,6.5,7.5,Physics,Science Club,Physics,Physicist


In [4]:
df_cleaned = df.drop(columns=["ID", "Name"])
df_cleaned.head()

Unnamed: 0,Age,Gender,Math,Science,English,Programming,Communication,Creativity,Interest Area,Extra Activities,Preferred Subject,Career Goal
0,17,M,89,92,78,8.5,6.0,7.0,Engineering,Robotics Club,Physics,Engineer
1,16,F,85,88,90,7.8,8.5,8.0,Medicine,Debate Team,Biology,Doctor
2,18,M,92,87,82,9.0,7.0,6.5,Computer Science,Chess Club,Programming,Software Developer
3,17,F,78,80,85,6.5,7.5,9.0,Art,Drama Club,English,Graphic Designer
4,16,M,90,85,79,8.0,6.5,7.5,Physics,Science Club,Physics,Physicist


In [5]:
categoricals_cols = ["Gender", "Interest Area", "Extra Activities", "Preferred Subject", "Career Goal"]
label_encoders = {}
for col in categoricals_cols:
    le = LabelEncoder()
    df_cleaned[col] = le.fit_transform(df_cleaned[col])
    label_encoders[col] = le
df_cleaned.head()

Unnamed: 0,Age,Gender,Math,Science,English,Programming,Communication,Creativity,Interest Area,Extra Activities,Preferred Subject,Career Goal
0,17,1,89,92,78,8.5,6.0,7.0,4,12,5,7
1,16,0,85,88,90,7.8,8.5,8.0,7,4,1,6
2,18,1,92,87,82,9.0,7.0,6.5,2,2,6,14
3,17,0,78,80,85,6.5,7.5,9.0,0,5,3,9
4,16,1,90,85,79,8.0,6.5,7.5,8,13,5,12


# Peature X & Target y

In [6]:
X = df_cleaned.drop(columns=["Career Goal"])
y = df_cleaned["Career Goal"]

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=100)
model.fit(X_train, y_train)

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         4
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         7
           5       1.00      1.00      1.00         3
           6       1.00      1.00      1.00         6
           7       1.00      1.00      1.00        19
           8       1.00      1.00      1.00         3
          10       1.00      1.00      1.00         5
          11       1.00      1.00      1.00         5
          12       1.00      1.00      1.00         5
          13       1.00      1.00      1.00         9
          15       1.00      1.00      1.00         4
          16       1.00      1.00      1.00         2
          17       1.00      1.00      1.00         6

 

In [10]:
import joblib
joblib.dump(model,"career_model.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")

['label_encoders.pkl']

In [11]:
print("Done!")

Done!
