In [None]:

# Install only what's necessary
!pip install -q pandas scikit-learn joblib nltk

import pandas as pd
import joblib
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fetch the NLTK data that word_tokenize and the stop-word list depend on.
# Older NLTK releases load the tokenizer from 'punkt'; newer releases look up
# 'punkt_tab' instead, so both are requested to keep a fresh kernel working
# regardless of which NLTK version the unpinned install resolved to.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)


In [None]:

# Define preprocessing function
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
_STOP_WORDS_CACHE = None  # filled lazily on the first preprocess_text call


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalize raw text into a space-joined string of lowercase tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenize with NLTK's word_tokenize, and drop English stop words.

    Args:
        text: The raw text. None and float NaN (missing values) are treated as
            empty text; any other non-string value is converted with str().

    Returns:
        The remaining tokens joined by single spaces ("" when nothing remains).
    """
    # Missing values would otherwise crash on .lower() or be read as "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lowered = str(text).lower()
    # NOTE(review): non-letters are deleted, not replaced by a space, so
    # "python/java" becomes "pythonjava". Kept as-is to preserve the existing
    # feature space; confirm whether that merging is intended.
    letters_only = _NON_LETTER_PATTERN.sub('', lowered)

    # The stop-word set is loaded once and reused instead of per call.
    stop_words = _english_stop_words()
    kept = [token for token in word_tokenize(letters_only) if token not in stop_words]
    return " ".join(kept)


In [None]:

# Load and preprocess dataset
df = pd.read_csv("dataset.csv")

# Drop rows missing either the text or the label before cleaning: astype(str)
# would otherwise turn a missing resume into the literal document "nan", and a
# missing label cannot be used as a training target.
df = df.dropna(subset=["resume_text", "job_role"])
df["resume_text"] = df["resume_text"].astype(str).apply(preprocess_text)

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df["resume_text"],
    df["job_role"],
    test_size=0.2,
    random_state=42,
)


In [None]:

# Train model: TF-IDF features feeding a random forest classifier.
# Step names ('tfidf', 'clf') are kept so the saved pipeline's parameter names
# stay the same for anything that loads it.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42))
])
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split so the model is validated
# before it is used; without this the test split is never consulted.
test_accuracy = accuracy_score(y_test, pipeline.predict(X_test))
print(f"Held-out accuracy: {test_accuracy:.3f}")


In [None]:

# Persist the fitted pipeline to disk and report where it was written
model_path = "trained_job_role_model.pkl"
joblib.dump(pipeline, model_path)
print("Pickle file saved as " + model_path)
