In [1]:
import pandas as pd
import re
import string
import numpy as np
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:

# Load Dataset
df = pd.read_csv("C:/Users/Acer/Desktop/Talaba,Ephraim/ARSwithPredictiveAnalytics/resume-dataset/Resume/Resume.csv")  # Update with the correct filename

# Drop rows with missing values
df.dropna(inplace=True)

# Define stopwords
stop_words = set(stopwords.words('english'))

import inflect

# Initialize the number-to-word converter
p = inflect.engine()

def convert_numbers_to_words(text):
    words = text.split()
    converted_words = [p.number_to_words(word) if word.isdigit() else word for word in words]
    return " ".join(converted_words)

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters, punctuation, and extra spaces
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation

    # Convert numbers to words
    text = convert_numbers_to_words(text)

    # Tokenization
    words = word_tokenize(text)

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    return " ".join(words)


# Apply preprocessing to resume text
df["Resume_str"] = df["Resume_str"].apply(preprocess_text)

In [12]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("C:/Users/Acer/Desktop/Talaba,Ephraim/ARSwithPredictiveAnalytics/resume-dataset/Resume/Resume.csv")

# Encode job roles
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df["Job Roles"])

# Fix missing class labels by re-mapping to sequential values
unique_classes = np.unique(y_encoded)
class_mapping = {old_label: new_label for new_label, old_label in enumerate(unique_classes)}
y = np.array([class_mapping[label] for label in y_encoded])  # Re-map labels

# Convert text data to TF-IDF features
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["Resume_str"]).toarray()

# Split dataset into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost Classifier with corrected number of classes
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(unique_classes), eval_metric="mlogloss")

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)


In [14]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Fix classification report issue
print("\nClassification Report:\n", classification_report(y_test, y_pred, labels=np.unique(y_test), target_names=label_encoder.classes_[:len(np.unique(y_test))]))


Accuracy: 0.7565

Classification Report:
                         precision    recall  f1-score   support

            ACCOUNTANT       0.76      0.90      0.82        31
              ADVOCATE       0.81      0.81      0.81        31
           AGRICULTURE       0.80      0.36      0.50        11
               APPAREL       0.50      0.39      0.44        18
             ARCHITECT       0.73      0.33      0.46        24
                  ARTS       0.50      0.29      0.36         7
            AUTOMOBILE       0.74      0.91      0.82        22
              AVIATION       0.72      0.62      0.67        21
               BANKING       0.50      0.50      0.50         2
                   BPO       0.65      0.74      0.69        23
  BUSINESS-DEVELOPMENT       0.88      0.92      0.90        24
                  CHEF       0.86      0.80      0.83        30
          CONSTRUCTION       0.50      0.60      0.55        15
            CONSULTANT       0.83      1.00      0.90        

In [15]:
import pickle

# Save the XGBoost model
with open('xgb_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

# Save the TF-IDF vectorizer
with open('tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

# Save the Label Encoder
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

print("Model, TF-IDF vectorizer, and Label Encoder saved successfully!")


Model, TF-IDF vectorizer, and Label Encoder saved successfully!
