In [8]:
import re
from ftfy import fix_text
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from docx import Document
import skills_extraction as skills_extraction

# Load dataset:
jd_df = pd.read_csv(r'structuredpublic.csv')

# Load the extracted resume skills:
file_path = r'CV.pdf'
skills = []
skills.append(' '.join(word for word in skills_extraction.skills_extractor(file_path)))

# Function to create ngrams
def ngrams(string, n=3):
    string = fix_text(string)  # fix text
    string = string.encode("ascii", errors="ignore").decode()  # remove non-ascii chars
    string = string.lower()
    chars_to_remove = [")", "(", ".", "|", "[", "]", "{", "}", "'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title()  # normalize case - capital at the start of each word
    string = re.sub(' +', ' ', string).strip()  # get rid of multiple spaces and replace with a single
    string = ' ' + string + ' '  # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD', r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

# vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
# tfidf = vectorizer.fit_transform(skills)
# jd_test = jd_df['Processed_JD'].values.astype('U')


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import joblib

# Assuming that your dataframe has a 'Labels' column that contains the labels for your data
labels = jd_df['Sector'].values.astype('U')

# Split the data into a training set and a testing set
jd_train, jd_test, labels_train, labels_test = train_test_split(jd_df['processed_JD'].values.astype('U'), labels, test_size=0.2, random_state=42)

# Transform the training and testing sets
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
jd_train_transformed = vectorizer.fit_transform(jd_train)
jd_test_transformed = vectorizer.transform(jd_test)

# Train the SVM model on the training set
svm = LinearSVC()
svm.fit(jd_train_transformed, labels_train)

# Save the model to a file
joblib.dump(svm, 'svm_model.pkl')

# Load the model from the file
svm_loaded = joblib.load('svm_model.pkl')

# Use the loaded model to make predictions on the testing set
labels_pred = svm_loaded.predict(jd_test_transformed)

# Build a confusion matrix
print(confusion_matrix(labels_test, labels_pred))
print(classification_report(labels_test, labels_pred))
print("Accuracy: ", accuracy_score(labels_test, labels_pred))




[[ 6  0  0  0  2  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  5  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 34  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1 26  0  0  0  0  0  4  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  2  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 10  0  0  4  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  2  0  2  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  9  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0 40  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1 10  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1  0  4  0  0  0  0  0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
################ naive bayes 
from sklearn.naive_bayes import MultinomialNB

# Train the Naive Bayes model on the training set
nb = MultinomialNB()
nb.fit(jd_train_transformed, labels_train)

# Save the model to a file
joblib.dump(nb, 'nb_model.pkl')

# Load the model from the file
nb_loaded = joblib.load('nb_model.pkl')

# Use the loaded model to make predictions on the testing set
labels_pred = nb_loaded.predict(jd_test_transformed)

# Build a confusion matrix
print(confusion_matrix(labels_test, labels_pred))
print(classification_report(labels_test, labels_pred))
print("Accuracy: ", accuracy_score(labels_test, labels_pred))


[[ 0  0  0  0  0  0  0  0  0  0  0 10  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  6  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0 33  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 31  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  4  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 14  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  4  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  9  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 41  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 12  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  5  0  0  0  0  0  0  0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
from sklearn.neighbors import KNeighborsClassifier

# Train the KNN model on the training set
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(jd_train_transformed, labels_train)

# Save the model to a file
joblib.dump(knn, 'knn_model.pkl')

# Load the model from the file
knn_loaded = joblib.load('knn_model.pkl')

# Use the loaded model to make predictions on the testing set
labels_pred = knn_loaded.predict(jd_test_transformed)

# Build a confusion matrix
print(confusion_matrix(labels_test, labels_pred))
print(classification_report(labels_test, labels_pred))
print("Accuracy: ", accuracy_score(labels_test, labels_pred))


[[ 3  0  0  0  3  3  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  2  0  0  0  0  0  3  0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 3  0  0  0 23  0  0  0  5  0  0  1  0  0  0  0  0  0  2  0  0  0  0]
 [ 2  0  0  1  2 17  0  0  2  0  0  2  1  0  0  0  0  0  4  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  3  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  8  0  0  4  0  0  0  0  0  0  2  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  4  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  7  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  1  8  0  0  9  0  0 19  1  0  0  0  0  0  3  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  1  0  0  1  8  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  1  1  0  0  0  0  3  0  0  0  0  0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score

# Assuming 'Sector' is the column you want to predict as a label
labels = jd_df['Sector']

# Split the data into training and testing sets (80-20 ratio)
X_train, X_test, y_train, y_test = train_test_split(tfidf, labels, test_size=0.2, random_state=42)

# Train the KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can choose the number of neighbors as needed
knn_model.fit(X_train, y_train)

# Predict on the testing set
y_pred = knn_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

# Display the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print("Confusion Matrix:")
print(conf_matrix)


NameError: name 'tfidf' is not defined

In [13]:

import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import skills_extraction

# Load dataset 
jd_df = pd.read_csv('jd_structured_data.csv')

# Extract skills from resume

file_path = r'CV.pdf'
extracted_skills = []
extracted_skills.append(' '.join(word for word in skills_extraction.skills_extractor(file_path)))

# Vectorizer for skills
skills_vectorizer = TfidfVectorizer()
skills_vectors = skills_vectorizer.fit_transform(extracted_skills)

# Vectorize job descriptions 
jd_vectorizer = TfidfVectorizer()
jd_vectors = jd_vectorizer.fit_transform(jd_df['Processed_JD'])

# Compute cosine similarity  
similarity = cosine_similarity(jd_vectors, skills_vectors)
scores = similarity[0]

# Get top 5 most similar JDs
top_indices = scores.argsort()[-5:][::-1]
recommended_jds = jd_df.iloc[top_indices]

print('Recommended Jobs:')
print(recommended_jds['Job_Title'])


ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 11524 while Y.shape[1] == 12

In [5]:
import re
from ftfy import fix_text
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import joblib
import skills_extraction as skills_extraction

jd_df = pd.read_csv(r'jd_structured_data.csv')
file_path = r'CV.pdf'
skills = []
skills.append(' '.join(word for word in skills_extraction.skills_extractor(file_path)))

def ngrams(string, n=3):
    string = fix_text(string)  # fix text
    string = string.encode("ascii", errors="ignore").decode()  # remove non-ascii chars
    string = string.lower()
    chars_to_remove = [")", "(", ".", "|", "[", "]", "{", "}", "'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title()  # normalize case - capital at the start of each word
    string = re.sub(' +', ' ', string).strip()  # get rid of multiple spaces and replace with a single
    string = ' ' + string + ' '  # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD', r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]
    # ... same as your previous code ...

labels = jd_df['Sector'].values.astype('U')
jd_train, jd_test, labels_train, labels_test = train_test_split(jd_df['Processed_JD'].values.astype('U'), labels, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
jd_train_transformed = vectorizer.fit_transform(jd_train)
jd_test_transformed = vectorizer.transform(jd_test)

joblib.dump(vectorizer, 'vectorizer.pkl')

rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(jd_train_transformed, labels_train)

joblib.dump(rfc, 'rfc_model.pkl')

rfc_loaded = joblib.load('rfc_model.pkl')
labels_pred = rfc_loaded.predict(jd_test_transformed)

print(confusion_matrix(labels_test, labels_pred))
print(classification_report(labels_test, labels_pred))
print("Accuracy: ", accuracy_score(labels_test, labels_pred))


[[ 2  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 28  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 14  0  0  0  0 17  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  5  0  0  0  4  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  7  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  8  4  0  0  0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0 47  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  3 12  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  8  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  3  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
