In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
import warnings
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, precision_score, recall_score

import os
import csv
from docx import Document

In [2]:
def convert_to_csv(directory):
    """This function will take word documents path as an argument and extracts text from the word document 
        and returns csv file with the extracted data
        Input: word files directory
        Output: output.csv file"""
    # List to store the converted data
    data = []

    # Iterate over files and folders in the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
        
            if file.endswith('.docx'):
                doc_path = os.path.join(root, file)
                doc = Document(doc_path)
        
                document_data = ''
        
                for paragraph in doc.paragraphs:
                    document_data += paragraph.text + ' '
        
                for table in doc.tables:
                    for row in table.rows:
                        for cell in row.cells:
                            document_data += cell.text + ' '
        
#                 data.append(document_data.strip())
                data.append([file, document_data])

    # Save the data to a CSV file
    csv_file = 'output.csv'
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['FileName', 'Resumes'])
        writer.writerows(data)

    print(f"Conversion completed. Converted data saved to {csv_file}.")

# Specify the directory containing the documents
directory = 'Dataset/Resumes'

# Call the function to convert the documents to CSV
convert_to_csv(directory)


Conversion completed. Converted data saved to output.csv.


In [23]:
convert_to_csv("Dataset/Test")

Conversion completed. Converted data saved to output.csv.


In [24]:
data = pd.read_csv("output.csv")
data

Unnamed: 0,FileName,Resumes
0,React Developer_Sarala Madasu-converted.docx," 204,Sri geethika prestige,road number 10,b..."


In [4]:
# Function to extract experience from the resume
def expDetails(Text):
    global sent
   
    Text = Text.split()
   
    for i in range(len(Text)-2):
        Text[i].lower()
        
        if Text[i] ==  'years':
            sent =  Text[i-2] + ' ' + Text[i-1] +' ' + Text[i] +' '+ Text[i+1] +' ' + Text[i+2]
            
            sent = re.sub('[^0-9.]', '', sent)
            sent = re.findall(r'\d+(?:\.\d+)?|\w+', sent)
            return (sent[0])

In [5]:
data['Experience(in_years)'] = data['Resumes'].apply(expDetails)
data['Experience(in_years)'] = pd.to_numeric(data['Experience(in_years)'], errors='coerce')
data['Experience(in_years)'].fillna(0.0, inplace=True)

In [6]:
data['Word_Count'] = data['Resumes'].apply(lambda x: len(str(x).split(" ")))
data.head()

Unnamed: 0,FileName,Resumes,Experience(in_years),Word_Count
0,React JS Developer_AnjaniPriyadarshini.docx,CURRICULUM VITAE I hereby declare that the...,9.0,1144
1,Reactjs Developer_Ranga Gaganam_Musquare Techn...,Ranga Gaganam Having 1+ years of success...,1.0,270
2,React Dev_Krishna Kanth_Musquare Technologies....,Ui-Developer/ React JS Developer NAME: KRISH...,3.2,410
3,Internship_Susovan Bag_Musquare Technologies.docx,SUSOVAN BAG Seeking a challenging po...,0.0,581
4,React JS Developer_KotaniDurgaprasad[3_1] (1)-...,Kotani Durga Prasad Objective: Aspirant for...,3.1,818


In [7]:
data['Char_Count'] = data['Resumes'].str.len()
data[['Resumes','Char_Count']].head()

Unnamed: 0,Resumes,Char_Count
0,CURRICULUM VITAE I hereby declare that the...,8014
1,Ranga Gaganam Having 1+ years of success...,1626
2,Ui-Developer/ React JS Developer NAME: KRISH...,3635
3,SUSOVAN BAG Seeking a challenging po...,1847
4,Kotani Durga Prasad Objective: Aspirant for...,5384


In [8]:
nltk.download('stopwords')
stop = stopwords.words('english')
data['Stopwords'] = data['Resumes'].apply(lambda x: len([x for x in x.split() if x in stop]))

[nltk_data] Downloading package stopwords to /home/aniket/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
data['Numerics'] = data['Resumes'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

In [10]:
def remove_underscore(text):
    """This function will take text as an input and 
        returns text in lower case by removing underscore from the text"""
    text = text.lower()
    text = re.sub('_', ' ', text)
    return text
def preprocess_text(text):
    text = text.lower()
    # Replace commas with spaces
    text = re.sub(r',', ' ', text)
    # Replace parentheses with spaces
    text = text.replace('(', ' ').replace(')', ' ')
    # Replace special characters with spaces
    text = re.sub('[^a-zA-Z0-9 \n\.]', ' ', text)
    # Remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove single quotations using regular expressions
    text = re.sub(r"''", " ", text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [11]:
data["Resumes"] = data['Resumes'].apply(preprocess_text)
data['FileName'] = data['FileName'].apply(remove_underscore)

In [12]:
data['Label'] = np.where(data.FileName.str.contains("react developer"), "React JS Developer", 
                np.where(data.FileName.str.contains("react dev"), "React JS Developer",
                np.where(data.FileName.str.startswith("internship"), "SQL Developer", 
                np.where(data.FileName.str.contains("react js developer"), "React JS Developer",
                np.where(data.FileName.str.contains("reactjs developer"), "React JS Developer", 
                np.where(data.FileName.str.contains("peoplesoft"), "Peoplesoft", 
                np.where(data.FileName.str.contains("hexaware"), "Workday", 
                np.where(data.FileName.str.contains("heaware"), "Workday", "SQL Developer"))))))))

In [13]:
data['Label'].value_counts()

Label
React JS Developer    22
Workday               21
Peoplesoft            20
SQL Developer         16
Name: count, dtype: int64

In [14]:
from sklearn.preprocessing import LabelEncoder
Encoder=LabelEncoder()
data["LabelEncoding"]=Encoder.fit_transform(data ["Label"])

In [15]:
x = data['Resumes'].values
y = data['Label'].values
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=45, test_size=0.25,stratify=y)
x_train.shape, x_test.shape

((59,), (20,))

In [16]:
tfidf_vector = TfidfVectorizer(sublinear_tf=True,stop_words='english')
x_train = tfidf_vector.fit_transform(x_train)
x_test = tfidf_vector.transform(x_test)

In [17]:
# Random Forest Classifier
model_RF = RandomForestClassifier(n_estimators=200)
model_RF.fit(x_train, y_train)
y_pred = model_RF.predict(x_test)
accuracy_RF = accuracy_score(y_test, y_pred)
print('Accuracy of training set : {:.2f}'.format(model_RF.score(x_train, y_train)))
print('Accuracy of  test set    : {:.2f}'.format(model_RF.score(x_test, y_test)))
print("Classification report for classifier %s:\n%s\n" % (model_RF,classification_report(y_test, y_pred)))
nb_score = model_RF.score(x_test, y_test)
nb_cm = confusion_matrix(y_test, y_pred)


precision_RF = round(precision_score(y_test,y_pred,average = 'macro'),2)
recall_RF = round(recall_score(y_test,y_pred, average = 'macro'),2)
f1_RF = round(f1_score(y_test,y_pred, average = 'macro'),2)
accuracy_RF = round(accuracy_score(y_test,y_pred),2)

Accuracy of training set : 1.00
Accuracy of  test set    : 0.95
Classification report for classifier RandomForestClassifier(n_estimators=200):
                    precision    recall  f1-score   support

        Peoplesoft       1.00      1.00      1.00         5
React JS Developer       0.86      1.00      0.92         6
     SQL Developer       1.00      0.75      0.86         4
           Workday       1.00      1.00      1.00         5

          accuracy                           0.95        20
         macro avg       0.96      0.94      0.95        20
      weighted avg       0.96      0.95      0.95        20




In [20]:
x

array(['curriculum vitae hereby declare mentioned particular true knowledge place date anjani priyadarshini anjani priyadarshini sr web developer react personal detail dob 05 09 1985 sex female nationality indian city objective acquire position company would allow creative keep challenged various web project require employing latest trend technology employing passion multitude design work especially online design develop enable constantly learn develop varying area web development 9 year experience development web application using html 5 sas javascript frame work like angular 2 react j bootstrap materialize cs aspiring career professional organization apply varied thought fulfillment organization objective scale greater height along organization education completed ba kasturba gandhi college 2005 aggregate 6126 osmania university skill set skilled knowledgeable work experience project title quaqua product role srsoftware engineer team size 3 technology react html 5 cs 3 description qu

In [19]:
x_test

<20x3808 sparse matrix of type '<class 'numpy.float64'>'
	with 3973 stored elements in Compressed Sparse Row format>

In [26]:
import pickle
pickle.dump(model_RF,open('model.pkl','wb'))

In [27]:
pickle.dump(tfidf_vector,open('vector.pkl','wb'))