# Import the necessary libraries and dataset

In [1]:
import pandas as pd
import nltk
import re
import docx

In [2]:
# Load the labelled resume corpus; the path is relative to the notebook's working directory.
RESUME_CSV_PATH = "UpdatedResumeDataSet.csv"
resume_df = pd.read_csv(RESUME_CSV_PATH)

In [3]:
# Preview the first rows (at most 40) to eyeball the Category / Resume columns.
resume_df.head(n=40)

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."
5,Data Science,"SKILLS C Basics, IOT, Python, MATLAB, Data Sci..."
6,Data Science,Skills â¢ Python â¢ Tableau â¢ Data Visuali...
7,Data Science,Education Details \r\n B.Tech Rayat and Bahr...
8,Data Science,Personal Skills â¢ Ability to quickly grasp t...
9,Data Science,Expertise â Data and Quantitative Analysis â...


# Data Preprocessing

In [4]:
# Build the `cleaned_resume` column: strip punctuation, lowercase, drop English stop words.
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Raw string: '\s' inside a plain literal is an invalid escape sequence
# (DeprecationWarning, and a SyntaxWarning on Python 3.12+).
# Compiled once so the pattern is not looked up again for every row.
non_alnum_pattern = re.compile(r'[^a-zA-Z0-9\s]')


def clean_resume_text(text):
    """Return `text` with punctuation removed, lowercased and without stop words.

    Steps, in order:
      1. delete every character that is not an ASCII letter, a digit or whitespace
         (characters are deleted, not replaced by a space);
      2. convert to lowercase;
      3. split on whitespace, drop tokens found in `stop_words`, and re-join the
         remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw resume text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing survives the filtering.
    """
    stripped = non_alnum_pattern.sub('', text).lower()
    return ' '.join(word for word in stripped.split() if word not in stop_words)


# A single pass over the column instead of three separate `.apply` sweeps.
# fillna(''): a missing resume would otherwise reach the regex as a float NaN and
# raise TypeError; it is cleaned to an empty string instead.
resume_df['cleaned_resume'] = resume_df['Resume'].fillna('').apply(clean_resume_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Display the frame to check the new `cleaned_resume` column next to the raw `Resume` text.
resume_df

Unnamed: 0,Category,Resume,cleaned_resume
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may 2013 may 2017 uitrgpv da...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas interest deep learning control system de...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...
...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skills proficient ms office word basi...
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenges positive thinkin...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skills quick learner eagerness learn ...
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skills software knowledge mspower poi...


# Feature extraction (bag-of-words counts)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned resumes and count token occurrences per resume.
cv = CountVectorizer()
term_counts = cv.fit_transform(resume_df['cleaned_resume'])

# Densify the document-term matrix into a plain array for the downstream cells.
features = term_counts.toarray()

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    resume_df['Category'],
    test_size=0.2,
    random_state=0,
)

# Fit a multinomial Naive Bayes classifier on the training portion of the count features.
clf = MultinomialNB().fit(X_train, y_train)
clf


MultinomialNB()

# Reading a resume from a Word (.docx) file

In [8]:
# Open the Word document and gather the text of its paragraphs, one per line.
doc = docx.Document('Jone cv.docx')
paragraph_texts = (para.text for para in doc.paragraphs)

# Delete everything except ASCII letters, digits and whitespace, then show the result.
resume_text = re.sub(r'[^a-zA-Z0-9\s]', '', '\n'.join(paragraph_texts))
resume_text

'Full Name John Smith \nEmail \n Phone 123 4567890 \nAddress 123 Main St Anytown USA\nSummary Experienced Java Developer with over 5 years of experience in designing developing and testing applications using Java technologies Strong knowledge of ObjectOriented Programming and design patterns Proficient in various Java frameworks such as Spring Hibernate and Struts Passionate about software development and eager to learn new technologies\nSkills\nProgramming Languages Java SQL HTML CSS JavaScript\nFrameworks Spring Hibernate Struts\nDatabases MySQL Oracle PostgreSQL\nTools Eclipse IntelliJ IDEA Git Maven Jenkins\nOperating Systems Windows Linux Mac OS X\nExperience\nJava Developer ABC Company June 2019  Present\nDesign develop and maintain web applications using Java technologies such as Spring Hibernate and Struts\nCollaborate with team members to identify and resolve software defects\nDevelop and maintain SQL databases using MySQL and Oracle\nImplement software solutions that meet pro

# Predicting

In [9]:
# Classify the resume read from the Word file.
# The text goes through the same cleaning as the training corpus (lowercase, strip
# punctuation, drop stop words) so its tokens line up with the fitted vocabulary.
test_data = [resume_text]

# Raw string: '\s' inside a plain literal is an invalid escape sequence
# (DeprecationWarning, and a SyntaxWarning on Python 3.12+).
cleaned_test_data = [re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()) for x in test_data]
cleaned_test_data = [
    ' '.join(word for word in x.split() if word not in stop_words)
    for x in cleaned_test_data
]

# transform (not fit_transform): reuse the vocabulary already learned by `cv`.
test_features = cv.transform(cleaned_test_data).toarray()
predicted_category = clf.predict(test_features)
print(predicted_category)

['Java Developer']
