In [184]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [185]:
df = pd.read_csv('/content/drive/MyDrive/UpdatedResumeDataSet.csv')
df

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."
...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...
958,Testing,â Willingness to accept the challenges. â ...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne..."
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...


In [186]:
df.shape

(962, 2)

In [187]:
unique_values = pd.unique(df['Category'])
unique_values

array(['Data Science', 'HR', 'Advocate', 'Arts', 'Web Designing',
       'Mechanical Engineer', 'Sales', 'Health and fitness',
       'Civil Engineer', 'Java Developer', 'Business Analyst',
       'SAP Developer', 'Automation Testing', 'Electrical Engineering',
       'Operations Manager', 'Python Developer', 'DevOps Engineer',
       'Network Security Engineer', 'PMO', 'Database', 'Hadoop',
       'ETL Developer', 'DotNet Developer', 'Blockchain', 'Testing'],
      dtype=object)

In [188]:
counts = {}
for item in df['Category']:
  if item not in counts:
    counts[item] = 1
  else:
    counts[item] += 1
counts

{'Advocate': 20,
 'Arts': 36,
 'Automation Testing': 26,
 'Blockchain': 40,
 'Business Analyst': 28,
 'Civil Engineer': 24,
 'Data Science': 40,
 'Database': 33,
 'DevOps Engineer': 55,
 'DotNet Developer': 28,
 'ETL Developer': 40,
 'Electrical Engineering': 30,
 'HR': 44,
 'Hadoop': 42,
 'Health and fitness': 30,
 'Java Developer': 84,
 'Mechanical Engineer': 40,
 'Network Security Engineer': 25,
 'Operations Manager': 40,
 'PMO': 30,
 'Python Developer': 48,
 'SAP Developer': 24,
 'Sales': 40,
 'Testing': 70,
 'Web Designing': 45}

In [189]:
df1 = pd.DataFrame(columns=['Category', 'Counts'])
df1['Category'] = counts.keys()
df1['Counts'] = counts.values()

sns.barplot(data=df1, x='Counts', y='Category')
plt.show()

<IPython.core.display.Javascript object>

In [190]:
import re
def clean(resumeText):
  resumeText = re.sub('http\S+\s*', ' ', resumeText)
  resumeText = re.sub('[^x00-x7f]', ' ', resumeText)
  resumeText = re.sub('#+', ' ', resumeText)
  resumeText = re.sub('@\S+', '', resumeText)
  resumeText = re.sub("[!\"#\$%&\'\(\)\*\+,-\./:;<=>\?@\[\]\^_`{\|}~]", ' ', resumeText)
  resumeText = re.sub('\s+', ' ', resumeText)
  # print(resumeText)
  return resumeText

df['CleanedResume'] = [clean(sentence) for sentence in df['Resume']]
print(df['CleanedResume'])

0      Skills Programming Languages P thon pandas num...
1      Education Details Ma 2013 to Ma 2017 B E UIT R...
2      Areas of Interest Deep Learning Control S stem...
3      Skills R P thon SAP HANA Tableau SAP HANA SQL ...
4      Education Details MCA YMCAUST Faridabad Har an...
                             ...                        
957    Computer Skills Proficient in MS office Word B...
958     Willingness to accept the challenges Positive...
959    PERSONAL SKILLS Quick learner Eagerness to lea...
960    COMPUTER SKILLS SOFTWARE KNOWLEDGE MS Power Po...
961    Skill Set OS Windows XP 7 8 8 1 10 Database MY...
Name: CleanedResume, Length: 962, dtype: object


In [191]:
df

Unnamed: 0,Category,Resume,CleanedResume
0,Data Science,Skills * Programming Languages: Python (pandas...,Skills Programming Languages P thon pandas num...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,Education Details Ma 2013 to Ma 2017 B E UIT R...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",Areas of Interest Deep Learning Control S stem...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,Skills R P thon SAP HANA Tableau SAP HANA SQL ...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",Education Details MCA YMCAUST Faridabad Har an...
...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,Computer Skills Proficient in MS office Word B...
958,Testing,â Willingness to accept the challenges. â ...,Willingness to accept the challenges Positive...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",PERSONAL SKILLS Quick learner Eagerness to lea...
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,COMPUTER SKILLS SOFTWARE KNOWLEDGE MS Power Po...


# Notice that this is a supervised learning (classification) problem.

We can use different kinds classifiers to tackle this task:

Desicion Tree, Naive Bayes and KNN

In [192]:
#Label Encoding Category column

from sklearn.preprocessing import LabelEncoder
var_mod = ['Category']
le = LabelEncoder()
for i in var_mod:
  df[i] = le.fit_transform(df[i])

In [193]:
corpus = []
for sent in df['CleanedResume'].tolist(): 
  temp = [','.join(sent.split(' '))]
  corpus.append(temp)
# print(corpus)

# Pre-processing 1 #Word2Vec

In [194]:
from gensim.models import Word2Vec

# corpus = df['CleanedResume'].tolist()

# Fit Word2Vec
model = Word2Vec(sentences=corpus, size=962, window=2, min_count=5, workers=8, sg=0)
model.save("w2v.model")
print(model)

Word2Vec(vocab=96, size=962, alpha=0.025)


In [212]:
# Visualize the model

from sklearn import manifold
# %matplotlib notebook

# ## word embedding
word_labels = model.wv.vocab
vectors = pd.DataFrame(columns=word_labels)
for item in word_labels:
  # print(item)
  vectors[item] = model[item]

# print(vectors)
pca = manifold.TSNE(perplexity=40, n_components=3, init='pca')
X = pca.fit_transform(vectors)
print(X)

## create dtf
dtf_ = pd.DataFrame(X, columns=["x","y","z"])
dtf_["input"] = 0
dtf_["input"].iloc[0:1] = 1 ## plot 3d

from mpl_toolkits.mplot3d import Axes3D

import plotly.express as px
fig = px.scatter_3d(dtf_, x='x', y='y', z='z')
fig.show()


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.


The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.



[[ -45.529335     4.78171   -102.80437  ]
 [  49.29791     93.357956   -81.32874  ]
 [-104.697014    69.53335     -0.3920865]
 ...
 [ -48.506893   -24.477665    81.02308  ]
 [  27.48145     29.19241    -76.108    ]
 [  40.598137    51.269882   -38.65496  ]]




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [234]:
X_train,X_test,y_train,y_test = train_test_split(X,df['Category'].values, random_state=0, train_size=0.8)

from sklearn.multiclass import OneVsRestClassifier

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

from sklearn.neighbors import KNeighborsClassifier

neighbor = OneVsRestClassifier(KNeighborsClassifier())
neighbor.fit(X_train, y_train)
pred = neighbor.predict(X_test)

print('Training_set accuracy:', neighbor.score(X_train, y_train))
print('Testing_set accuracy:', neighbor.score(X_test, y_test))

(769, 3)
(769,)
(193, 3)
(193,)
Training_set accuracy: 0.2626788036410923
Testing_set accuracy: 0.031088082901554404


# Pre-processing 2 #TF-IDF

In [235]:
from sklearn.feature_extraction.text import TfidfVectorizer

requiredText = df['CleanedResume'].values
requiredTarget = df['Category'].values
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english',
    max_features=2000)
word_vectorizer.fit(requiredText)
WordFeatures = word_vectorizer.transform(requiredText)

X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(WordFeatures, df['Category'].values, random_state=0, train_size=0.8)
print(X_train_tf.shape)
print(y_train_tf.shape)
print(X_test_tf.shape)
print(y_test_.shape)

(769, 2000)
(769,)
(193, 2000)
(193,)


## 2.1| KNN

In [236]:
# X_traih_tf = X_train_tf.iloc[:, :3]
# y_train_tf = y_train_tf.iloc[:, :3]

neighbor_tf = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=1))
neighbor_tf.fit(X_train_tf, y_train_tf)
pred = neighbor_tf.predict(X_test_tf)

print('Training_set accuracy:', neighbor_tf.score(X_train_tf, y_train_tf))
print('Testing_set accuracy:', neighbor_tf.score(X_test_tf, y_test_tf))

Training_set accuracy: 1.0
Testing_set accuracy: 1.0


## 2.2| Decision Tree

In [238]:
from sklearn import tree
dtree = tree.DecisionTreeClassifier()
dtree.fit(X_train_tf, y_train_tf)
pred_dtree = dtree.predict(X_test_tf)

print('Training_set accuracy:', dtree.score(X_train_tf, y_train_tf))
print('Testing_set accuracy:', dtree.score(X_test_tf, y_test_tf))

Training_set accuracy: 1.0
Testing_set accuracy: 1.0
