In [1]:
import re
import requests

import pandas as pd

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# Raw Data
jd_df = pd.read_csv('Bullets')
resume_df = pd.read_csv('Baseline Resume-Cleaned.csv')

In [3]:
resume_df.shape

(76, 2)

In [9]:
resume_df = resume_df.drop(['0'], axis=1)

ValueError: labels ['0'] not contained in axis

In [10]:
resume_df

Unnamed: 0,Phone:
0,Email:.@gmail.com
1,Reach me professionally at:
2,http://www.linkedin.com/pub/--mba-pmp/0/641/695
3,Tweet Me At:
4,@pmp4rpo
5,Chronological Career Progression
6,Data Scientist Student
7,General Assembly Data Science Immersive Program
8,2018
9,General Assembly provides courses in mobile an...


In [8]:
jd_df

Unnamed: 0,0
0,"Location, Location, Location - 1372 Peachtree ..."
1,Competitive compensation with incentive opport...
2,"Benefits include: Medical, Dental, Vision and ..."
3,Becoming an integral part of a winning team su...
4,Supportive leadership that recognizes and rewa...
5,Small company culture where everyone’s contrib...
6,Ability to participate in a constant offering ...
7,Casual work environment and team outings
8,"Experience joining a company at a high-growth,..."
9,Collaborate with product management and engine...


In [6]:
resume_df.shape

(76, 1)

In [7]:
resume_df

Unnamed: 0,Phone:
0,Email:.@gmail.com
1,Reach me professionally at:
2,http://www.linkedin.com/pub/--mba-pmp/0/641/695
3,Tweet Me At:
4,@pmp4rpo
5,Chronological Career Progression
6,Data Scientist Student
7,General Assembly Data Science Immersive Program
8,2018
9,General Assembly provides courses in mobile an...


In [24]:
jd_df.rename(index=str, columns={"0": "Bullets"}, inplace = True)
#jd_df = jd_df.drop(['job_descriptions'], axis=1)
jd_df['Class'] = 'Job Description'

In [33]:
jd_df.columns = ['text', 'class']

In [None]:
resume_df = pd.DataFrame(resume_df, columns=['resume'])

In [35]:
resume_df.columns = ['text', 'class']

In [36]:
# resume_df.rename(index=str, columns={"Phone: ": "Resume Desc"}, inplace = True)
# #jd_df = jd_df.drop(['job_descriptions'], axis=1)
# resume_df['Class'] = 'Baseline Resume'
# resume_df

In [37]:
combined_df = pd.concat([jd_df, resume_df], axis=0)

In [38]:
combined_df

Unnamed: 0,text,class
0,"Location, Location, Location - 1372 Peachtree ...",Job Description
1,Competitive compensation with incentive opport...,Job Description
2,"Benefits include: Medical, Dental, Vision and ...",Job Description
3,Becoming an integral part of a winning team su...,Job Description
4,Supportive leadership that recognizes and rewa...,Job Description
5,Small company culture where everyone’s contrib...,Job Description
6,Ability to participate in a constant offering ...,Job Description
7,Casual work environment and team outings,Job Description
8,"Experience joining a company at a high-growth,...",Job Description
9,Collaborate with product management and engine...,Job Description


In [46]:
combined_df['class'].value_counts(normalize=True)

Job Description    0.921162
Baseline Resume    0.078838
Name: class, dtype: float64

In [40]:
X = combined_df['text']
y = combined_df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [41]:
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)

0.9820193637621023

In [42]:
pipe.score(X_test, y_test)

0.9087136929460581

In [43]:
test_report = pd.DataFrame({'sent': X_test, 'actual': y_test, 'preds': pipe.predict(X_test)})

In [49]:
pipe.predict_proba(X_test)

array([[4.92211600e-01, 5.07788400e-01],
       [4.06937062e-02, 9.59306294e-01],
       [2.22716923e-02, 9.77728308e-01],
       [3.22916718e-01, 6.77083282e-01],
       [1.12483842e-02, 9.88751616e-01],
       [1.00902381e-01, 8.99097619e-01],
       [3.62192658e-02, 9.63780734e-01],
       [6.56973186e-02, 9.34302681e-01],
       [1.83268328e-01, 8.16731672e-01],
       [6.99524452e-03, 9.93004755e-01],
       [4.08387583e-04, 9.99591612e-01],
       [9.53265626e-03, 9.90467344e-01],
       [1.48189586e-01, 8.51810414e-01],
       [7.54003438e-02, 9.24599656e-01],
       [3.09276410e-03, 9.96907236e-01],
       [1.08580494e-03, 9.98914195e-01],
       [2.07481737e-02, 9.79251826e-01],
       [2.26126841e-02, 9.77387316e-01],
       [9.91853107e-03, 9.90081469e-01],
       [2.28163114e-02, 9.77183689e-01],
       [2.88797037e-02, 9.71120296e-01],
       [8.31080712e-04, 9.99168919e-01],
       [3.17724018e-02, 9.68227598e-01],
       [2.40940236e-01, 7.59059764e-01],
       [6.922010

In [44]:
test_report.head()

Unnamed: 0,actual,preds,sent
29,Baseline Resume,Job Description,2014-2018
417,Job Description,Job Description,data analyst in related field: 2 years
235,Job Description,Job Description,"Identifies, evaluates, and implements emerging..."
53,Baseline Resume,Job Description,Client: Cummins Diesel Engines
815,Job Description,Job Description,Analyze and model both structured and unstruct...


In [59]:
combined_df.to_csv('Combined', index=False)

In [60]:
df4 = pd.read_csv('Combined')

In [61]:
df4

Unnamed: 0,text,class
0,"Location, Location, Location - 1372 Peachtree ...",Job Description
1,Competitive compensation with incentive opport...,Job Description
2,"Benefits include: Medical, Dental, Vision and ...",Job Description
3,Becoming an integral part of a winning team su...,Job Description
4,Supportive leadership that recognizes and rewa...,Job Description
5,Small company culture where everyone’s contrib...,Job Description
6,Ability to participate in a constant offering ...,Job Description
7,Casual work environment and team outings,Job Description
8,"Experience joining a company at a high-growth,...",Job Description
9,Collaborate with product management and engine...,Job Description
