#Building and testing classification models to predict salaries from the text contained in the job descriptions

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
import pylab
import pandas as pd
from pandas import Series, DataFrame
import re
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from operator import itemgetter
from sklearn.metrics import classification_report
import csv
import os
from sklearn.feature_extraction.text import CountVectorizer

## Preprocessing

In [84]:
#Create a string of all descriptions (name it all_descripts):
jobs = pd.read_csv('Train_rev1.csv')

In [91]:
jobs=jobs[:10000]

In [92]:
descripts = jobs['FullDescription']
all_descripts = ""
for i in descripts:
    all_descripts += i + " "

In [93]:
#Make a tokenized list of substrings of only lower-case letters:
letters_only = re.sub('[^a-zA-Z]', ' ', all_descripts)
letters_only_lowered = letters_only.lower()
tokenized_all_descripts = word_tokenize(letters_only_lowered)

### Parts of Speech Tags for Each Token

In [94]:
#Make a list of pos tags:
pos_tags = nltk.pos_tag(tokenized_all_descripts)
pos_tags = pd.Series(pos_tags)
pos_tags_list = []
for i in pos_tags:
    pos_tags_list.append(i[1])

In [95]:
#top 5 most common parts of speech:
pos_tags_series = pd.Series(pos_tags_list)
most_common_pos = pos_tags_series.value_counts()
most_common_pos[:5]

NN     710292
IN     260269
NNS    203837
DT     194845
JJ     191808
dtype: int64

In [96]:
#And their frequecies in the corpus:
most_common_pos_freq = most_common_pos/len(tokenized_all_descripts)
most_common_pos_freq[:5]

NN     0.302417
IN     0.110813
NNS    0.086787
DT     0.082958
JJ     0.081665
dtype: float64

### Experimental v. Theoretical Zipf's Law

In [97]:
#Value counts of each token, and their rank:
descripts_series = pd.Series(tokenized_all_descripts)
most_common = descripts_series.value_counts()
ranked_most_common = most_common.rank(ascending=False, method='min')

In [98]:
#Theoretical value counts and ranks using Zipf's Law:
s = np.random.zipf(2, len(most_common))
s = pd.Series(s)
theoretical_counts = s.value_counts()
ranked_theoretical_counts = theoretical_counts.rank(ascending=False, method='min')

In [99]:
#Plot experimental and theoretical using log-log (creates a pop-up):
plt.scatter(np.log(ranked_most_common), np.log(most_common), color='red')
plt.scatter(np.log(ranked_theoretical_counts), np.log(theoretical_counts))
plt.xlabel('Rank')
plt.ylabel('Count')
plt.title('Theoretical(blue) v. Experimental(red)')
plt.show()

### Remove Stopwords and Lemmatize:

In [100]:
stops = stopwords.words('english')
no_stop_words = [i for i in tokenized_all_descripts if i not in stops]
#stop words gone, just need to lemmatize

In [101]:
lemmatizer = WordNetLemmatizer()
lemmatized_list = []
for i in no_stop_words:
    lemmatized_list.append(lemmatizer.lemmatize(i))
lemmatized_series = pd.Series(lemmatized_list)
lemmatized_most_common = lemmatized_series.value_counts()
lemmatized_most_common[:10]
#list of 10 most common tokens after stop word removal and lemmatization

experience    17507
work          12382
care          12064
role          11848
client        11691
team          11139
working       10049
service        9624
manager        9292
within         9249
dtype: int64

In [102]:
#And their frequencies in the corpus:
lemmatized_most_common_freq = lemmatized_most_common/len(tokenized_all_descripts)
lemmatized_most_common_freq[:10]

experience    0.007454
work          0.005272
care          0.005136
role          0.005044
client        0.004978
team          0.004743
working       0.004279
service       0.004098
manager       0.003956
within        0.003938
dtype: float64

##Classification

In [2]:
jobs = pd.read_csv('Train_rev1.csv')

In [3]:
jobs=jobs[:30000]

In [4]:
#Create a data frame of 2 columns: (1) target (2) full descriptions
df = pd.DataFrame(jobs)

In [5]:
df = df[['Id','FullDescription','SalaryNormalized']]

In [6]:
salary_percentile = pd.qcut(df['SalaryNormalized'],
                           4,
                           labels = ['0-25', '25-50', '50-75', '75-100'])

In [7]:
df['SalaryPercentile'] = salary_percentile

In [8]:
df['target'] = 0.0

In [9]:
mask = df['SalaryPercentile'] == '75-100'

In [10]:
df.ix[mask, 'target'] = 1.0

In [11]:
def lower_letters_only(string):
    return re.sub('[^a-zA-Z]', ' ', string).lower()

In [12]:
df['FullDescription'] = df['FullDescription'].map(lower_letters_only)

In [13]:
df = df.reindex(np.random.permutation(df.index))

In [14]:
df2 = df[['target', 'FullDescription']]

In [16]:
#write it to a csv:
df2.to_csv('C:/Users/Jace/Downloads/jobs_full_30000.csv', header=False, index=False)

###Split training data into 50/50 pos/neg class

In [17]:
neg_mask = df2['target'] ==0

In [18]:
df3=df2[neg_mask]

In [20]:
df3.to_csv('C:/Users/Jace/Downloads/jobs_neg_class_data.csv', header=False, index=False)

In [21]:
pos_mask = df2['target'] ==1

In [22]:
df4=df2[pos_mask]

In [24]:
df4.to_csv('C:/Users/Jace/Downloads/jobs_pos_class_data.csv', header=False, index=False)

Now we have 2 csv's, one is all positive class, one is all negative. Now, use them to make the data and labels lists:

In [25]:
jobs_pos = open('C:/Users/Jace/Downloads/jobs_pos_class_data.csv')

In [26]:
jobs_pos_data = []

In [27]:
jobs_pos_labels = []

In [28]:
csv_reader_pos = csv.reader(jobs_pos)

In [29]:
for line in csv_reader_pos:
    jobs_pos_labels.append(int(float(line[0])))
    jobs_pos_data.append(line[1])

In [30]:
jobs_pos.close()

In [31]:
jobs_neg = open('C:/Users/Jace/Downloads/jobs_neg_class_data.csv')

In [32]:
jobs_neg_data = []

In [33]:
jobs_neg_labels = []

In [34]:
csv_reader_neg = csv.reader(jobs_neg)

In [35]:
for line in csv_reader_neg:
    jobs_neg_labels.append(int(float(line[0])))
    jobs_neg_data.append(line[1])

In [36]:
jobs_neg_data = jobs_neg_data[:6918]

In [37]:
jobs_neg_labels = jobs_neg_labels[:6918]

In [38]:
jobs_neg.close()

In [39]:
jobs_data = jobs_neg_data + jobs_pos_data

In [40]:
jobs_labels = jobs_neg_labels + jobs_pos_labels

In [41]:
from random import shuffle
jobs_data_shuf = []
jobs_labels_shuf = []
index_shuf = range(len(jobs_data))
shuffle(index_shuf)
for i in index_shuf:
    jobs_data_shuf.append(jobs_data[i])
    jobs_labels_shuf.append(jobs_labels[i])

In [42]:
jobs_data = jobs_data_shuf

In [43]:
jobs_labels = jobs_labels_shuf

### Multinomial Naive Bayes Classifier 

In [44]:
#Create train and test sets (could have also used train-test split):
trainset_size = int(round(len(jobs_data)*0.60))
X_train = np.array([''.join(el) for el in jobs_data[0:trainset_size]])

In [45]:
y_train = np.array([el for el in jobs_labels[0:trainset_size]])
X_test = np.array([''.join(el) for el in jobs_data[trainset_size+1:len(jobs_data)]])
y_test = np.array([el for el in jobs_labels[trainset_size+1:len(jobs_labels)]])

In [46]:
#define vectorizer and create the DTM matrices:
vectorizer = CountVectorizer()
dtm_train = vectorizer.fit_transform(X_train)
dtm_test = vectorizer.transform(X_test)

In [47]:
#train the classifier:
nb_classifier = MultinomialNB().fit(dtm_train, y_train)

In [48]:
#run the classifier to predict target labels:
y_nb_predicted = nb_classifier.predict(dtm_test)

In [49]:
print "confusion matrix:"
metrics.confusion_matrix(y_test, y_nb_predicted, labels = np.unique(jobs_labels))

confusion matrix:


array([[2156,  628],
       [ 521, 2228]])

In [50]:
print 'Classification Accuracy:'
metrics.accuracy_score(y_test, y_nb_predicted)

Classification Accuracy:


0.79233688776432321

### Lemmatize. Then vectorize, fit and run again:

Yes, we do think that lemmatization will help the classification accuracy because it will reflect the emphasis a given description may have on a certain topic (not necesarily just one unique word). In some specific cases this may actually hurt because these different (but similar) words should be treated as unique, but we think overall it should help.

In [51]:
lemmatizer = WordNetLemmatizer()

In [52]:
def lemmatized_tokens(tokens, lemmatizer):
    lemmatized = []
    for item in tokens:
        lemmatized.append(lemmatizer.lemmatize(item))
    return lemmatized

In [53]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = lemmatized_tokens(tokens, lemmatizer)
    return stems

In [54]:
#Run vectorizer with a custom tokenizer that lemmatizes:
lem_vectorizer = CountVectorizer(tokenizer=tokenize)

In [55]:
#Create DTM matrices:
dtm_train2 = lem_vectorizer.fit_transform(X_train)

In [56]:
dtm_test2 = lem_vectorizer.transform(X_test)

In [57]:
#fit model:
nb_classifier = MultinomialNB().fit(dtm_train2, y_train)

In [58]:
#run model to predict target labels:
y_nb_predicted = nb_classifier.predict(dtm_test2)

In [59]:
metrics.accuracy_score(y_test, y_nb_predicted)

0.78492680281944693

The classification accuracy actually decreased after lemmatization. This is not what we expected. Although we did not anticipate a large increase in accuracy from lemmatization, we did anticipate some increase. Apparently the lemmatization nullified some important distinctions between similar words, whereas these distinctions were upheld in B1.

###Stopword removal:
Use the original (un-lemmatized) data

We anticipate that removing stopwords will increase the accuracy of the model because the stopwords likely do not help in predicting what salary level a given job description corresponds to. Therefore, keeping them in the model may lead to overfitting in some cases and thus produce the wrong classification.

In [60]:
#Create stopword vectorizer:
sw_vectorizer = CountVectorizer(stop_words = 'english')

In [61]:
#Create the DTM matrices using the stopword vectorizer:
dtm_train3 = sw_vectorizer.fit_transform(X_train)

In [62]:
dtm_test3 = sw_vectorizer.transform(X_test)

In [63]:
#Fit the classifier on the training DTM:
nb_classifier = MultinomialNB().fit(dtm_train3, y_train)

In [64]:
#Run classifier to predict target labels:
y_nb_predicted = nb_classifier.predict(dtm_test3)

In [65]:
metrics.accuracy_score(y_test, y_nb_predicted)

0.79504789445147295

Stopword removal helped a little bit here, but not by much. The accuracy improved by about .4 percent over the original model.

### Parts of Speech Bigrams

In [72]:
def get_tokens_and_pos_bigrams(string):
    '''This function accepts a string and returns a list 
    of the pos_bigrams of tokens in the string.'''
    
    tokenized = word_tokenize(string)
    pos_tuples = nltk.pos_tag(tokenized)
    pos_tags = []
    for (word, tag) in pos_tuples:
        pos_tags.append(tag)
    pos_bigrams = list(nltk.bigrams(pos_tags))
    return tokenized + pos_bigrams

In [74]:
pos_bigram_vectorizer = CountVectorizer(tokenizer = get_tokens_and_pos_bigrams)

In [76]:
dtm_train4 = pos_bigram_vectorizer.fit_transform(X_train)

In [79]:
dtm_test4 = pos_bigram_vectorizer.transform(X_test)

In [80]:
nb_classifier = MultinomialNB().fit(dtm_train4, y_train)

In [81]:
y_nb_predicted = nb_classifier.predict(dtm_test4)

In [82]:
metrics.accuracy_score(y_test, y_nb_predicted)

0.7869148743900235

Including the parts of speech bigrams did not help the accuracy of our model.