In [2]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras import metrics
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.layers import Flatten

from nltk import word_tokenize, pos_tag, chunk
from pprint import pprint
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from pprint import pprint
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [3]:
# Load the full jobs dataset. header=0 consumes the file's first row as the
# header, and `names` then replaces those labels with the ones listed here.
csv_path = '../../Results/JobsDataset.csv'
column_names = ['Query', 'Job Title', 'Description']
data = pd.read_csv(csv_path, header=0, names=column_names)

In [4]:
## Data cleaning
# Pipeline, one stage per variable:
#   strip commas -> tokenize -> lowercase -> lemmatize -> drop stopwords
#   -> drop punctuation tokens -> re-join into one string per description.
# Each intermediate list keeps its name so any stage can still be inspected.

#Create Job description list
job_descriptions = [job.replace(',', '') for job in data.Description]

#Words tokenization
jobs = [word_tokenize(d) for d in job_descriptions]

#Remove Capitalization
no_capitals = [[token.lower() for token in job] for job in jobs]

#Lemmatize
lemmatizer = WordNetLemmatizer()
lem = [[lemmatizer.lemmatize(token) for token in job] for job in no_capitals]

#Remove stopwords
# Build the stopword collection once, as a set. Calling
# stopwords.words('english') inside the comprehension rebuilt the whole list
# for every single token and then scanned it linearly.
english_stopwords = set(stopwords.words('english'))
filtered_words = [
    [token for token in job if token not in english_stopwords]
    for job in lem
]

#Remove symbols
symbols = {'(', ')', '.', ',', ':', '%'}
cleaned_description = [
    [token for token in job if token not in symbols]
    for job in filtered_words
]

#Final cleaned description list
cleaned_desc = [" ".join(description) for description in cleaned_description]

#create new df
# Only the query label and the cleaned text are carried forward.
df = pd.DataFrame({'Query': list(data.Query), 'Description': cleaned_desc})


In [5]:
# Hold out 20% of the rows for the final evaluation.
# A fixed random_state puts the same rows in train and test on every run;
# without it the split changes each time the kernel is restarted.
SPLIT_SEED = 42
train, test = train_test_split(df, test_size = 0.2, random_state = SPLIT_SEED)

# Model inputs are the cleaned descriptions; the target is the search query.
train_descs = train['Description']
train_labels = train['Query']
# NOTE: switching the target to 'Job Title' would require adding that column
# to df first -- df currently holds only 'Query' and 'Description'.

test_descs = test['Description']
test_labels = test['Query']

In [10]:
# Hyper-parameters shared by the model-definition and training cells.
vocab_size = 1000
batch_size = 200
nb_epoch = 30

# One output unit per distinct query present in the training split.
num_labels = len(train_labels.unique())

# Fit the tokenizer on the training text only, then turn both splits into
# fixed-width (vocab_size columns) tf-idf matrices.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_descs)
x_train, x_test = (
    tokenizer.texts_to_matrix(texts, mode='tfidf')
    for texts in (train_descs, test_descs)
)

# Binarize the labels; the encoder learns its classes from the training split.
encoder = LabelBinarizer()
y_train = encoder.fit_transform(train_labels)
y_test = encoder.transform(test_labels)

In [7]:
# Feed-forward classifier over the tf-idf vectors:
# two 512-unit ReLU layers, each followed by 30% dropout, then a softmax
# over the label set.
model = Sequential([
    Dense(512, input_shape=(vocab_size,)),
    Activation('relu'),
    Dropout(0.3),
    Dense(512),
    Activation('relu'),
    Dropout(0.3),
    Dense(num_labels),
    Activation('softmax'),
])
model.summary()

# Both metrics are requested; the evaluation cell reads them by position
# (index 1 and index 2 of the returned scores).
model.compile(
    optimizer='adam',  # or 'sgd'
    loss='categorical_crossentropy',
    metrics=[metrics.categorical_accuracy, 'accuracy'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               512512    
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_2 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 25)                12825     
__________

In [8]:
# Train the network, keeping 10% of the training rows aside for validation.
fit_kwargs = {
    'batch_size': batch_size,
    'epochs': nb_epoch,
    'verbose': 1,
    'validation_split': 0.1,
}
history = model.fit(x_train, y_train, **fit_kwargs)

Train on 7200 samples, validate on 800 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [9]:
# Score the trained model on the held-out test split.
# `score` is [loss, categorical_accuracy, accuracy], in the order given to compile().
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

report_labels = (
    '\nTest categorical_crossentropy:',
    'Categorical accuracy:',
    'Accuracy:',
)
for position, label in enumerate(report_labels):
    print(label, score[position])



Test categorical_crossentropy: 1.8923237562179565
Categorical accuracy: 0.6169999957084655
Accuracy: 0.6169999957084655
