In [1]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras import metrics
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.layers import Flatten

from nltk import word_tokenize, pos_tag, chunk
from pprint import pprint
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from pprint import pprint
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read the full jobs dataset; row 0 of the file is consumed as the header
# and its column labels are replaced by the three names given here.
data = pd.read_csv(
    '../../Results/JobsDataset.csv',
    names=['Query', 'Job Title', 'Description'],
    header=0,
)

In [3]:
## Data cleaning
# The pipeline is kept as separate stages so each intermediate list can be
# inspected on its own: strip commas -> tokenize -> lowercase -> lemmatize ->
# drop stop words -> drop punctuation tokens -> re-join into one string.

#Create Job description list (commas removed)
job_descriptions = [job.replace(',', '') for job in data.Description]

#Words tokenization
jobs = [word_tokenize(d) for d in job_descriptions]

#Remove Capitalization
no_capitals = [[token.lower() for token in job] for job in jobs]

#Lemmatize
lemmatizer = WordNetLemmatizer()
lem = [[lemmatizer.lemmatize(token) for token in job] for job in no_capitals]

#Remove stopwords
# Load the stop-word list once and keep it in a set: the previous version
# called stopwords.words('english') for every single token and then did a
# linear scan of the returned list.
english_stopwords = set(stopwords.words('english'))
filtered_words = [
    [token for token in job if token not in english_stopwords]
    for job in lem
]

#Remove symbols
symbols = {'(', ')', '.', ',', ':', '%'}
cleaned_description = [
    [token for token in job if token not in symbols]
    for job in filtered_words
]

#Final cleaned description list
cleaned_desc = [" ".join(description) for description in cleaned_description]

#create new df
df = pd.DataFrame({'Query': list(data.Query), 'Description': cleaned_desc})


In [73]:
# Persist the cleaned descriptions (and their Query label) to disk as CSV.
df.to_csv(path_or_buf='../../Results/25_cleaned_job_descriptions.csv')

In [5]:
#Split data to train and test (80 - 20)

# A fixed random_state makes the split (and therefore every number reported
# further down) reproducible after Restart Kernel -> Run All.
train, test = train_test_split(df, test_size = 0.2, random_state = 42)

# The label is the search query; df only carries 'Query' and 'Description',
# so no other label column is available at this point.
train_descs = train['Description']
train_labels = train['Query']

test_descs = test['Description']
test_labels = test['Query']

In [6]:
##Parameters
#Encoding
# Size of the hashing space handed to one_hot(); also the input dimension of
# the Embedding layer.
vocab_size = 500
# Length every encoded description is padded to by pad_sequences(); also the
# Embedding layer's input_length.
max_length = 500

#Model
# Width of the final Dense/softmax layer.
# NOTE(review): presumably the number of distinct 'Query' values - confirm
# against the dataset.
num_labels = 25
# Output dimension of the Embedding layer. The name is misspelled
# ("dimensios") but the model cell refers to it under this exact spelling.
embedding_dimensios = 20
# Number of training epochs passed to model.fit().
nb_epoch = 30
# Mini-batch size used for both model.fit() and model.evaluate().
batch_size = 100

In [7]:
###Training Data
# Encode the jobs descriptions (each word is hashed into the vocab_size space)
encoded_docs = [one_hot(d, vocab_size) for d in train_descs]
# pad documents to a max length
x_train = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
#Binarize the job titles
# The encoder is fitted on the training labels only; its class order defines
# the meaning of every output unit of the network.
encoder = LabelBinarizer()
encoder.fit(train_labels)
y_train = encoder.transform(train_labels)


###Test Data
# Encode the jobs descriptions
encoded_docs = [one_hot(d, vocab_size) for d in test_descs]
# pad documents to a max length
x_test = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
#Binarize the job titles
# Reuse the encoder fitted on the training labels. Fitting a second encoder
# on test_labels would give y_test a different column layout (or a different
# number of columns) whenever the test split happens to miss a class.
y_test = encoder.transform(test_labels)

In [8]:
# define the model
# Embedding -> Flatten -> Dense(num_labels) -> softmax: one probability per label.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dimensios, input_length=max_length))
model.add(Flatten())
model.add(Dense(num_labels))
model.add(Activation('softmax'))
# compile the model
# The targets are one-hot rows and the output is a softmax, so the loss has to
# be categorical_crossentropy. binary_crossentropy would score each of the
# output units as an independent yes/no problem and report a misleadingly
# high accuracy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summarize the model
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 20)           10000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 25)                250025    
_________________________________________________________________
activation_1 (Activation)    (None, 25)                0         
Total params: 260,025
Trainable params: 260,025
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
# Compile with the multi-class loss. Two metric entries are requested, so
# evaluate() returns [loss, metric_1, metric_2] in that order.
model.compile(
    optimizer='adam',  # or 'sgd'
    loss='categorical_crossentropy',
    metrics=[metrics.categorical_accuracy, 'accuracy'],
)

In [10]:
# Train the network; validation_split holds out the last 10% of the training
# arrays for validation. The returned History object is kept for inspection.
fit_options = dict(
    batch_size=batch_size,
    epochs=nb_epoch,
    verbose=1,
    validation_split=0.1,
)
history = model.fit(x_train, y_train, **fit_options)

Train on 7200 samples, validate on 800 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Evaluate on the held-out test set. score is [loss, metric_1, metric_2],
# following the order of the metrics given to model.compile().
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

# The loss label previously contained stray spaces in the middle of the word.
print('\nTest categorical_crossentropy:', score[0])
print('Categorical accuracy:', score[1])
print('Accuracy:', score[2])

In [68]:
#Prediction Function

def Prediction(model, user_text, label_encoder=None):
    """Predict the best-matching job label for a piece of free text.

    The text is hashed with one_hot() into the same vocab_size space and
    padded to the same max_length as the training data, run through the
    model, and the resulting probability row is decoded back to a label.

    Parameters
    ----------
    model : object exposing ``predict(batch)``
        The trained network.
    user_text : str
        Free-text description of skills / interests.
    label_encoder : fitted encoder exposing ``inverse_transform``, optional
        Encoder used to turn the probability row back into a label. When
        omitted, a LabelBinarizer is fitted on ``train_labels`` so that the
        decoded classes line up with the targets the model was trained on.

    Returns
    -------
    The decoded label for ``user_text``.

    Notes
    -----
    Prints a short lead-in sentence before returning.
    NOTE(review): user_text is not lemmatized or stop-word filtered the way
    the training descriptions were - confirm whether that matters here.
    """
    # Encode the text
    encoded_docs = [one_hot(user_text, vocab_size)]

    # pad documents to a max length
    padded_text = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    # Prediction based on model.
    # The padded text is what must be fed to the network; the previous code
    # passed an undefined name `x` here.
    prediction = model.predict(padded_text)

    # Decode the prediction with the training label set (not the test labels):
    # the network's output columns are defined by the labels it was fitted on.
    if label_encoder is None:
        label_encoder = LabelBinarizer()
        label_encoder.fit(train_labels)
    result = label_encoder.inverse_transform(prediction)

    print("I think the best job for you is: ")

    return result[0]

In [71]:
# Example query: the returned label is displayed as the cell's output.
Prediction(model, user_text='statistics and python')

I think the best job for you is: 


'Data Analyst'