### Introduction


**Problem Statement:** Build a classifier that takes in a job description and predicts the department name for it.
*   Use a neural network model
*   Make use of a pre-trained Word Embeddings (example: Word2Vec, GloVe, etc.)
*   Calculate the accuracy on a test set (data not used to train the model)

**Problem Solving Approach:** 
_Provide a brief description of steps you followed for solving this problem_
1. In part 1 - Text preprocessing - the first step is to extract the useful fields from the raw JSON data into the required tabular format.  
2. In part 2 - EDA - this step is for finding the class distribution of the data and other useful information about the data.
3. In part 3 - Modelling and Evaluation - different models, including a neural network, are trained on the data and their accuracy is calculated.

### Part I: Text Preprocessing

_Include all text preprocessing steps, such as processing of JSON and CSV files and data cleaning, in this part._

Import the necessary packages in the cell below.

In [None]:
import json
import glob
import pandas as pd
import numpy as np

In [None]:
import matplotlib as plt

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re

nltk.download('words')
words = set(nltk.corpus.words.words())

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
from collections import defaultdict
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
plt.switch_backend('agg')

from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix


%matplotlib inline

In [None]:
# Load every job-posting JSON document into memory.
# The paths are sorted because glob returns files in an arbitrary,
# filesystem-dependent order; sorting makes the row order reproducible.
result = []
for json_path in sorted(glob.glob("data/docs/*.json")):
    with open(json_path, "rb") as infile:
        result.append(json.load(infile))

# Columns of the flat table that the documents are unpacked into.
columns = ['Document ID','job_industry','Company_des','job_description',
         'Job_Department','Industry','Other_Skills','Skills','Location','Keywords'
           ,'job_title']
df = pd.DataFrame(columns=columns)


In [None]:
# Flatten each parsed JSON document into one row of `df`.
rows = []
for doc in result:
    # A section that is absent (or null) is treated as empty, so one incomplete
    # document yields missing values instead of aborting the whole load.
    api_data = doc.get('api_data') or {}
    company_info = doc.get('company_info') or {}
    other_details = doc.get('other_details') or {}
    jd_information = doc.get('jd_information') or {}
    rows.append({
        'Document ID': doc.get('_id'),
        'job_industry': api_data.get('job_industry'),
        'Company_des': company_info.get('Company Description'),
        'Industry': other_details.get('Industry:'),
        'job_description': jd_information.get('description'),
        'Other_Skills': other_details.get('Other Skills:'),
        'Skills': other_details.get('Skills:'),
        'Job_Department': other_details.get('Department:'),
        'Location': api_data.get('job_location'),
        'Keywords': api_data.get('job_keywords'),
        'job_title': api_data.get('job_title'),
    })

# Building the frame in one go avoids growing it cell by cell.
df = pd.DataFrame(rows, columns=columns)

# sort_values returns a new frame, so the result has to be assigned; the index
# is renumbered so that row i is the i-th document in id order.
df = df.sort_values('Document ID').reset_index(drop=True)

df.head(150)

In [None]:
keys = pd.read_csv('data/document_departments.csv')


In [None]:
# Order the label table by document id and renumber its rows from zero.
keys = keys.sort_values('Document ID').reset_index(drop=True)

In [None]:
keys.head(10)

In [None]:
type(df)

In [None]:
# Attach the department label to each document by matching on the document id.
# Placing the two tables side by side pairs rows purely by position, which is
# only right when both tables happen to be in exactly the same order.
# Ids are compared as strings because the JSON `_id` and the CSV column come
# from different parsers and may not share a dtype.
# NOTE(review): confirm the ids have the same textual form in both sources.
document_columns = [c for c in df.columns if c != 'Document ID']
label_columns = [c for c in keys.columns if c != 'Document ID']
documents = df[document_columns].assign(_doc_key=df['Document ID'].astype(str))
labels = keys.assign(_doc_key=keys['Document ID'].astype(str))

# A left join from the label table keeps one row per labelled document, in the
# label table's order, with a fresh 0..n-1 index.
dataframe = labels.merge(documents, on='_doc_key', how='left')

# Same column layout as before: document fields, label columns, then the id
# taken from the label table (this also discards the temporary join key).
dataframe = dataframe[document_columns + label_columns + ['Document ID']]

In [None]:
dataframe.head(5)

### Part II: Exploratory Data Analysis

_Include EDA steps like finding distribution of Departments in this part, you may also use plots for EDA._

In [None]:
dataframe['Department'].value_counts()

In [None]:
# Remove the departments that have too few examples to learn from.
the_list = ['Safety ','Learning and Development ','Procurement','Flight Operations ','Marine Deck ','Airline Ground Operations','QA ','Marine Service Steward ','Logistics','Flight Operations','Data entry','Marine Engineering ']

# One boolean mask replaces the per-row drop loop: it does not depend on a
# hard-coded row count and can be re-run safely on an already filtered frame.
dataframe = dataframe[~dataframe['Department'].isin(the_list)]

# Renumber the rows so that label-based (.at) and position-based (.iloc)
# access refer to the same row afterwards.
dataframe = dataframe.reset_index(drop=True)

In [None]:
dataframe['Department'].value_counts()

In [None]:
dataframe.shape

In [None]:
dataframe.iloc[[2]]

In [None]:
# Collect `val_rep` copies of every row that belongs to an under-represented
# department.
# NOTE(review): in the visible cells these copies are never appended back to
# `dataframe`, so the training data is not actually oversampled -- confirm.
sampling_list = ['Public Relations ','Content','Recruitment','Engineering Design Construction']
rows_list = []
val_rep = 80
cnt=0  # not used in this cell; kept in case another cell reads it

# Walk the rows by position over the frame's real length. Looking rows up by
# index label fails for labels that were dropped earlier, and a label does not
# necessarily equal the position that .iloc expects.
departments = dataframe['Department']
for position in range(len(dataframe)):
    if departments.iloc[position] in sampling_list:
        rows_list.extend(dataframe.iloc[[position]] for _ in range(val_rep))
     

    
    



In [None]:
print(rows_list[1])

In [None]:
# rows_list holds one-row DataFrames, so they have to be stacked with concat;
# passing a list of frames to the DataFrame constructor does not produce one
# row per entry. An empty list yields an empty frame with the same columns.
if rows_list:
    sampled_df = pd.concat(rows_list, ignore_index=True)
else:
    sampled_df = pd.DataFrame(columns=dataframe.columns)

In [None]:
print(sampled_df)

In [None]:
dataframe.info()

In [None]:
dataframe.groupby('Department').count()

In [None]:
dataframe['Department'].value_counts().plot(kind = 'hist', bins= 20)

In [None]:
dataframe.plot.area(alpha = 0.5)

In [None]:
# Concatenate all free-text fields of a posting into one string per row.
# Missing fields become empty strings first; casting them straight to str
# would inject the literal words "None"/"nan" into the text.
# The exact spacing between fields is irrelevant: whitespace is collapsed when
# the text is normalised.
# NOTE(review): 'Job_Department' is the posting's own "Department:" field; if
# it matches the target label this leaks the answer into the features -- confirm.
text_columns = ['job_description', 'Job_Department', 'Other_Skills', 'Skills', 'Keywords', 'job_title']
dataframe["Sentence"] = dataframe[text_columns].fillna('').astype(str).agg(' '.join, axis=1)

In [None]:
data = dataframe[['Document ID','Sentence','Department']]
#data.head(1)

In [None]:
# Work on an independent copy so later edits never write back into `data`;
# the last line shows the number of rows.
final_data = data.copy()
len(final_data.index)

In [None]:
final_data.head(3)

In [None]:
def normalize_text(text):
    """Reduce a raw job-posting string to lower-case, space-separated words.

    Steps, in order: lower-case; delete URLs, @mentions and #hashtags; replace
    the listed punctuation and ASCII digits with spaces; delete any remaining
    Unicode digits; delete possessive "'s", dots, apostrophes, backticks,
    backslashes and the literal text "nbsp"; replace slashes, double quotes
    and line breaks/tabs with spaces; delete a fixed list of mis-decoded byte
    sequences; delete single-letter words; collapse whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with single spaces between words and no leading
        or trailing whitespace.
    """
    text = text.lower()

    # Web artefacts are removed outright.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(pic\.twitter\.com/[^\s]+))', '', text)
    text = re.sub(r'@[^\s]+', '', text)
    text = re.sub(r'#([^\s]+)', '', text)

    # Punctuation and ASCII digits become spaces so neighbouring words stay apart.
    text = re.sub(r'[:;>?<=*+()&,\-#!$%\{˜|\}\[^_\@\]1234567890’‘]', ' ', text)
    text = re.sub(r'\d', '', text)

    # Possessives have to go before bare apostrophes are deleted, otherwise
    # "company's" turns into "companys". The word boundary keeps a quoted word
    # such as 'start' intact.
    text = re.sub(r"'s\b", '', text)

    for junk in (".", "'", "`"):
        text = text.replace(junk, '')
    text = text.replace("/", ' ').replace("\"", ' ')
    text = text.replace("\\", '').replace("nbsp", '')
    text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

    # Left-overs of mis-decoded UTF-8, deleted one after another in this exact
    # order (earlier deletions change what the later entries can still match).
    mojibake = (
        "\x9d", "\x8c",
        "\xa0",
        "\x9d\x92", "\x9a\xaa\xf0\x9f\x94\xb5",
        "\xf0\x9f\x91\x8d\x87\xba\xf0\x9f\x87\xb8", "\x9f", "\x91\x8d",
        "\xf0\x9f\x87\xba\xf0\x9f\x87\xb8", "\xf0", '\xf0x9f',
        "\x9f\x91\x8d", "\x87\xba\x87\xb8",
        "\xe2\x80\x94", "\x9d\xa4", "\x96\x91",
        "\xe1\x91\xac\xc9\x8c\xce\x90\xc8\xbb\xef\xbb\x89\xd4\xbc\xef\xbb\x89\xc5\xa0\xc5\xa0\xc2\xb8",
        "\xe2\x80\x99s", "\xe2\x80\x98", "\xe2\x80\x99",
        "\xe2\x80\x9c", "\xe2\x80\x9d",
        "\xe2\x82\xac", "\xc2\xa3", "\xc2\xa0", "\xc2\xab",
        "\xf0\x9f\x94\xb4", "\xf0\x9f\x87\xba\xf0\x9f\x87\xb8\xf0\x9f",
    )
    for junk in mojibake:
        text = text.replace(junk, '')

    # Drop stray single letters, then squeeze whitespace. Every dot was already
    # deleted above, so no further dot handling is needed.
    text = re.sub(r"\b[a-z]\b", "", text)
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Turn every raw sentence into a cleaned string of unique dictionary words.
# The lemmatizer and the stopword collection do not change between rows, so
# they are created once; a set makes each stopword test a constant-time lookup.
lemmatizer = WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words('english'))

temp = []
for sentence in final_data['Sentence']:
    tokens = word_tokenize(normalize_text(sentence))
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    newtokens = [lemma for lemma in lemmas if lemma not in stopwords]

    # De-duplicate while keeping first-seen order. Going through a plain set
    # returns the words in an arbitrary order that can change between runs,
    # which makes the resulting strings irreproducible.
    newtokens = list(dict.fromkeys(newtokens))

    sent = " ".join(newtokens)
    # Keep tokens found in the NLTK English word list, plus any token that is
    # not purely alphabetic.
    sent = " ".join(w for w in nltk.wordpunct_tokenize(sent)
                    if w.lower() in words or not w.isalpha())
    temp.append(sent)
   
    

In [None]:
final_data['tokens'] = temp

In [None]:
# Distinct department labels in a fixed order. The iteration order of a set of
# strings can differ between interpreter runs, which would give the same
# department a different index each time; sorting pins the mapping down.
# key=str also lets a missing (NaN) label sort alongside the strings.
data_classes = sorted(set(dataframe['Department']), key=str)

In [None]:
len(data_classes)

In [None]:
data_classes[18]

In [None]:
final_data['Department_index'] = final_data['Department'].apply(data_classes.index)

In [None]:
# Discard the raw text and the id column in one call, then preview the result.
final_data = final_data.drop(columns=['Sentence', 'Document ID'])
final_data.head(3)


### Part III: Modelling & Evaluation

_Include all model preparation & evaluation steps in this part._

In [None]:
#my_tags = ['java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php','ios','javascript','python','c','css','android','iphone','sql','objective-c','c++','angularjs','.net']
my_tags = ['Administration','Airline Ground Operations', 'Analytics', 'Back office ticketing', 'Customer service', 'Data entry', 'Digital Marketing', 'Engineering Design Construction', 'Finance', 'Flight Operations', 'IT', 'Learning and Development ', 'Logistics', 'Maintenance', 'Management Consulting', 'Marine Deck ', 'Marine Engineering ', 'Marine Service Steward ', 'Marketing', 'Operations', 'Presales ', 'Public Relations ', 'QA ', 'Recruitment', 'Sales', 'Technology', 'Ticketing']
#plt.figure(figsize=(10,4))
print(len(my_tags))
#df.tags.value_counts().plot(kind='bar');
final_data.Department.value_counts().plot(kind='bar');

In [None]:
X = final_data.tokens
y = final_data.Department
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [None]:
# Define and compile a 1-D convolutional text classifier: an embedded token
# sequence goes through three Conv1D + MaxPooling1D stages, is flattened, and
# ends in a softmax layer with one unit per class.
# NOTE(review): MAX_SEQUENCE_LENGTH, embedding_layer and macronum are not
# defined in any visible cell, so this cell raises NameError as written --
# confirm where they are meant to come from.
# NOTE(review): no visible cell calls fit() on this model or uses `cp`, and
# `model` is rebound to a different network further down.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
# The pool size of 35 only acts as a global pool if exactly 35 time steps are
# left at this point -- presumably tuned to a specific MAX_SEQUENCE_LENGTH; verify.
l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(len(macronum), activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("Simplified convolutional neural network")
model.summary()
# Checkpoint callback that keeps only the weights with the best 'val_acc'.
cp=ModelCheckpoint('model_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)

In [None]:
# Baseline: bag-of-words counts -> tf-idf weighting -> logistic regression.
# Pipeline and TfidfTransformer are imported here because no other cell
# imports them; without these lines the cell fails with NameError.
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)


y_pred = logreg.predict(X_test)

# scikit-learn metrics take the true labels first, then the predictions.
print('accuracy %s' % accuracy_score(y_test, y_pred))

In [None]:
from gensim.models import Word2Vec

# Load pre-trained word vectors stored in word2vec text format.
# NOTE(review): this reads from "data_1/" while the other inputs live under
# "data/" -- confirm the path. The file name suggests 300-dimensional
# wiki-news vectors; verify.
wv = gensim.models.KeyedVectors.load_word2vec_format("data_1/wiki-news-300d-1M.vec", )
# Presumably normalises the vectors in place (replace=True) so that the
# normalised-vector lookups used by word_averaging are available -- verify
# against the installed gensim version.
wv.init_sims(replace=True)

In [None]:
def word_averaging(wv, words):
    """Return the unit-length mean embedding of a token list.

    Args:
        wv: Word-vector store exposing `vocab`, `syn0norm` and `vector_size`.
        words: Iterable of tokens. An entry that is already a numpy array is
            used as a vector directly; a token missing from `wv.vocab` is
            skipped.

    Returns:
        A float32 vector (mean of the found vectors, rescaled to unit length),
        or an all-zero vector of length `wv.vector_size` when no token could
        be looked up.
    """
    # Imported locally because no file-level import provides `logging`;
    # without it the empty-input branch below raises NameError.
    import logging

    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    """Stack the averaged embedding of every token list into one 2-D array.

    Each entry of `text_list` is passed to `word_averaging`; the resulting
    vectors become the rows of the returned array, in input order.
    """
    averaged_rows = [word_averaging(wv, tokens) for tokens in text_list]
    return np.vstack(averaged_rows)

In [None]:
def w2v_tokenize_text(text):
    """Split `text` into word tokens, sentence by sentence.

    Tokens shorter than two characters are left out. Returns the tokens as a
    flat list in reading order.
    """
    return [
        word
        for sentence in nltk.sent_tokenize(text, language='english')
        for word in nltk.word_tokenize(sentence, language='english')
        if len(word) >= 2
    ]
    
# 70/30 split of the cleaned table, then one averaged word vector per document.
train, test = train_test_split(final_data, random_state=42, test_size=0.3)

# Only the 'tokens' column is needed, so the tokenizer is mapped over it directly.
train_tokenized = train['tokens'].apply(w2v_tokenize_text).values
test_tokenized = test['tokens'].apply(w2v_tokenize_text).values

X_train_word_average = word_averaging_list(wv, train_tokenized)
X_test_word_average = word_averaging_list(wv, test_tokenized)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the averaged word vectors; fit() returns the fitted
# estimator, so construction and fitting are chained.
logreg = LogisticRegression(n_jobs=1, C=1e5).fit(X_train_word_average, train['Department'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(y_pred, test.Department))

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec 
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re

In [None]:
def label_sentences(corpus, label_type):
    """Wrap every document of `corpus` in a gensim TaggedDocument.

    Each document string is split on whitespace and tagged with
    "<label_type>_<i>", where i is the document's position in `corpus`
    (for example "Train_0", "Test_3").

    Returns the tagged documents as a list, in corpus order.
    """
    return [
        gensim.models.doc2vec.TaggedDocument(document.split(), [label_type + '_' + str(position)])
        for position, document in enumerate(corpus)
    ]
# Fresh 70/30 split, then tag the documents of each part for Doc2Vec.
X_train, X_test, y_train, y_test = train_test_split(
    final_data.tokens, final_data.Department, test_size=0.3, random_state=0)
X_train, X_test = label_sentences(X_train, 'Train'), label_sentences(X_test, 'Test')
# Doc2Vec is trained on the documents of both parts together.
all_data = X_train + X_test

In [None]:
# Train a distributed-bag-of-words Doc2Vec model (dm=0) on all tagged documents.
# shuffle is imported under its own name because `utils` is rebound to
# keras.utils by a later cell, which breaks `utils.shuffle` on a re-run.
from sklearn.utils import shuffle as shuffle_documents

model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

# 30 manual passes with a learning rate that drops by 0.002 per pass.
# min_alpha has to follow alpha inside the loop: if it stays at its initial
# 0.065, every pass after the first runs from the lowered alpha back up to 0.065.
for epoch in range(30):
    model_dbow.train(shuffle_documents([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

In [None]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Collect document vectors from a trained doc2vec model into one array.
    :param model: Trained Doc2Vec model; vectors are read from `model.docvecs`
    :param corpus_size: Number of documents to fetch (tags 0 .. corpus_size-1)
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Tag prefix of the documents, e.g. 'Train' or 'Test'
    :return: array of shape (corpus_size, vectors_size), one row per document
    """
    matrix = np.zeros((corpus_size, vectors_size))
    for row in range(corpus_size):
        tag = '_'.join((vectors_type, str(row)))
        matrix[row] = model.docvecs[tag]
    return matrix
    
# Gather the learned document vectors for the 'Train' and 'Test' documents;
# 300 matches the vector_size the Doc2Vec model was created with.
# NOTE(review): no later visible cell fits a classifier on these two
# matrices -- confirm whether that step is missing.
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [None]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils



In [None]:
# Hold out 20% of the rows for testing. The rows are shuffled with a fixed
# seed before splitting: taking the first 80% of the table as-is gives a split
# that depends on the table's row order rather than a random sample.
train_size = int(len(final_data) * .8)
train_posts, test_posts, train_tags, test_tags = train_test_split(
    final_data['tokens'], final_data['Department'],
    train_size=train_size, random_state=42)

# Bag-of-words vocabulary limited to the `max_words` most frequent words.
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts) # only fit on train

In [None]:

# Bag-of-words matrices, one row per document and one column per vocabulary slot.
x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)

# The label vocabulary is built from every department in the data set. Fitting
# on the training labels alone makes transform() raise on a test row whose
# department never occurs in training.
encoder = LabelEncoder()
encoder.fit(final_data['Department'])
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)
num_classes = len(encoder.classes_)

# One-hot targets for the softmax output.
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)


In [None]:
batch_size, epochs = 64, 25

# Feed-forward classifier on the bag-of-words matrix: one hidden layer of 512
# ReLU units with 50% dropout, then a softmax layer with one unit per class.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# A tenth of the training rows is held back for validation during fitting.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)

In [None]:
# Evaluate the fitted network on the held-out test matrices. `score` is later
# indexed as score[1] to report the test accuracy.
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)


In [None]:
# Plot training loss (red) against validation loss (blue) per epoch and save
# the figure next to the notebook.
# NOTE(review): the title and file name say "CNN", but `history` comes from
# the most recent model.fit call -- confirm the label matches that model.
fig1, loss_axes = plt.subplots()
loss_axes.plot(history.history['loss'], 'r', linewidth=3.0)
loss_axes.plot(history.history['val_loss'], 'b', linewidth=3.0)
loss_axes.legend(['Training loss', 'Validation Loss'], fontsize=18)
loss_axes.set_xlabel('Epochs ', fontsize=16)
loss_axes.set_ylabel('Loss', fontsize=16)
loss_axes.set_title('Loss Curves :CNN', fontsize=16)
fig1.savefig('loss_cnn.png')
plt.show()

In [None]:
print('Test accuracy:', score[1])