I am trying to understand neural networks, deep learning, and convolutional neural networks (CNNs).
I am designing a CNN on a resume data set to identify the job title from the job description in each resume.

# Import all required libraries

In [None]:
import numpy as np
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, merge
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import LSTM, Bidirectional
from keras.models import Model
from keras.callbacks import EarlyStopping
import gensim
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
import codecs
import matplotlib.pyplot as plt

from subprocess import check_output
# English stopword list from NLTK, held in a set so membership tests are cheap.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
stop_words = set(stopwords.words('english'))

In [None]:
EMBEDDING_DIM = 300 # word vector dimension, as in the GloVe vectors
MAX_VOCAB_SIZE = 20000 # vocabulary cap handed to the Keras tokenizer (num_words)
MAX_SEQUENCE_LENGTH = 200 # length every job description is padded/truncated to (maxlen)

# training params
batch_size = 256 # batch size passed to model.fit
num_epochs = 10 # maximum number of epochs passed to model.fit

# Load Train Data

In [None]:
# Read the resume data set; the cells below use its 'description' and 'title' columns.
data = pd.read_csv("result.csv")
# Fill missing descriptions with an empty string before casting: astype('str') alone
# turns a missing value into the literal text 'nan', which would then be tokenized
# as if it were a real word.
data['description'] = data['description'].fillna('').astype('str')
# Show the first rows as a quick sanity check.
data.head()

# Data Preprocessing

In [None]:
# Display every job description as a plain Python list.
data['description'].tolist()

In [None]:
# Split every description into word tokens (runs of \w characters).
tokenizer = RegexpTokenizer(r'\w+')
data['tokens'] = data['description'].apply(tokenizer.tokenize)
# Remove stopwords. The comparison is done on the lower-cased token so that
# capitalized stopwords ("The", "And", ...) are removed as well; the tokens
# that are kept retain their original casing.
data['tokens'] = data['tokens'].apply(lambda vec: [word for word in vec if word.lower() not in stop_words])
# Temporarily flag titles that mention both roles. The test is case-insensitive,
# matching the final labelling below, so that e.g. "data scientist / data analyst"
# is recognized as ambiguous too.
data['label'] = data['title'].map(
    lambda x: 1 if 'data scientist' in x.lower() and 'data analyst' in x.lower() else 0)
# remove samples where both data scientist and data analyst exist in the title
data.drop(data[data.label == 1].index, inplace=True)
# label data scientist to 1 and everything else to 0
# (titles that mention neither role also end up with label 0)
data['label'] = data['title'].map(lambda x: 1 if 'data scientist' in x.lower() else 0)

# shuffle the data
from sklearn.utils import shuffle
data = shuffle(data)
# Display 10 random rows to eyeball tokens and labels.
data.sample(10)

In [None]:
# Flatten the per-description token lists into one list of all words in the dataset.
all_training_words = [word for tokens in data['tokens'] for word in tokens]
# Number of tokens in each description.
training_sentence_lengths = [len(tokens) for tokens in data['tokens']]
# Sorted list of the distinct words (sorted() accepts the set directly).
TRAINING_VOCAB = sorted(set(all_training_words))

print("%s words total, with a vocabulary size of %s" %(len(all_training_words), len(TRAINING_VOCAB)))
# default=0 keeps this summary from raising ValueError when the dataset is empty.
print("Max sentence length is %s" % max(training_sentence_lengths, default=0))

# Load the pretrained GloVe vectors (word embeddings)

In [None]:
# Load the pretrained word vectors from a binary word2vec-format file.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    fname='e:/download/gensim_glove_vectors.bin',
    binary=True,
)


In [None]:
# Word-level, lower-casing Keras tokenizer capped at MAX_VOCAB_SIZE words.
tokenizer = Tokenizer(
    num_words=MAX_VOCAB_SIZE,
    lower=True,
    char_level=False,
)
# Fit on the raw descriptions, then encode the same texts as integer sequences.
descriptions = data['description'].tolist()
tokenizer.fit_on_texts(descriptions)
training_sequences = tokenizer.texts_to_sequences(descriptions)
# Word -> integer index mapping learned by the tokenizer.
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries.
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Embedding matrix with one row per word index plus one extra row (row 0 is never
# assigned below and stays all-zero).
embedding_shape = (len(train_word_index) + 1, EMBEDDING_DIM)
train_embedding_weights = np.zeros(embedding_shape)
for word, index in train_word_index.items():
    if word in word2vec:
        # Known word: copy its pretrained vector.
        vector = word2vec[word]
    else:
        # Word missing from the pretrained vectors: random values in [0, 1).
        vector = np.random.rand(EMBEDDING_DIM)
    train_embedding_weights[index, :] = vector
print(train_embedding_weights.shape)

# Define a Convolutional Neural Network following Yoon Kim's model

In [None]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index, trainable=False, extra_conv=True,
            filter_sizes=(3, 4, 5), num_filters=128, dropout_rate=0.5):
    """Build, compile and summarize a 1-D convolutional text classifier.

    Integer token sequences go through an embedding layer initialized from
    ``embeddings``. Convolution + max-pooling features are then flattened and
    passed through a 128-unit ReLU layer into a sigmoid output layer. The model
    is compiled with binary cross-entropy, the Adam optimizer and accuracy as
    metric, and its summary is printed before it is returned.

    Args:
        embeddings: initial weight matrix for the embedding layer (expected
            shape ``(num_words, embedding_dim)``).
        max_sequence_length: length of every input sequence.
        num_words: number of rows of the embedding layer.
        embedding_dim: size of each embedding vector.
        labels_index: number of units in the sigmoid output layer.
        trainable: whether the embedding weights are updated during training.
        extra_conv: if truthy, use one convolution branch per entry of
            ``filter_sizes`` and concatenate the pooled outputs along axis 1;
            otherwise use a single convolution with kernel size 3.
        filter_sizes: kernel sizes of the parallel branches (only used when
            ``extra_conv`` is truthy).
        num_filters: number of filters of every convolution layer.
        dropout_rate: rate of both dropout layers.

    Returns:
        The compiled Keras ``Model``.

    Raises:
        ValueError: if ``extra_conv`` is truthy and ``filter_sizes`` is empty.
    """
    embedding_layer = Embedding(num_words, embedding_dim, weights=[embeddings], input_length=max_sequence_length,
                                trainable=trainable)
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # Only the selected feature extractor is built; the other path would never
    # be connected to the output and would just create unused layers.
    if extra_conv:
        if not filter_sizes:
            raise ValueError("filter_sizes must contain at least one kernel size")
        branches = []
        for filter_size in filter_sizes:
            l_conv = Conv1D(filters=num_filters, kernel_size=filter_size, activation='relu')(embedded_sequences)
            branches.append(MaxPooling1D(pool_size=3)(l_conv))
        # Concatenation needs at least two inputs; a single branch is used as is.
        if len(branches) > 1:
            features = keras.layers.Concatenate(axis=1)(branches)
        else:
            features = branches[0]
    else:
        conv = Conv1D(filters=num_filters, kernel_size=3, activation='relu')(embedded_sequences)
        features = MaxPooling1D(pool_size=3)(conv)

    x = Dropout(dropout_rate)(features)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    return model

In [None]:
# Build the network with frozen embedding weights, a single sigmoid output unit
# and the multi-branch (extra_conv) feature extractor.
model = ConvNet(
    embeddings=train_embedding_weights,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    num_words=len(train_word_index) + 1,
    embedding_dim=EMBEDDING_DIM,
    labels_index=1,
    trainable=False,
    extra_conv=True,
)

In [None]:
# Targets: the 0/1 label column as an array; inputs: the padded index sequences.
y_tr = data['label'].values
x_train = train_cnn_data

# Define the callbacks

In [None]:
# Early stopping on the validation loss: min_delta of 0.01, patience of 4 epochs,
# verbose output enabled.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=4,
    min_delta=0.01,
    verbose=1,
)
callbacks_list = [early_stopping]

# Train our model

In [None]:
# Train for at most num_epochs epochs, reserving 20% of the samples for validation;
# the returned history object is kept in `hist`.
hist = model.fit(
    x=x_train,
    y=y_tr,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.2,
    shuffle=True,
    callbacks=callbacks_list,
)