## Hello!
**This is my little machine.** <br>
<font color=pink>Its task is to match CVs/resumes to job/project description according to the degree of suitability.</font> <br>
<br>
[Author: Michael Christian Suhendra](https://www.youtube.com/watch?v=dQw4w9WgXcQ "Michael Christian Suhendra") <br>
# 💯🔥👌🏻😂

In [11]:
# prerequisites
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Data collection <br>

The data generator reads a .csv file as mapping, containing the file name 
of the CV, the file name of the job post, and the corresponding label, 
and populates an array for both CV/job post pair and label respectively.

Returns a numpy array with CV/job post pair (x) 
and a numpy array with labels (y).

In [35]:
import numpy as np
import os
import csv

# reads a .txt file and returns the text body
def read_file(folder, file):
    data = ''
    if os.path.isfile(os.path.join(folder, file + '.txt')):
        filename = os.path.join(folder, file)
        with open(filename + '.txt', 'r') as file:
            data = file.read().replace('\n', '')
    return data
    
# create array with .csv file for mapping as input
# TODO number of pairs with label n:
# e.g. there will be more pairs with label 5 than 2
# this transforms the entire input data into an array
# should this be stored somewhere?
def generate_data(filename):
    pairs = []
    labels = []
    
    with open(filename, 'rt') as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        next(csvReader)
        for row in csvReader:
            cv_text = read_file('cv/', row[0])
            jobpost_text = read_file('jobpost/', row[1])
            
            pairs += [[cv_text, jobpost_text]]
            labels.append(row[2])
    
    return np.array(pairs), np.array(labels)

pairs, labels = generate_data('data.csv')
# print the first 5 pairs/labels
print(pairs[:5])
print(labels[:5])

[['Hey tayo hey tayo dia bis kecil ramah' '']
 ['Karl Marx (* 5. Mai 1818 in Trier; â€\xa0 14. MÃ¤rz 1883 in London) war ein deutscher Philosoph, Ã–konom, Gesellschaftstheoretiker, politischer Journalist, Protagonist der Arbeiterbewegung sowie Kritiker des Kapitalismus und der Religion.Zusammen mit Friedrich Engels wurde er zum einflussreichsten Theoretiker des Sozialismus und Kommunismus, deren GrundzÃ¼ge die beiden in der programmatischen Schrift Manifest der Kommunistischen Partei (1848) niederlegten. Als Marxâ€™ Hauptwerk gilt Das Kapital, dessen erster Band noch zu seinen Lebzeiten im Jahr 1867 erschien; die beiden folgenden BÃ¤nde wurden posthum von Engels herausgegeben. Einflussreich waren auch seine politischen AktivitÃ¤ten in der entstehenden internationalen Arbeiterbewegung (Internationale Arbeiterassoziation), in der er zeitweise eine intellektuelle FÃ¼hrungsrolle Ã¼bernahm.Die theoretischen Grundlagen des nach Marx benannten Marxismus beeinflussen die Diskurse der Geschicht

### Text preprocessing <br> 
This module gets raw text as input,
'cleans' the text
and returns the vector representation of words in the text.

In [39]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

stop_words = set(stopwords.words('german'))
germanStemmer = SnowballStemmer('german', ignore_stopwords = True)

def cleanup_text(text):
    # remove extra spaces
    text = ' '.join(text.split())
    # remove punctuation
    text = text.translate(str.maketrans('','',string.punctuation))
    # stemming
    text = ' '.join(germanStemmer.stem(word) for word in text.split())
    # remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # lowercase
    return text.lower()

# Returns words as tokens (an array, with unique words as columns)
def tokenize(text):
    tokens = word_tokenize(text)
    for w in tokens:
        w = lemmatize(w)
    result = [i for i in tokens if not i in stop_words]
    return result

# Returns words in base form (stemming)
def lemmatize(text):
    text = germanStemmer.stem(text)
    return text

for p in pairs:
    p[0] = cleanup_text(p[0])
    p[1] = cleanup_text(p[1])
    
# print the first 5 pairs/labels
print(pairs[:5])
print(labels[:5])

[['hey tayo hey tayo dia kecil ramah' '']
 ['karl marx 5 mai 1818 trier â€ 14 mã¤rz 1883 london deutsch philosoph ã–konom gesellschaftstheoret polit journalist protagonist arbeiterbeweg sowi kritik kapitalismus religionzusamm friedrich engel wurd einflussreich theoret sozialismus kommunismus grundzã¼g beid programmat schrift manif kommunist partei 1848 niederlegt marxâ€™ hauptwerk gilt kapital erst band lebzeit jahr 1867 erschi beid folgend bã¤nde wurd posthum engel herausgegeb einflussreich polit aktivitã¤t entsteh international arbeiterbeweg international arbeiterassoziation zeitweis intellektuell fã¼hrungsroll ã¼bernahmdi theoret grundlag marx benannt marxismus beeinfluss diskurs geschichtswissenschaft soziologi wirtschaft politikwissenschaft gegenwart'
  '']
 ['' '']
 ['' '']
 ['' '']]
['4' '5' '5' '5' '1']


### Preparing train and test set <br>
The entire prepared dataset is split into train and test data. To ensure a fair experiment, the sets are generated once and will be used for all models/algorithms below. In other words, the contents of the train and test data are the same for all models. <br> However, when the program restarts, the entire dataset will be split differently.

In [None]:
epochs = 15
test_ratio = 0.2

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(pairs, labels, test_size=test_ratio, random_state=42)

### Helper methods for data analysis

In [32]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

# copied from documentation

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

def model_classification_report(y_true, y_pred, labels):
    return classification_report(y_true, y_pred, labels=labels)


### The bag of words method

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

# initiate the vectorizer object
cvec = CountVectorizer()

# the text will be tokenized inside the CountVectorizer anyway.
cv_vector_bow = cvec.fit_transform(x_train[:,0]).toarray()
jobpost_vector_bow = cvec.fit_transform(x_train[:,1]).toarray()

# classification algorithm: logistic regression
# TODO end to end training



AttributeError: 'numpy.ndarray' object has no attribute 'lower'

### GloVe

### Feed-forward Neural Network

In [None]:
from keras.models import Model, Sequential
from keras.layers import Input, Flatten, Dense, Dropout, Lambda, Conv1D, MaxPooling1D
from keras.optimizers import RMSprop
from keras import backend as K

#TODO add softmax layer in the end with 5 categories
def create_base_network(input_shape):
    input = Input(shape=input_shape)
    x = Flatten()(input)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(128, activation='relu')(x)
    return Model(input, x)



### Siamese CNN

In [None]:
#TODO maximum length from embedding layer
def create_cnn_network(embedding_length):
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(embedding_length, 1)))
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
    model.add(Dropout(0.5))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    return model

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    sqaure_pred = K.square(y_pred)
    margin_square = K.square(K.maximum(margin - y_pred, 0))
    return K.mean(y_true * sqaure_pred + (1 - y_true) * margin_square)




This is the main module of the project. <br>
The model can be chosen from the following:

    - Bag-of-Words
    - Tf-idf Vectorizer
    - Doc2Vec/GloVe
    - Feed-forward Neural Network
    - Convolutional Neural Network
    - Siamese Convolutional Neural Network
    
<br>
TODO create the models in separate files.

In [None]:
def __init__():
    print('init')

# General machine learning functions: fit the model and compute F-score.

