In [105]:
import pandas as pd
import os
import nltk,re
import string
import unicodedata
import numpy as np
import scipy as sp

from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from nltk.stem import LancasterStemmer, WordNetLemmatizer,SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word

In [106]:
pd.set_option('display.max_colwidth', 100)
pd.set_option('chained_assignment',None) 


In [107]:
def get_all(folder):
    _all = os.listdir(folder)
    _all = [folder+'\\' + i for i in _all]
    return _all

def extract_folder(folder):
	text_list = []
	for _file in get_all(folder):
		stri = open(_file, 'r',encoding="utf8").read()
		# stri = " ".join(stri.split()).translate(str.maketrans('', '', string.punctuation))
		stri = " ".join(stri.split())
		text_list.append(stri)
	return text_list

def reject_list():
	return set(extract_folder('output')) - set(extract_folder('pass'))

In [108]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        # print(word)
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def remove_numbers(words):
	no_numbers = []
	for word in words:
		no_number = ''.join([i for i in word if not i.isdigit()])
		no_numbers.append(no_number)
	return no_numbers

def remove_emails(paragraph):
	regex = r"\S*@\S*\s?"
	result = re.sub(regex, "", paragraph, 0)
	return result

def normalize(words):
    words = remove_non_ascii(words)
    # words = remove_emails(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    # words = replace_numbers(words)
    words = remove_numbers(words)
    words = remove_stopwords(words)
    words = stem_words(words)
    words = lemmatize_verbs(words)
    return words

In [64]:
def tokenize_and_train(vect,X_train):
    X_train_dtm = vect.fit_transform(X_train)
    print(('Features: ', X_train_dtm.shape[1]))
    return X_train_dtm

def transform(vect,X_test):
	X_test_dtm = vect.transform(X_test)
	return X_test_dtm

def train(X_train_dtm, y_train):
    nb.fit(X_train_dtm, y_train)
    return nb

def run_test(X_test_dtm):
    y_pred_class = nb.predict(X_test_dtm)
    # print(y_pred_class)
    return y_pred_class

def metric(y_test,y_pred_class):
	from sklearn import metrics
	print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
	print('Precision: ', metrics.precision_score(y_test, y_pred_class))
	print('Recall: ', metrics.recall_score(y_test, y_pred_class))


def predict(matrix):
	result = nb.predict(matrix)
	return result


def norminalize_all(pass_df):
    for i in range(len(pass_df)):
	    a = remove_emails(pass_df["text"][i])
	    a = nltk.WhitespaceTokenizer().tokenize(a)
	    c = ' '.join(map(str, a))
	    c = nltk.tokenize.WordPunctTokenizer().tokenize(c)
	    c = normalize(c)
	    c = ' '.join(map(str, c))
	    e = nltk.WhitespaceTokenizer().tokenize(c)
	    e = ' '.join(map(str, e))
	    pass_df["text"][i] = e

In [65]:
pass_df = pd.DataFrame({"text":extract_folder('pass'),"passs":1})
reject_df = pd.DataFrame({"text":extract_folder('reject'),"passs":0})
pass_df = pass_df.append(reject_df, ignore_index=True)
new_df = pd.DataFrame({"text":extract_folder('new-resume')})

In [66]:
norminalize_all(pass_df)
norminalize_all(new_df)

In [67]:
new_df.head()

Unnamed: 0,text
0,ang run hao sebast singap cit pasir ris driv educ nanyang technolog univers bachel engin chem bi...
1,alis gao mobl email educ nat univers singap singap b sc maj stat spec fin busy stat hono distinc...
2,quek jian hong joel address minton hougang street singap e mail contact cur aug educ nat univers...
3,e mail tel jingyu zhang pres educ background nat univers singap maj msc fin engin cours stochast...
4,christopher tan educ imp colleg busy school london uk msc man recipy icb scholarship award relev...


In [109]:
feature_cols = ['text']
X = pass_df[feature_cols]
y = pass_df['passs']

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20, stratify =y, test_size=0.25)

In [111]:
from sklearn import model_selection
kf = model_selection.StratifiedKFold(n_splits=4, shuffle=True ,random_state =2)

In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [123]:
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train.text)
X_test_dtm = vect.transform(X_test.text)
X_new = new_df['text']
print((X_train_dtm.shape))
print((X_test_dtm.shape))

(45, 2930)
(16, 2930)


In [124]:
# Shape of other four feature columns
X_train.drop('text', axis=1).shape

(45, 0)

In [125]:
# Cast other feature columns to float and convert to a sparse matrix.
extra = sp.sparse.csr_matrix(X_train.drop('text', axis=1).astype(float))
extra.shape

(45, 0)

In [126]:
# Combine sparse matrices.
X_train_dtm_extra = sp.sparse.hstack((X_train_dtm, extra))
X_train_dtm_extra.shape

(45, 2930)

In [127]:
# Repeat for testing set.
extra = sp.sparse.csr_matrix(X_test.drop('text', axis=1).astype(float))
X_test_dtm_extra = sp.sparse.hstack((X_test_dtm, extra))
X_test_dtm_extra.shape

(16, 2930)

In [128]:
# Use logistic regression with text column only.
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)
metric(y_test,y_pred_class)
X_new_dtm = transform(vect,X_new)
print(logreg.predict(X_new_dtm))

('Accuracy: ', 0.6875)
('Precision: ', 0.6666666666666666)
('Recall: ', 0.5714285714285714)
[1 1 1 0 0 1 0 0 0 1 0 1 0 0]


In [129]:
#Multinominal
nb = MultinomialNB()
nb = train(X_train_dtm, y_train)
y_pred_class = run_test(X_test_dtm)
metric(y_test,y_pred_class)
X_new_dtm = transform(vect,X_new)
print(predict(X_new_dtm))

('Accuracy: ', 0.6875)
('Precision: ', 0.6666666666666666)
('Recall: ', 0.5714285714285714)
[1 0 1 1 0 1 0 1 0 1 0 1 0 0]


In [138]:
n = 0
accuracy = []
roc = []
matrix = []

print("~~~~ CROSS VALIDATION each fold ~~~ CREDIT TO ANTON ~~~~")
for train_index, test_index in kf.split(X, y):
    vect = CountVectorizer()
    X_train_dtm = vect.fit_transform(X.iloc[train_index].text)
    X_test_dtm = vect.transform(X.iloc[test_index].text)
    logreg = LogisticRegression()
    logreg.fit(X_train_dtm, y.iloc[train_index])
    y_pred_class = logreg.predict(X_test_dtm)
    X_new_dtm = transform(vect,X_new)
    prediction = logreg.predict(X_new_dtm)
    accuracy.append(np.mean(y.iloc[test_index] == logreg.predict(X_test_dtm)))
    lr_pred_proba = logreg.predict_proba(X_test_dtm)[:,1]
    matrix.append(metrics.confusion_matrix(y_true=y.iloc[test_index], y_pred=lr_pred_proba > .5))
    roc.append(metrics.roc_auc_score(y_true=y.iloc[test_index], y_score=lr_pred_proba> .5))
    n += 1
    
    print('Model {}'.format(n))
    print('ROC AUC: {}'.format(roc[n-1]))
    metric(y.iloc[test_index],y_pred_class)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')


print("~~~~ SUMMARY OF CROSS VALIDATION ~~~~")
print('Mean of Accuracy for all folds : {} '.format(np.mean(accuracy)))
print('Mean of ROC AUC: {}'.format(np.mean(roc)))
print(prediction)

~~~~ CROSS VALIDATION each fold ~~~ CREDIT TO ANTON ~~~~
Model 1
ROC AUC: 0.8015873015873016
('Accuracy: ', 0.8125)
('Precision: ', 0.8333333333333334)
('Recall: ', 0.7142857142857143)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model 2
ROC AUC: 0.6746031746031746
('Accuracy: ', 0.6875)
('Precision: ', 0.6666666666666666)
('Recall: ', 0.5714285714285714)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model 3
ROC AUC: 0.875
('Accuracy: ', 0.8666666666666667)
('Precision: ', 0.7777777777777778)
('Recall: ', 1.0)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model 4
ROC AUC: 0.7708333333333333
('Accuracy: ', 0.7857142857142857)
('Precision: ', 0.8)
('Recall: ', 0.6666666666666666)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~ SUMMARY OF CROSS VALIDATION ~~~~
Mean of Accuracy for all folds : 0.7880952380952381 
Mean of ROC AUC: 0.7805059523809523
[1 1 1 0 0 1 0 1 0 1 0 0 0 0]
