In [1]:
## Import lib for extracting training set from pdfs
import extract
import importlib
importlib.reload(extract)
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Extract the training set via the project-local `extract` module.
# NOTE(review): presumably this call also produces the *_train.txt / *_test.txt
# files read by later cells — confirm in extract.py.
# The returned value is not referenced by any other cell visible in this notebook.
train_indicies = extract.extract_words()

In [3]:
# Read each training file into one string.
# `with` closes every handle deterministically instead of leaving it to the
# garbage collector (the bare open(...).read() form leaks the file object).
with open('sum_train.txt', 'r') as fh:
    smr_content = fh.read()
with open('exp_train.txt', 'r') as fh:
    xprnc_content = fh.read()
with open('edu_train.txt', 'r') as fh:
    edct_content = fh.read()

In [4]:
def string_list(s_term, s_str):
    """Return the text segments found between consecutive ``s_term`` markers.

    ``s_str`` is scanned for occurrences of ``s_term``. For every pair of
    neighbouring occurrences, the text between them is kept when it is longer
    than 9 characters, and is returned as ``s_term + ' ' + segment``.

    Text before the first marker and text after the last marker is never
    returned, because a segment is only emitted once a following marker
    has been found.

    Parameters:
        s_term: marker string that delimits the segments; must be non-empty.
        s_str:  text to split.

    Returns:
        list of str, in order of appearance (empty if fewer than two markers).

    Raises:
        ValueError: if ``s_term`` is empty (the scan could never advance).
    """
    if not s_term:
        # str.find('') always matches at the current position, so the loop
        # below would spin forever without this guard.
        raise ValueError("s_term must be a non-empty string")

    MIN_SEGMENT_LEN = 9  # segments must be strictly longer than this
    term_len = len(s_term)
    result = []

    first_index = s_str.find(s_term)
    if first_index < 0:
        return result

    # crnt_ptr always points just past the most recently seen marker.
    crnt_ptr = first_index + term_len
    scd_term_index = s_str.find(s_term, crnt_ptr)
    while scd_term_index >= 0:
        if scd_term_index - crnt_ptr > MIN_SEGMENT_LEN:
            result.append(s_term + ' ' + s_str[crnt_ptr:scd_term_index])
        crnt_ptr = scd_term_index + term_len
        scd_term_index = s_str.find(s_term, crnt_ptr)
    return result

In [5]:
# Split each training text into its individual segments, one list per section type
sum_list, xprnc_list, edctn_list = (
    string_list(marker, text)
    for marker, text in (
        ("Summary", smr_content),
        ("Experience", xprnc_content),
        ("Education", edct_content),
    )
)

## Draft pattern: month name (full or abbreviated) followed by a number
`(jan(uary|\.|)|feb(ruary|\.|)|mar(ch|\.|)|apr(il|\.|)|may(\.|)|jun(e|\.|)|jul(y|\.|)|aug(ust|\.|)|sep(tember|\.|)|oct(ober|\.|)|nov(ember|\.|)|dec(ember|\.|))\s+\d\d*`

In [6]:
# Function for extracting vocabulary specific for a segment
import re
from nltk.stem.snowball import SnowballStemmer
def create_dictionary(topic_list):
    """Count normalised tokens over every string in ``topic_list``.

    Each string is split on any character that is not an ASCII letter or
    digit, and empty pieces are discarded. Every remaining token is
    case-folded and then mapped to:

    * ``'!year'``  if the whole token matches the year pattern (a four-digit
      19xx/20xx number, or any two-digit number optionally preceded by ``'``),
    * ``'!month'`` if the whole token is a full or abbreviated month name,
    * its English Snowball stem otherwise.

    Returns:
        dict mapping normalised token -> number of occurrences.
    """
    month_expr = re.compile(r"(jan(uary|\.|)|feb(ruary|\.|)|mar(ch|\.|)|apr(il|\.|)|may(\.|)|jun(e|\.|)|jul(y|\.|)|aug(ust|\.|)|sep(tember|\.|)|oct(ober|\.|)|nov(ember|\.|)|dec(ember|\.|))")
    year_expr = re.compile(r"((19|20)\d\d|'?\d\d)")
    stemmer = SnowballStemmer('english')
    result = dict()

    def normalise(raw_token):
        # The year check comes first, so a token matching both patterns is a year.
        token = raw_token.casefold()
        if year_expr.fullmatch(token):
            return '!year'
        if month_expr.fullmatch(token):
            return '!month'
        return stemmer.stem(token)

    for string in topic_list:
        for piece in re.split('[^0-9A-Za-z]', string):
            if piece == '':
                continue
            key = normalise(piece)
            result[key] = result.get(key, 0) + 1

    return result


In [7]:
# Build the token-count vocabulary of each segment type
sum_dct, xprnc_dct, edctn_dct = (
    create_dictionary(segments)
    for segments in (sum_list, xprnc_list, edctn_list)
)

In [8]:
# Functions for calculating term frequencies and corpus dictionary creation
def concat_dct(d1, d2):
    """Return a new dict containing the key-wise sum of ``d1`` and ``d2``.

    Keys present in only one input keep their value; keys present in both
    get the two values added. The result starts as a shallow copy of ``d1``,
    so ``d1``'s keys come first, followed by the keys only found in ``d2``.
    """
    merged = dict(d1)
    for key, count in d2.items():
        if key not in merged:
            merged[key] = count
            continue
        merged[key] += count
    return merged

def sumDictValues(dctnr):
    """Return the sum of all values stored in ``dctnr`` (0 for an empty dict)."""
    return sum(dctnr.values())

def calc_probs(dctnr):
    """Convert a dict of counts into a dict of relative frequencies.

    Every value is divided by the sum of all values (via ``sumDictValues``),
    so the returned values add up to 1 for a non-empty input. An empty input
    yields an empty dict.
    """
    tot_size = sumDictValues(dctnr)
    return {key: count / tot_size for key, count in dctnr.items()}

In [9]:
# Unite the three segment vocabularies into one corpus dictionary.
# concat_dct adds the counts of tokens that occur in more than one segment.
new_dict = concat_dct(xprnc_dct, edctn_dct)
whole_dict = concat_dct(new_dict, sum_dct)

In [10]:
# Turn raw counts into relative frequencies: one table per segment, one for the corpus
sum_prb, xprnc_prb, edctn_prb, whole_dict_prb = (
    calc_probs(counts)
    for counts in (sum_dct, xprnc_dct, edctn_dct, whole_dict)
)

In [11]:
# Test data generation: read each test file into one string.
# `with` closes every handle deterministically instead of leaking it.
with open('sum_test.txt', 'r') as fh:
    smr_test_content = fh.read()
with open('exp_test.txt', 'r') as fh:
    exp_test_content = fh.read()
with open('edu_test.txt', 'r') as fh:
    edu_test_content = fh.read()

In [12]:
def replace_with_space(matchobj):
    """Replacement callback: ignore the match object and return a single space."""
    return " "

def construct_marked_set(new_content):
    """Cut each text in ``new_content`` into labelled chunks for classification.

    Every character that is not an ASCII letter, digit or whitespace is
    replaced by a space. The text is then walked line by line (split on
    ``'\\n'``); lines are glued together with a trailing space until the
    accumulated chunk is longer than 100 characters, at which point the chunk
    is emitted. A final remainder longer than 2 characters is emitted too.

    Returns:
        2-D NumPy array with four columns per chunk:
        0 - the chunk text,
        1 - the 1-based position of the source text inside ``new_content``,
        2 - initial value 0,
        3 - initial value -3000.0.
        Each row is built from a mixed str/number list, which NumPy stores
        as strings.
    """
    CRIT_LENGTH_LIMIT = 100
    rep_pattern = re.compile(r"([^A-Za-z0-9\s])")

    def add_row(table, text, label):
        # One record = chunk text, its category, and the two placeholder columns.
        record = np.asarray([text, label, 0, -3000.0]).reshape(1,4)
        return np.append(table, record, axis=0)

    # Seed row only gives np.append a (1, 4) shape to extend; it is dropped on return.
    result = np.zeros(4).reshape(1,4)
    for category, string in enumerate(new_content, start=1):
        cleaned = re.sub(rep_pattern, replace_with_space, string)
        cumulative_str = ''
        for substr in cleaned.split('\n'):
            cumulative_str += substr + ' '
            if len(cumulative_str) > CRIT_LENGTH_LIMIT:
                result = add_row(result, cumulative_str, category)
                cumulative_str = ''
        if len(cumulative_str) > 2:
            result = add_row(result, cumulative_str, category)
    return result[1:]

In [13]:
# The position of each text in this list becomes its true category in column 1:
# 1 = summary, 2 = education, 3 = experience.
test_set = construct_marked_set([smr_test_content, edu_test_content, exp_test_content])

In [14]:
import math as m
from nltk.stem.snowball import SnowballStemmer
def calculate_likelihood(string, dct, corp_dct, prb_segment):
    """Score how well ``string`` fits one segment vocabulary.

    The string is split on whitespace. Each word is case-folded and mapped to
    ``'!year'``, ``'!month'`` or its English Snowball stem. Words missing from
    ``dct`` are ignored. For the k-th word that is present, the score grows by

        log2(prb_segment * prod_{j<=k} dct[w_j] / corp_dct[w_j])

    i.e. the log of the *running* product is added at every matched word.

    The running product is tracked as a sum of logs. Multiplying the raw
    probabilities underflows to 0.0 on long inputs, which made the original
    form fail with ``math domain error`` or ``ZeroDivisionError``.

    Parameters:
        string:      text chunk to score.
        dct:         token -> probability within the segment.
        corp_dct:    token -> probability within the whole corpus; must contain
                     every key of ``dct`` that occurs in ``string``.
        prb_segment: prior weight of the segment; must be > 0 when any word matches.

    Returns:
        float score; 0.0 when no word of ``string`` is found in ``dct``.
    """
    stemmer = SnowballStemmer('english')
    month_expr = re.compile(r"(jan(uary|\.|)|feb(ruary|\.|)|mar(ch|\.|)|apr(il|\.|)|may(\.|)|jun(e|\.|)|jul(y|\.|)|aug(ust|\.|)|sep(tember|\.|)|oct(ober|\.|)|nov(ember|\.|)|dec(ember|\.|))")
    year_expr = re.compile(r"((19|20)\d\d|'?\d\d)")
    log_ratio = 0.0  # log2 of the running product dct[w] / corp_dct[w]
    result = .0
    for word in string.split():
        wrd = word.casefold()
        if year_expr.fullmatch(wrd):
            wrd = '!year'
        elif month_expr.fullmatch(wrd):
            wrd = '!month'
        else:
            wrd = stemmer.stem(wrd)
        if wrd in dct:
            log_ratio += m.log2(dct[wrd]) - m.log2(corp_dct[wrd])
            # The prior is only evaluated on a match, so a chunk without any
            # known word still returns 0.0 without touching prb_segment.
            result += m.log2(prb_segment) + log_ratio
    return result
    
def test_algorithm(test_set, dictionaries, corpus_dict):
    """Assign to every row of ``test_set`` the best-scoring segment.

    For each count dictionary in ``dictionaries`` (numbered from 1 in list
    order) the segment prior is computed as its share of the corpus counts
    and printed. Every row's text (column 0) is then scored with
    ``calculate_likelihood``. When the score is strictly greater than the
    row's current best (column 3), the row is updated in place: column 3
    receives the score and column 2 the segment number. On a tie the earlier
    segment wins.

    Parameters:
        test_set:     rows as produced by ``construct_marked_set``; modified in place.
        dictionaries: sequence of token -> count dicts, one per segment.
        corpus_dict:  token -> count dict of the whole corpus.

    Returns:
        None.
    """
    prb_corpus = calc_probs(corpus_dict)
    corpus_size = sumDictValues(corpus_dict)
    for segment_no, a_dict in enumerate(dictionaries, start=1):
        prb_segment = sumDictValues(a_dict) / corpus_size
        print('Segment ' + repr(segment_no) + ' portion in the dictionary ' + repr(prb_segment))
        prb_dict = calc_probs(a_dict)
        for line in test_set:
            likelihood = calculate_likelihood(line[0], prb_dict, prb_corpus, prb_segment)
            # Built-in float(): the np.float alias was deprecated in NumPy 1.20
            # and removed in 1.24, where it raises AttributeError.
            if likelihood > float(line[3]):
                line[3] = likelihood
                line[2] = segment_no

In [15]:
# Dictionary order must match the category numbering used when test_set was built
# (1 = summary, 2 = education, 3 = experience). Predictions are written into test_set in place.
test_algorithm(test_set, [sum_dct, edctn_dct, xprnc_dct], whole_dict)

Segment 1 portion in the dictionary 0.16698797913359037
Segment 2 portion in the dictionary 0.14776593331821275
Segment 3 portion in the dictionary 0.6852460875481968


In [16]:
# precision, recall, F1-score
def quality_assessment(a, b, ctgr):
    """Print and return precision, recall and F1-score for one category.

    Parameters:
        a:    array of true labels (anything ``astype(int)`` accepts).
        b:    array of predicted labels, same length as ``a``.
        ctgr: integer category being assessed.

    Returns:
        tuple ``(precision, recall, f1)``.
    """
    truth = a.astype(int)
    guess = b.astype(int)
    # Keep the true label where the prediction agrees, 0 elsewhere, then
    # count the entries that equal the requested category.
    agreed_labels = (truth == guess) * truth
    b_correct = (agreed_labels == ctgr).sum()
    b_attempts = (guess == ctgr).sum()
    a_total = (truth == ctgr).sum()

    prec = b_correct / b_attempts
    rcll = b_correct / a_total
    f_one = 2 * prec * rcll / (prec + rcll)

    print('precision = %f' % prec)
    print('recall =', rcll)
    print('F1-score =', f_one)
    return prec, rcll, f_one

In [17]:
# Summary identification assessment (category 1):
# column 1 holds the true category, column 2 the predicted one.
quality_assessment(test_set[:,1], test_set[:,2], 1)

precision = 0.446809
recall = 0.477272727273
F1-score = 0.461538461538


(0.44680851063829785, 0.47727272727272729, 0.46153846153846156)

In [18]:
# Education identification assessment (category 2):
# column 1 holds the true category, column 2 the predicted one.
quality_assessment(test_set[:,1], test_set[:,2], 2)

precision = 0.775510
recall = 1.0
F1-score = 0.873563218391


(0.77551020408163263, 1.0, 0.87356321839080453)

In [19]:
# Experience identification assessment (category 3):
# column 1 holds the true category, column 2 the predicted one.
quality_assessment(test_set[:,1], test_set[:,2], 3)

precision = 0.876543
recall = 0.806818181818
F1-score = 0.840236686391


(0.87654320987654322, 0.80681818181818177, 0.84023668639053251)

In [20]:
# Confusion matrix: rows are the true categories (column 1),
# columns are the predicted categories (column 2).
print(confusion_matrix(test_set[:,1], test_set[:,2]))

[[ 21   3  20]
 [  0  38   0]
 [ 26   8 142]]


In [21]:
# Extract test set from 'other' folder.
# NOTE(review): presumably this writes test_other.txt, which the next cell reads,
# and the argument selects one PDF in that folder — confirm in extract.py.
file_number_in_folder=6
extract.extract_other_pdfs(file_number_in_folder)



In [22]:
# Test data generation: read the unlabelled CV text into one string.
# `with` closes the handle deterministically instead of leaking it.
with open('test_other.txt', 'r') as fh:
    other_test_content = fh.read()

In [23]:
# Head of selected CV.
# NOTE(review): the displayed output contains personal contact details
# (name, street address, e-mail, phone number); clear or redact it before sharing.
other_test_content[:100]

'Stefan Petranek  78 Vermont Street Rochester, NY 14609     spetrane@yahoo\n.com     (585) 256 0296   '

In [24]:
# Only one text is passed, so every row gets category 1 in column 1;
# for this unlabelled CV only the predicted column is meaningful.
test_set_other = construct_marked_set([other_test_content])

In [25]:
# Same dictionary order as for the labelled test set, so the predicted numbers
# keep their meaning (1 = summary, 2 = education, 3 = experience).
test_algorithm(test_set_other, [sum_dct, edctn_dct, xprnc_dct], whole_dict)

Segment 1 portion in the dictionary 0.16698797913359037
Segment 2 portion in the dictionary 0.14776593331821275
Segment 3 portion in the dictionary 0.6852460875481968


In [26]:
def print_recognition_results(test_set):
    """Print the first 50 characters of every chunk with its predicted section.

    Column 0 of each row is the chunk text and column 2 the predicted
    category. Categories '1', '2' and '3' are shown as Summary, Education and
    Experience. Any other value, such as the initial placeholder 0 of a row
    that was never assigned a segment, is shown as Unknown instead of being
    reported as Experience.
    """
    labels = {'1': 'Summary', '2': 'Education', '3': 'Experience'}
    for line in test_set:
        # str() lets both string-typed and integer-typed categories match.
        category = labels.get(str(line[2]), 'Unknown')
        print(line[0][:50] + '...\t\t' + category)

In [27]:
# Show the predicted section for every chunk of the unlabelled CV
print_recognition_results(test_set_other)

Stefan Petranek  78 Vermont Street Rochester  NY 1...		Experience
  EDUCATION   Rochester Institute of Technology   ...		Education
Areas of interest   photography  video  and instal...		Summary
  Bowdoin College   Brunswick  ME  A B  Biology  m...		Education
 with  Honors  in Biology  Phi Beta Kappa Society ...		Education
Study abroad coursework at Universidad Austral de ...		Education
 Tweaking the Periodic Table  Reflections on Scien...		Education
  Artist Talk    Panel discussion for  Naturally  ...		Experience
  Draft Magazine   Page 26  Issue 3  Independent S...		Education
  Contributing photographer  Published by  Rochest...		Education
  SPE Mid Atlantic Regional Conference  George Mad...		Education
  Graduate Student Scholarship   Society of Photog...		Education
  North American Nature Photography Association In...		Education
  Bowdoin College  2004   SELECTED EXHIBITIONS    ...		Education
  Video work  Spring  selected by curator Brenda L...		Summary
  RIT Faculty Show   Video 