In [116]:
import numpy as np
import pandas as pd
import scipy as sc
from sklearn.metrics import classification_report

import re

In [129]:
# importing training and test data

def read_txt(path):
    doc = open(path, 'r')
    content = doc.read()
    doc.close()    
    return content
    
sum_tr = read_txt('data/LinkedIn/sum_train.txt')
exp_tr = read_txt('data/LinkedIn/exp_train.txt')
edu_tr = read_txt('data/LinkedIn/edu_train.txt')

sum_t = read_txt('data/LinkedIn/sum_test.txt')
exp_t = read_txt('data/LinkedIn/exp_test.txt')
edu_t = read_txt('data/LinkedIn/edu_test.txt')

In [130]:
# cleaning

date_pattern = re.compile(r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d\d*")
year_pattern = re.compile(r'(19|20)\d\d')

def replace_n(doc):
    return doc.replace('\nN A\n', ' %NA% ')

def replace_date(doc):
    d = date_pattern.sub('%DATE%', doc)
    d = year_pattern.sub('%YEAR%', d)
    return d

def apply_filters(doc):
    s = replace_n(doc)
    s = replace_date(s)
    return s

In [131]:
sum_tr = apply_filters(sum_tr).replace('\n', ' ')
exp_tr = apply_filters(exp_tr).replace('\n', ' ')
edu_tr = apply_filters(edu_tr).replace('\n', ' ')

sum_t = apply_filters(sum_t)
exp_t = apply_filters(exp_t)
edu_t = apply_filters(edu_t)

In [132]:
# creating dictionary (probabilities)

topics = ['smr', 'exp', 'edu']
freqs = pd.DataFrame(columns=topics)
total_freq = {}
sum_num = exp_num = edu_num = 0

def count_freqs(words, column):
    for word in words:
        if word in freqs.index:
            freqs.loc[word][column] += 1
        else:
            freqs.loc[word] = [1.0, 0.0, 0.0]
        if  word in total_freq:
            total_freq[word] += 1
        else:
            total_freq[word] = 1.0
            
splitted = sum_tr.split()
sum_num = len(splitted)
count_freqs(splitted, 'smr')

splitted = exp_tr.split()
exp_num = len(splitted)
count_freqs(splitted, 'exp')

splitted = edu_tr.split()
edu_num = len(splitted)
count_freqs(splitted, 'edu')

In [133]:
freqs['smr'] = freqs['smr'] / sum_num
freqs['edu'] = freqs['edu'] / edu_num
freqs['exp'] = freqs['exp'] / exp_num

total_sum = sum_num + exp_num + edu_num
for word in total_freq:
    total_freq[word] /= total_sum

In [153]:
# preparing tested data

_min_len = 30 # concate small rows with next
_buff = ' '

to_test = pd.DataFrame(columns=['data', 'label','prediction'])
i = 0

rows = sum_t.split('\n')
for row in rows:
    row = _buff + row
    if len(row) > _min_len:
        to_test.loc[i] = [row, 0, 0]
        i += 1
        _buff = ' '
    else:
        _buff = _buff + row

rows = exp_t.split('\n')
for row in rows:
    row = _buff + row
    if len(row) > _min_len:
        to_test.loc[i] = [row, 1, 1]
        i += 1
        _buff = ' '
    else:
        _buff = _buff + row

rows = edu_t.split('\n')
for row in rows:
    row = _buff + row
    if len(row) > _min_len:
        to_test.loc[i] = [row, 2, 2]
        i += 1
        _buff = ' '
    else:
        _buff = _buff + row

In [162]:
# classification

for index, row in to_test.iterrows():
    words = row['data'].split()
    smr = exp = edu = 0.0
    for word in words:
        if word in freqs.index:
            smr += freqs.loc[word]['smr'] / total_freq[word]
            exp += freqs.loc[word]['exp'] / total_freq[word]
            edu += freqs.loc[word]['edu'] / total_freq[word]
    mx = max(smr, exp, edu)
    if smr >= mx:
        to_test.loc[i-1]['prediction'] = 0
    if exp >= mx:
        to_test.loc[i-1]['prediction'] = 1
    if edu >= mx:
        to_test.loc[i-1]['prediction'] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [161]:
# estimation

print (classification_report(to_test.label.tolist(), to_test.prediction.tolist(), target_names=topics))

             precision    recall  f1-score   support

        smr       1.00      1.00      1.00        78
        exp       1.00      1.00      1.00       365
        edu       1.00      1.00      1.00       113

avg / total       1.00      1.00      1.00       556

