## Machine learning with plmi weighting and co-occs

In [12]:
import pickle
import random
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from sklearn import naive_bayes, metrics
from itertools import chain
from math import log
from nltk import BigramAssocMeasures


with open('test_dicts.txt', 'rb') as file:
    test_lemmedreviews = pickle.load(file)
    
with open('training_dicts.txt', 'rb') as file:
    train_lemmedreviews = pickle.load(file)

In [13]:
span = 3
cooccs_candidate_feature = Counter()


for p in range(1, 6):
    for sentence in train_lemmedreviews[p]:
        for i,w in enumerate(sentence):
            #check all co-occurring words within a 3 word span 
            span_range = list(range(max(i- span, 0), i)) 
            span_range.extend(range(i+1, min(i + span + 1, len(sentence)))) 
            for cw in [sentence[idx] for idx in span_range]:
                if cw != w and (cw, w) not in cooccs_candidate_feature:
                    cooccs_candidate_feature[(w, cw)] += 1


In [34]:
cooccs_features = dict()

for idx, (f, v) in enumerate(cooccs_candidate_feature.most_common()):
    if v == 9:
    #    print(idx, f, v)
        break

    cooccs_features[f] = idx
    
print("selected features:", len(cooccs_features))

selected features: 6857


In [35]:
def ppmi(o_11, r_1, c_1, n):
    """
    Positive Pointwise Mutual Information (Church & Hanks, 1990)
    
    PMI is also available in NLTK:
    from nltk.metrics import BigramAssocMeasures
    print BigramAssocMeasures.pmi(8, (15828, 4675), 14307668)
    """
    observed = o_11
    expected = (r_1*c_1)/n 
    if expected == 0:
        res = 0
    else: 
        if observed/expected <= 0: 
            res = 0 
        else: 
            res = log(observed/expected,2)
    return max(0, res)

def plmi(o_11, r_1, c_1, n):
    """
    Positive Local Mutual Information, useful for leveraging the 
    low-frequency bias of the PPMI
    """
    res = o_11 * ppmi(o_11, r_1, c_1, n)
    return res

In [36]:
raw_frequencies = Counter()
for p in range(1, 6):
    for rev in train_lemmedreviews[p]:
        for w in rev:
            raw_frequencies[w] += 1

            
plmis_lem_surface = Counter()
N = len(cooccs_candidate_feature.values())

for k,v in cooccs_features.items():
    plmis_lem_surface[k] = plmi(v, raw_frequencies[k[0]], raw_frequencies[k[1]], N)

print(plmis_lem_surface.most_common(10))

[(('navitas-NOUN', 'natural-NOUN'), 156172.61658077373), (('melitta-NOUN', 'riviera-NOUN'), 139380.39742299216), (('organic-ADJ', 'mechanically-ADV'), 138413.2966405498), (('dulce-NOUN', 'de-NOUN'), 135875.59471103677), (('point-NOUN', 'corner-NOUN'), 135475.48794763364), (('freeze-NOUN', 'dry-VERB'), 134676.01632674874), (('rodeo-NOUN', 'blend-VERB'), 134530.31498142923), (('organic-ADJ', 'separate-VERB'), 134507.91990776765), (('stonewall-NOUN', 'kitchen-NOUN'), 132702.3624413754), (('new-NOUN', 'york-NOUN'), 131720.06898668676)]


In [37]:
fMat = np.zeros((sum([len(v) for v in train_lemmedreviews.values()]), len(cooccs_features)))
labelsVec = np.zeros((sum([len(v) for v in train_lemmedreviews.values()])))

docId = 0
for score in range(1, 6):
    for rev in train_lemmedreviews[score]:
        labelsVec[docId] = score
        
        span = 3
        cooccs_rev = Counter()

        for i,w in enumerate(rev):
            #check all co-occurring words within a 3 word span 
            span_range = list(range(max(i- span, 0), i)) 
            span_range.extend(range(i+1, min(i + span + 1, len(rev)))) 
            for cw in [rev[idx] for idx in span_range]:
                if cw != w and (cw, w) not in cooccs_rev:
                    cooccs_rev[(w, cw)] += 1
        
        
        for cooccs in list(cooccs_rev.keys()):
            if cooccs in cooccs_features:
                fMat[docId, cooccs_features[cooccs]] = plmis_lem_surface[cooccs]

        docId += 1
        
print(fMat, labelsVec)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] [1. 1. 1. ... 5. 5. 5.]


In [38]:
clf = naive_bayes.MultinomialNB()
clf.fit(fMat, labelsVec)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [39]:
testMat = np.zeros((sum([len(v) for v in test_lemmedreviews.values()]), len(cooccs_features)))
goldStandard = np.zeros((sum([len(v) for v in test_lemmedreviews.values()])))

docId = 0
for score in range(1, 6):
    for rev in test_lemmedreviews[score]:
        goldStandard[docId] = score
        
        span = 3
        cooccs_rev = Counter()

        for i,w in enumerate(rev):
            #check all co-occurring words within a 3 word span 
            span_range = list(range(max(i- span, 0), i)) 
            span_range.extend(range(i+1, min(i + span + 1, len(rev)))) 
            for cw in [rev[idx] for idx in span_range]:
                if cw != w and (cw, w) not in cooccs_rev:
                    cooccs_rev[(w, cw)] += 1
        
        
        for cooccs in list(cooccs_rev.keys()):
            if cooccs in cooccs_features:
                testMat[docId, cooccs_features[cooccs]] = plmis_lem_surface[cooccs]

        docId += 1
        
print(testMat, goldStandard)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] [1. 1. 1. ... 5. 5. 5.]


In [40]:
predicted = clf.predict(testMat)

In [41]:
# accuracy
print("accuracy:", metrics.accuracy_score(predicted, goldStandard))

# precision, recall and f-measure
print("precision:", metrics.precision_score(predicted, goldStandard, average='macro'))
print("recall:", metrics.recall_score(predicted, goldStandard, average='macro'))
print("f1-measure:", metrics.f1_score(predicted, goldStandard, average='macro'))

accuracy: 0.783
precision: 0.8400247105130649
recall: 0.6871217581079023
f1-measure: 0.7442028270758501


## Machine learning with plmi weighting x frequencies and co-occs

In [6]:
fMat = np.zeros((sum([len(v) for v in train_lemmedreviews.values()]), len(cooccs_features)))
labelsVec = np.zeros((sum([len(v) for v in train_lemmedreviews.values()])))

docId = 0
for score in range(1, 6):
    for rev in train_lemmedreviews[score]:
        labelsVec[docId] = score
        
        span = 3
        cooccs_rev = Counter()

        for i,w in enumerate(rev):
            #check all co-occurring words within a 3 word span 
            span_range = list(range(max(i- span, 0), i)) 
            span_range.extend(range(i+1, min(i + span + 1, len(rev)))) 
            for cw in [rev[idx] for idx in span_range]:
                if cw != w and (cw, w) not in cooccs_rev:
                    cooccs_rev[(w, cw)] += 1
        
        
        for cooccs in list(cooccs_rev.keys()):
            if cooccs in cooccs_features:
                fMat[docId, cooccs_features[cooccs]] += plmis_lem_surface[cooccs]

        docId += 1
        
print(fMat, labelsVec)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] [1. 1. 1. ... 5. 5. 5.]


In [7]:
clf = naive_bayes.MultinomialNB()
clf.fit(fMat, labelsVec)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
testMat = np.zeros((sum([len(v) for v in test_lemmedreviews.values()]), len(cooccs_features)))
goldStandard = np.zeros((sum([len(v) for v in test_lemmedreviews.values()])))

docId = 0
for score in range(1, 6):
    for rev in test_lemmedreviews[score]:
        goldStandard[docId] = score
        
        span = 3
        cooccs_rev = Counter()

        for i,w in enumerate(rev):
            #check all co-occurring words within a 3 word span 
            span_range = list(range(max(i- span, 0), i)) 
            span_range.extend(range(i+1, min(i + span + 1, len(rev)))) 
            for cw in [rev[idx] for idx in span_range]:
                if cw != w and (cw, w) not in cooccs_rev:
                    cooccs_rev[(w, cw)] += 1
        
        
        for cooccs in list(cooccs_rev.keys()):
            if cooccs in cooccs_features:
                testMat[docId, cooccs_features[cooccs]] += plmis_lem_surface[cooccs]

        docId += 1
        
print(testMat, goldStandard)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] [1. 1. 1. ... 5. 5. 5.]


In [9]:
predicted = clf.predict(testMat)

In [10]:
# accuracy
print("accuracy:", metrics.accuracy_score(predicted, goldStandard))

# precision, recall and f-measure
print("precision:", metrics.precision_score(predicted, goldStandard, average='macro'))
print("recall:", metrics.recall_score(predicted, goldStandard, average='macro'))
print("f1-measure:", metrics.f1_score(predicted, goldStandard, average='macro'))

accuracy: 0.783
precision: 0.8400247105130649
recall: 0.6871217581079023
f1-measure: 0.7442028270758501
