## Machine learning with plmi weighting and co-occs

This notebook will apply Naive Bayes on the co-occurences with plmi-weighting of our training set. First the PLMI score will be used instead of the frequency count. After this the PLMI score will be multiplied by the frequency count to see if this has any effect. 

In [1]:
import pickle
import random
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from sklearn import naive_bayes, metrics
from itertools import chain
from math import log
from nltk import BigramAssocMeasures

#Importing the test and train sets
with open('test_dicts.txt', 'rb') as file:
    test_lemmedreviews = pickle.load(file)
    
with open('training_dicts.txt', 'rb') as file:
    train_lemmedreviews = pickle.load(file)

In [2]:
#Creating the co-occurences
span = 3
cooccs_candidate_feature = Counter()

for p in range(1, 6):
    for sentence in train_lemmedreviews[p]:
        for i,w in enumerate(sentence):
            #check all co-occurring words within a 3 word span 
            span_range = list(range(max(i- span, 0), i)) 
            span_range.extend(range(i+1, min(i + span + 1, len(sentence)))) 
            for cw in [sentence[idx] for idx in span_range]:
                if cw != w and (cw, w) not in cooccs_candidate_feature:
                    cooccs_candidate_feature[(w, cw)] += 1


In [3]:
#Selecting all features tha occur more than 9 times
cooccs_features = dict()

for idx, (f, v) in enumerate(cooccs_candidate_feature.most_common()):
    if v == 9:
    #    print(idx, f, v)
        break

    cooccs_features[f] = idx
    
print("selected features:", len(cooccs_features))

selected features: 6845


In [4]:
def ppmi(o_11, r_1, c_1, n):
    """
    Positive Pointwise Mutual Information (Church & Hanks, 1990)
    
    PMI is also available in NLTK:
    from nltk.metrics import BigramAssocMeasures
    print BigramAssocMeasures.pmi(8, (15828, 4675), 14307668)
    """
    observed = o_11
    expected = (r_1*c_1)/n 
    if expected == 0:
        res = 0
    else: 
        if observed/expected <= 0: 
            res = 0 
        else: 
            res = log(observed/expected,2)
    return max(0, res)

def plmi(o_11, r_1, c_1, n):
    """
    Positive Local Mutual Information, useful for leveraging the 
    low-frequency bias of the PPMI
    """
    res = o_11 * ppmi(o_11, r_1, c_1, n)
    return res

In [5]:
#Creating the raw frequencies count for plmi
raw_frequencies = Counter()
for p in range(1, 6):
    for rev in train_lemmedreviews[p]:
        for w in rev:
            raw_frequencies[w] += 1

#Creating the plmi scores            
plmis_lem_surface = Counter()
N = len(cooccs_candidate_feature.values())

for k,v in cooccs_features.items():
    plmis_lem_surface[k] = plmi(v, raw_frequencies[k[0]], raw_frequencies[k[1]], N)

print(plmis_lem_surface.most_common(10))

[(('costa-NOUN', 'rica-NOUN'), 163566.0021038179), (('navitas-NOUN', 'natural-NOUN'), 157660.78858215877), (('dulce-NOUN', 'de-NOUN'), 155685.75603858862), (('duncan-ADJ', 'hines-NOUN'), 145248.5840490909), (('buffalo-NOUN', 'bill-NOUN'), 144755.08555179246), (('organic-ADJ', 'mechanically-ADV'), 141409.26818245227), (('ascorbic-NOUN', 'acid-NOUN'), 138032.22368957847), (('rodeo-NOUN', 'blend-VERB'), 135047.17208362938), (('organic-ADJ', 'separate-VERB'), 133214.8290435124), (('web-NOUN', 'site-NOUN'), 132346.67722292893)]


## Machine learning with plmi weighting instead of frequencies with co-occurences

In [6]:
#Creating the matrix with co-occurence counts
fMat = np.zeros((sum([len(v) for v in train_lemmedreviews.values()]), len(cooccs_features)))
#Creating the labels
labelsVec = np.zeros((sum([len(v) for v in train_lemmedreviews.values()])))

docId = 0
for score in range(1, 6):
    for rev in train_lemmedreviews[score]:
        labelsVec[docId] = score
        span = 3
        cooccs_rev = Counter()
        
        #Creating the co-occurences per review
        for i,w in enumerate(rev):
            #check all co-occurring words within a 3 word span 
            span_range = list(range(max(i- span, 0), i)) 
            span_range.extend(range(i+1, min(i + span + 1, len(rev)))) 
            for cw in [rev[idx] for idx in span_range]:
                if cw != w and (cw, w) not in cooccs_rev:
                    cooccs_rev[(w, cw)] += 1
        
        #Using the plmi weight
        for cooccs in list(cooccs_rev.keys()):
            if cooccs in cooccs_features:
                fMat[docId, cooccs_features[cooccs]] = plmis_lem_surface[cooccs]

        docId += 1
        
print(fMat, labelsVec)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] [1. 1. 1. ... 5. 5. 5.]


In [7]:
#Creating a model
clf = naive_bayes.MultinomialNB()
clf.fit(fMat, labelsVec)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
#Creating the matrix with co-occurence counts
testMat = np.zeros((sum([len(v) for v in test_lemmedreviews.values()]), len(cooccs_features)))
#Creating the goldStandard
goldStandard = np.zeros((sum([len(v) for v in test_lemmedreviews.values()])))

docId = 0
for score in range(1, 6):
    for rev in test_lemmedreviews[score]:
        goldStandard[docId] = score        
        span = 3
        cooccs_rev = Counter()

        #Creating the co-occurences
        for i,w in enumerate(rev):
            #check all co-occurring words within a 3 word span 
            span_range = list(range(max(i- span, 0), i)) 
            span_range.extend(range(i+1, min(i + span + 1, len(rev)))) 
            for cw in [rev[idx] for idx in span_range]:
                if cw != w and (cw, w) not in cooccs_rev:
                    cooccs_rev[(w, cw)] += 1
        
        #Using the PLMI weighting
        for cooccs in list(cooccs_rev.keys()):
            if cooccs in cooccs_features:
                testMat[docId, cooccs_features[cooccs]] = plmis_lem_surface[cooccs]

        docId += 1
        
print(testMat, goldStandard)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] [1. 1. 1. ... 5. 5. 5.]


In [31]:
#Creating the prediction
predicted = clf.predict(testMat)

In [32]:
# accuracy
print("accuracy:", metrics.accuracy_score(predicted, goldStandard))

# precision, recall and f-measure
print("precision:", metrics.precision_score(predicted, goldStandard, average='macro'))
print("recall:", metrics.recall_score(predicted, goldStandard, average='macro'))
print("f1-measure:", metrics.f1_score(predicted, goldStandard, average='macro'))

accuracy: 0.716
precision: 0.7242050504553431
recall: 0.6138217156949433
f1-measure: 0.6554062336967708


## Machine learning with plmi weighting times frequencies with co-occurences

In [6]:
#Creating the matrix with features per review
fMat = np.zeros((sum([len(v) for v in train_lemmedreviews.values()]), len(cooccs_features)))
#Creating the labels
labelsVec = np.zeros((sum([len(v) for v in train_lemmedreviews.values()])))

docId = 0
for score in range(1, 6):
    for rev in train_lemmedreviews[score]:
        labelsVec[docId] = score
        span = 3
        cooccs_rev = Counter()
        
        #Creating the co-occurences per review
        for i,w in enumerate(rev):
            #check all co-occurring words within a 3 word span 
            span_range = list(range(max(i- span, 0), i)) 
            span_range.extend(range(i+1, min(i + span + 1, len(rev)))) 
            for cw in [rev[idx] for idx in span_range]:
                if cw != w and (cw, w) not in cooccs_rev:
                    cooccs_rev[(w, cw)] += 1
        
        #Use frequency times PLMI as weighting
        for cooccs in list(cooccs_rev.keys()):
            if cooccs in cooccs_features:
                fMat[docId, cooccs_features[cooccs]] = plmis_lem_surface[cooccs]*cooccs_rev[cooccs]

        docId += 1
        
print(fMat, labelsVec)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] [1. 1. 1. ... 5. 5. 5.]


In [7]:
#Creating a model
clf = naive_bayes.MultinomialNB()
clf.fit(fMat, labelsVec)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
#Creating the matrix with features per review
testMat = np.zeros((sum([len(v) for v in test_lemmedreviews.values()]), len(cooccs_features)))
#Creating the golden standard
goldStandard = np.zeros((sum([len(v) for v in test_lemmedreviews.values()])))

docId = 0
for score in range(1, 6):
    for rev in test_lemmedreviews[score]:
        goldStandard[docId] = score
        span = 3
        cooccs_rev = Counter()
        
        #Creating the co-occurences per review
        for i,w in enumerate(rev):
            #check all co-occurring words within a 3 word span 
            span_range = list(range(max(i- span, 0), i)) 
            span_range.extend(range(i+1, min(i + span + 1, len(rev)))) 
            for cw in [rev[idx] for idx in span_range]:
                if cw != w and (cw, w) not in cooccs_rev:
                    cooccs_rev[(w, cw)] += 1
        
        #Creating the right weighting
        for cooccs in list(cooccs_rev.keys()):
            if cooccs in cooccs_features:
                testMat[docId, cooccs_features[cooccs]] = plmis_lem_surface[cooccs]*cooccs_rev[cooccs]

        docId += 1
        
print(testMat, goldStandard)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] [1. 1. 1. ... 5. 5. 5.]


In [10]:
#Create a prediction
predicted = clf.predict(testMat)

In [11]:
# accuracy
print("accuracy:", metrics.accuracy_score(predicted, goldStandard))

# precision, recall and f-measure
print("precision:", metrics.precision_score(predicted, goldStandard, average='macro'))
print("recall:", metrics.recall_score(predicted, goldStandard, average='macro'))
print("f1-measure:", metrics.f1_score(predicted, goldStandard, average='macro'))

accuracy: 0.7235
precision: 0.7276499933526859
recall: 0.6217073670575455
f1-measure: 0.6624970984160434
