In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from time import time

In [2]:
columns = ['','text_no_tag', 'polarity', 'length']

dftrain = pd.read_csv('csv/finaltrain.csv',
                     header = 0, 
                     usecols = [1,2], 
                     names=columns,
                     encoding ='ISO-8859-1')

In [3]:
dftrain.head()

Unnamed: 0,text_no_tag,polarity
0,"@USER @URL - aw , that ' s a bummer . you sho...",0
1,is upset that he can not update his facebook b...,0
2,@USER i dived many times for the ball . manage...,0
3,my whole body feels itchy and like its on fire,0
4,"@USER no , it ' s not behaving at all . i am m...",0


In [7]:
from sklearn.utils import shuffle

In [8]:
dftrain = shuffle(dftrain)
dftrain = dftrain[:20000]

In [9]:
tweets = []
for t in dftrain['text_no_tag']:
    #print(df_len_tweets.text[i].replace(s, ''))
    tweets.append(t)

In [10]:
str(tweets)



In [13]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

words = tokenizer.tokenize(str(tweets))

In [14]:
print(words)



In [15]:
from collections import Counter, deque
from math import log

class SOPMI:
    def __init__(self, pos_seeds, neg_seeds, words, near = 10):
        self.w_count = Counter()
        self.p_count = dict()
        self.pos_seeds = set(pos_seeds)
        self.neg_seeds = set(neg_seeds)
        window = deque()
        window_size = near*2+1
        for word in words:
            window.append(word)
            if len(window)>window_size:
                window.popleft()
            elif len(window)<window_size:
                continue
            current_word = window[near]
            if current_word in self.pos_seeds or current_word in self.neg_seeds:
                self.w_count[current_word] += 1
                if current_word not in self.p_count:
                    self.p_count[current_word] = Counter()
                for window_word in window:
                    self.p_count[current_word][window_word] += 1

    def hits(self,word):
        return self.w_count[word]+1
    
    def hits_near(self,w1,w2):
        if w1 in self.p_count:
            return self.p_count[w1][w2]+1
        else:
            return 1
     
    def so(self, word):
        score = 0
        for pos_seed in self.pos_seeds:
            score += log(self.hits_near(pos_seed,word)/self.hits(pos_seed))
        for neg_seed in self.neg_seeds:
            score -= log(self.hits_near(neg_seed,word)/self.hits(neg_seed))
        return score   

In [19]:
with open('negative-words.txt', 'r', encoding='ISO-8859-1', newline='') as f:
    neg_seeds = f.read().split()

neg_seeds = shuffle(neg_seeds)
neg_seeds = neg_seeds[:50]
print(neg_seeds)

['plague', 'slow-moving', 'died', 'irked', 'acrimony', 'forbidding', 'moronic', 'irreformable', 'crumbling', 'squash', 'scarier', 'fetid', 'detracting', 'incorrigibly', 'neurotically', 'illogically', 'agony', 'wrinkled', 'untouched', 'mordantly', 'brutality', 'hampered', 'waning', 'insincerity', 'smash', 'subvert', 'belated', 'onerously', 'unproductive', 'uncouth', 'costlier', 'irresolute', 'angrily', 'hindrance', 'leakages', 'jagged', 'bumped', 'pigs', 'corrosions', 'choke', 'disillusioned', 'throb', 'numb', 'impossiblity', 'darker', 'blurred', 'disfavor', 'mudslinger', 'intransigent', 'grapple']


In [21]:
with open('positive-words.txt', 'r', encoding='ISO-8859-1', newline='') as f:
    pos_seeds = f.read().split()
pos_seeds = shuffle(pos_seeds)
pos_seeds = pos_seeds[:50]
print(pos_seeds)
#pos_seeds = ['good','nice','excellent','positive','fortunate','correct','superior']
#neg_seeds = ['bad','nasty','poor','negative','unfortunate','wrong','inferior']

['soft', 'attractively', 'meritorious', 'excelled', 'splendor', 'unquestionable', 'savings', 'inviolable', 'pleasing', 'replaceable', 'trouble-free', 'elate', 'beckons', 'compliant', 'gladness', 'stupendous', 'admiringly', 'amicably', 'intriguing', 'merit', 'well-wishers', 'shiny', 'redeem', 'resounding', 'inspiring', 'versatility', 'resolute', 'euphoria', 'solid', 'accomplishments', 'indulgent', 'competitive', 'unabashedly', 'pampered', 'assuring', 'ethical', 'happily', 'advantage', 'blockbuster', 'overture', 'steadfastly', 'flexible', 'leading', 'astonished', 'recommendations', 'gifted', 'enterprising', 'lovably', 'inspiration', 'overtaking']


In [22]:
model = SOPMI(pos_seeds, neg_seeds, words)

In [23]:
model.so('social')

-0.6286086594223734

In [24]:
model.so('torture')

-0.6286086594223734

In [25]:
from nltk.corpus import stopwords

sw = stopwords.words('english')

In [26]:
vocab = set(words)
so_values = [(word, model.so(word)) for word in vocab if word not in pos_seeds and word not in neg_seeds and word not in sw]
so_values.sort(key = lambda x: x[1])

In [27]:
so_values[:50]

[('today', -4.0943445622221),
 ('like', -3.518980417318539),
 ('bad', -3.518980417318537),
 ('ok', -3.401197381662155),
 ('never', -3.4011973816621515),
 ('good', -3.113515309210376),
 ('friend', -3.1135153092103733),
 ('sorry', -3.1135153092103733),
 ('happy', -3.1135153092103716),
 ('could', -2.825833236758598),
 ('think', -2.7080502011022114),
 ('going', -2.7080502011022114),
 ('yeah', -2.7080502011022096),
 ('play', -2.7080502011022096),
 ('well', -2.7080502011022096),
 ('super', -2.7080502011022096),
 ('away', -2.7080502011022096),
 ('since', -2.4203681286504297),
 ('mine', -2.420368128650428),
 ('phone', -2.420368128650428),
 ('make', -2.420368128650428),
 ('birthday', -2.420368128650428),
 ('days', -2.420368128650428),
 ('still', -2.420368128650428),
 ('thanks', -2.3025850929940477),
 ('URL', -2.2380465718564775),
 ('oh', -2.1326860561986516),
 ('much', -2.0149030205422696),
 ('best', -2.014903020542266),
 ('two', -2.014903020542266),
 ('come', -2.014903020542266),
 ('new', -2.0

In [28]:
so_values[-50:]

[('check', 0.06453852113757019),
 ('nite', 0.06453852113757019),
 ('fika', 0.06453852113757019),
 ('filipino', 0.06453852113757019),
 ('positivity', 0.06453852113757019),
 ('comes', 0.06453852113757019),
 ('recovery', 0.06453852113757019),
 ('ev1', 0.06453852113757019),
 ('tweeps', 0.06453852113757019),
 ('reveals', 0.06453852113757019),
 ('thats', 0.06453852113757019),
 ('struck', 0.06453852113757019),
 ('awesum', 0.06453852113757019),
 ('change', 0.06453852113757019),
 ('small', 0.06453852113757019),
 ('ill', 0.06453852113757019),
 ('adnams', 0.06453852113757019),
 ('yea', 0.06453852113757019),
 ('unless', 0.06453852113757019),
 ('bargain', 0.06453852113757019),
 ('five', 0.06453852113757019),
 ('beat', 0.06453852113757019),
 ('go', 0.06453852113757175),
 ('night', 0.06453852113757197),
 ('link', 0.47000362924573036),
 ('welcome', 0.47000362924573036),
 ('cheers', 0.47000362924573036),
 ('pow', 0.47000362924573036),
 ('illustrator', 0.47000362924573036),
 ('spend', 0.4700036292457339

In [29]:
len(so_values)

19233

In [None]:
#[x[0] for x in my_tuples]
scores = []
for t in dftrain['text_no_tag']:
    score = 0
    for word in t.split():
        for i in range(0,len(so_values)):
            if word == so_values[i][0]:
                #print(so_values[i][1])
                score = score + so_values[i][1]
    scores.append(score)

In [None]:
print(scores)

In [85]:
len(scores)

359

In [86]:
dftrain['SO_PMI_score'] = scores

In [87]:
dftrain.head()

Unnamed: 0,text_no_tag,polarity,SO_PMI_score
0,@USER i love my kindle2 . not that the is co...,4,28.541443
1,reading my kindle2 . love it . lee childs is...,4,107.484659
2,"ok , first assesment of the kindle 2 . it fuc...",4,108.663729
3,@USER you will love your kindle2 . i have had ...,4,215.560601
4,@USER fair enough . but i have the kindle2 and...,4,48.018729


In [88]:
dftrain.tail()

Unnamed: 0,text_no_tag,polarity,SO_PMI_score
354,"after using latex a lot , any other typeset ma...",4,92.161728
355,"on that note , i hate word . i hate pages . i ...",0,113.474864
356,ah . back in a *real* text editing environme...,4,81.123489
357,"trouble in iran , i see . hmm . iran . iran so...",0,162.264101
358,reading the tweets coming out of iran . the w...,0,106.065665
