### SO-PMI

In [2]:
import pandas as pd

In [3]:
columns = ['id','text_final', 'polarity', 'length']

dftrain = pd.read_csv('finaltrain.csv',
                     header = 0, 
                     usecols = [1,2], 
                     names=columns,
                     encoding ='ISO-8859-1')

In [4]:
text = dftrain['text_final']

In [7]:
from nltk.tokenize import word_tokenize


In [None]:
#words = word_tokenize(text.lower())

In [12]:
words = [[word.lower() for word in t.split()] for t in text]

In [19]:
type(words)

list

In [21]:
#words_str = ''.join(words)
words_str = ''.join(str(e) for e in words)

In [22]:
type(words_str)

str

In [23]:
from collections import Counter, deque
from math import log

class SOPMI:
    def __init__(self, pos_seeds, neg_seeds, words_str, near = 10):
        self.w_count = Counter()
        self.p_count = dict()
        self.pos_seeds = set(pos_seeds)
        self.neg_seeds = set(neg_seeds)
        window = deque()
        window_size = near*2+1
        for word in words_str:
            window.append(word)
            if len(window)>window_size:
                window.popleft()
            elif len(window)<window_size:
                continue
            current_word = window[near]
            if current_word in self.pos_seeds or current_word in self.neg_seeds:
                self.w_count[current_word] += 1
                if current_word not in self.p_count:
                    self.p_count[current_word] = Counter()
                for window_word in window:
                    self.p_count[current_word][window_word] += 1

    def hits(self,word):
        return self.w_count[word]+1
    
    def hits_near(self,w1,w2):
        if w1 in self.p_count:
            return self.p_count[w1][w2]+1
        else:
            return 1
     
    def so(self, word):
        score = 0
        for pos_seed in self.pos_seeds:
            score += log(self.hits_near(pos_seed,word)/self.hits(pos_seed))
        for neg_seed in self.neg_seeds:
            score -= log(self.hits_near(neg_seed,word)/self.hits(neg_seed))
        return score   

In [24]:
pos_seeds = ['good','nice','excellent','positive','fortunate','correct','superior']
neg_seeds = ['bad','nasty','poor','negative','unfortunate','wrong','inferior']

In [25]:
print(words_str[:200])

['@user', '@url', '-', 'aw', ',', 'that', "'", 's', 'a', 'bummer', '.', 'you', 'shoulda', 'got', 'david', 'carr', 'of', 'third', 'day', 'to', 'do', 'it', '.']['is', 'upset', 'that', 'he', 'can', 'not'


In [26]:
%%time
model = SOPMI(pos_seeds, neg_seeds, words_str)

In [27]:
from nltk.corpus import stopwords

sw = stopwords.words('english')

In [28]:
vocab = set(words_str)
so_values = [(word, model.so(word)) for word in vocab if word not in pos_seeds and word not in neg_seeds and word not in sw]
so_values.sort(key = lambda x: x[1])

In [29]:
so_values[:50]

[('r', 0.0),
 ('k', 0.0),
 ('"', 0.0),
 ('½', 0.0),
 ('w', 0.0),
 ('=', 0.0),
 ('±', 0.0),
 ('µ', 0.0),
 (';', 0.0),
 ('n', 0.0),
 ('®', 0.0),
 ('+', 0.0),
 ('1', 0.0),
 ('*', 0.0),
 ('8', 0.0),
 ('§', 0.0),
 ('¨', 0.0),
 ('¿', 0.0),
 ('¼', 0.0),
 ('¤', 0.0),
 ('e', 0.0),
 ('j', 0.0),
 (':', 0.0),
 ('c', 0.0),
 ('¶', 0.0),
 ('´', 0.0),
 ('(', 0.0),
 ('p', 0.0),
 ('©', 0.0),
 ('¾', 0.0),
 ('¸', 0.0),
 ('¹', 0.0),
 ('·', 0.0),
 ('5', 0.0),
 ('$', 0.0),
 ('\\', 0.0),
 ('¦', 0.0),
 ('x', 0.0),
 ('¯', 0.0),
 ('³', 0.0),
 (')', 0.0),
 ('g', 0.0),
 ('h', 0.0),
 ('!', 0.0),
 ("'", 0.0),
 ('¢', 0.0),
 ('@', 0.0),
 ('7', 0.0),
 (' ', 0.0),
 ('¡', 0.0)]

In [30]:
so_values[-50:]

[('!', 0.0),
 ("'", 0.0),
 ('¢', 0.0),
 ('@', 0.0),
 ('7', 0.0),
 (' ', 0.0),
 ('¡', 0.0),
 ('.', 0.0),
 ('<', 0.0),
 ('l', 0.0),
 ('£', 0.0),
 (']', 0.0),
 ('|', 0.0),
 ('}', 0.0),
 ('q', 0.0),
 ('²', 0.0),
 ('&', 0.0),
 ('%', 0.0),
 ('°', 0.0),
 ('«', 0.0),
 ('4', 0.0),
 ('#', 0.0),
 ('~', 0.0),
 ('º', 0.0),
 ('-', 0.0),
 ('{', 0.0),
 ('_', 0.0),
 ('/', 0.0),
 ('â', 0.0),
 ('¬', 0.0),
 ('`', 0.0),
 ('6', 0.0),
 ('>', 0.0),
 ('ª', 0.0),
 ('z', 0.0),
 ('[', 0.0),
 ('»', 0.0),
 ('u', 0.0),
 ('?', 0.0),
 ('b', 0.0),
 ('v', 0.0),
 ('^', 0.0),
 (',', 0.0),
 ('9', 0.0),
 ('3', 0.0),
 ('2', 0.0),
 ('¥', 0.0),
 ('ã', 0.0),
 ('0', 0.0),
 ('f', 0.0)]

In [31]:
so_values

[('r', 0.0),
 ('k', 0.0),
 ('"', 0.0),
 ('½', 0.0),
 ('w', 0.0),
 ('=', 0.0),
 ('±', 0.0),
 ('µ', 0.0),
 (';', 0.0),
 ('n', 0.0),
 ('®', 0.0),
 ('+', 0.0),
 ('1', 0.0),
 ('*', 0.0),
 ('8', 0.0),
 ('§', 0.0),
 ('¨', 0.0),
 ('¿', 0.0),
 ('¼', 0.0),
 ('¤', 0.0),
 ('e', 0.0),
 ('j', 0.0),
 (':', 0.0),
 ('c', 0.0),
 ('¶', 0.0),
 ('´', 0.0),
 ('(', 0.0),
 ('p', 0.0),
 ('©', 0.0),
 ('¾', 0.0),
 ('¸', 0.0),
 ('¹', 0.0),
 ('·', 0.0),
 ('5', 0.0),
 ('$', 0.0),
 ('\\', 0.0),
 ('¦', 0.0),
 ('x', 0.0),
 ('¯', 0.0),
 ('³', 0.0),
 (')', 0.0),
 ('g', 0.0),
 ('h', 0.0),
 ('!', 0.0),
 ("'", 0.0),
 ('¢', 0.0),
 ('@', 0.0),
 ('7', 0.0),
 (' ', 0.0),
 ('¡', 0.0),
 ('.', 0.0),
 ('<', 0.0),
 ('l', 0.0),
 ('£', 0.0),
 (']', 0.0),
 ('|', 0.0),
 ('}', 0.0),
 ('q', 0.0),
 ('²', 0.0),
 ('&', 0.0),
 ('%', 0.0),
 ('°', 0.0),
 ('«', 0.0),
 ('4', 0.0),
 ('#', 0.0),
 ('~', 0.0),
 ('º', 0.0),
 ('-', 0.0),
 ('{', 0.0),
 ('_', 0.0),
 ('/', 0.0),
 ('â', 0.0),
 ('¬', 0.0),
 ('`', 0.0),
 ('6', 0.0),
 ('>', 0.0),
 ('ª', 0.0)