In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from time import time

In [2]:
columns = ['','text_no_tag', 'polarity', 'length']

dftrain = pd.read_csv('csv/finaltrain.csv',
                     header = 0, 
                     usecols = [1,2], 
                     names=columns,
                     encoding ='ISO-8859-1')

In [3]:
dftrain.head()

Unnamed: 0,text_no_tag,polarity
0,"@USER @URL - aw , that ' s a bummer . you sho...",0
1,is upset that he can not update his facebook b...,0
2,@USER i dived many times for the ball . manage...,0
3,my whole body feels itchy and like its on fire,0
4,"@USER no , it ' s not behaving at all . i am m...",0


In [4]:
from sklearn.utils import shuffle

In [5]:
dftrain = shuffle(dftrain)
dftrain = dftrain[:20000]

In [6]:
tweets = []
for t in dftrain['text_no_tag']:
    tweets.append(t)

In [8]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

words = tokenizer.tokenize(str(tweets))

In [10]:
from collections import Counter, deque
from math import log

class SOPMI:
    def __init__(self, pos_seeds, neg_seeds, words, near = 5):
        self.w_count = Counter()
        self.p_count = dict()
        self.pos_seeds = set(pos_seeds)
        self.neg_seeds = set(neg_seeds)
        window = deque()
        window_size = near*2+1
        for word in words:
            window.append(word)
            if len(window)>window_size:
                window.popleft()
            elif len(window)<window_size:
                continue
            current_word = window[near]
            if current_word in self.pos_seeds or current_word in self.neg_seeds:
                self.w_count[current_word] += 1
                if current_word not in self.p_count:
                    self.p_count[current_word] = Counter()
                for window_word in window:
                    self.p_count[current_word][window_word] += 1

    def hits(self,word):
        return self.w_count[word]+1
    
    def hits_near(self,w1,w2):
        if w1 in self.p_count:
            return self.p_count[w1][w2]+1
        else:
            return 1
     
    def so(self, word):
        score = 0
        for pos_seed in self.pos_seeds:
            score += log(self.hits_near(pos_seed,word)/self.hits(pos_seed))
        for neg_seed in self.neg_seeds:
            score -= log(self.hits_near(neg_seed,word)/self.hits(neg_seed))
        return score   

In [12]:
pos_seeds = ['good','nice','excellent','positive','fortunate','correct','superior']
neg_seeds = ['bad','nasty','poor','negative','unfortunate','wrong','inferior']
print(pos_seeds)

['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior']


In [13]:
model = SOPMI(pos_seeds, neg_seeds, words)

In [16]:
from nltk.corpus import stopwords

sw = stopwords.words('english')

In [17]:
vocab = set(words)
so_values = [(word, model.so(word)) for word in vocab if word not in pos_seeds and word not in neg_seeds and word not in sw]
so_values.sort(key = lambda x: x[1])

In [20]:
len(so_values)

18902

In [21]:
%%time
#[x[0] for x in my_tuples]
scores = []
for t in dftrain['text_no_tag']:
    score = 0
    for word in t.split():
        for i in range(0,len(so_values)):
            if word == so_values[i][0]:
                #print(so_values[i][1])
                score = score + so_values[i][1]
    scores.append(score)

CPU times: user 16min 45s, sys: 1.54 s, total: 16min 46s
Wall time: 16min 48s


In [23]:
len(scores)

20000

In [24]:
dftrain['SO_PMI_score'] = scores

In [25]:
dftrain.head()

Unnamed: 0,text_no_tag,polarity,SO_PMI_score
235076,@USER i have a guinea pig too ! i do not even ...,0,22.492881
1274244,@USER happy birthday,4,5.0272
820212,"twitter fam wat up , had a great time shoppin ...",4,19.979527
1490408,"watching jimmy kimmel live ' s nba special , s...",4,35.584079
374153,bedd . wish i could curl up in someone special...,0,17.071806


In [27]:
dftrain.to_csv('df_20000_SOPMI.csv')

In [28]:
dftrain[dftrain['SO_PMI_score']>0].head()

Unnamed: 0,text_no_tag,polarity,SO_PMI_score
235076,@USER i have a guinea pig too ! i do not even ...,0,22.492881
1274244,@USER happy birthday,4,5.0272
820212,"twitter fam wat up , had a great time shoppin ...",4,19.979527
1490408,"watching jimmy kimmel live ' s nba special , s...",4,35.584079
374153,bedd . wish i could curl up in someone special...,0,17.071806


In [31]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))

In [32]:
x = dftrain[['SO_PMI_score']].values.astype(float)

In [33]:
SOPMI_scaled = preprocessing.minmax_scale(dftrain[['SO_PMI_score']].values.astype(float), feature_range=(-1, 1))

In [34]:
dftrain['SO_PMI_scaled'] = SOPMI_scaled

In [35]:
dftrain['polarity'].replace(to_replace=[4],value=1,inplace=True)

In [36]:
dftrain.head()

Unnamed: 0,text_no_tag,polarity,SO_PMI_score,SO_PMI_scaled
235076,@USER i have a guinea pig too ! i do not even ...,0,22.492881,-0.184918
1274244,@USER happy birthday,1,5.0272,-0.787531
820212,"twitter fam wat up , had a great time shoppin ...",1,19.979527,-0.271635
1490408,"watching jimmy kimmel live ' s nba special , s...",1,35.584079,0.266764
374153,bedd . wish i could curl up in someone special...,0,17.071806,-0.371959


In [37]:
dftrain['SO_PMI_scaled'].min()

-1.0

In [39]:
dftrain['SO_PMI_sentiment'] = dftrain['SO_PMI_scaled'].apply(lambda x: 0 if x<0 else 1)

In [40]:
dftrain.head()

Unnamed: 0,text_no_tag,polarity,SO_PMI_score,SO_PMI_scaled,SO_PMI_sentiment
235076,@USER i have a guinea pig too ! i do not even ...,0,22.492881,-0.184918,0
1274244,@USER happy birthday,1,5.0272,-0.787531,0
820212,"twitter fam wat up , had a great time shoppin ...",1,19.979527,-0.271635,0
1490408,"watching jimmy kimmel live ' s nba special , s...",1,35.584079,0.266764,1
374153,bedd . wish i could curl up in someone special...,0,17.071806,-0.371959,0


In [42]:
dftrain.to_csv('SO_PMI_sentiment.csv')

In [43]:
print("gold negative tweets:", len(dftrain[dftrain['polarity']==0]))
print("gold positive tweets:", len(dftrain[dftrain['polarity']==1]))
print("SO_PMI_sentiment negative tweets:", len(dftrain[dftrain['SO_PMI_sentiment']==0]))
print("SO_PMI_sentiment positive tweets:", len(dftrain[dftrain['SO_PMI_sentiment']==1]))


gold negative tweets: 9964
gold positive tweets: 10036
SO_PMI_sentiment negative tweets: 18607
SO_PMI_sentiment positive tweets: 1393
