In [27]:
import pandas as pd
from math import log
from nltk import word_tokenize
import string as string

In [28]:
data = pd.read_csv("sentiment140_sample1.csv", header = None, encoding = "ISO-8859-1")
data.columns = ["polarity","tweetId","date","query","username","tweetText"]

In [37]:
#Q1 Frecuency counts
def get_counts(data):
    counts = {"pos":{}, "neg":{}}
    pos = {}
    neg = {}
    dataPositive = list(data.groupby('polarity').get_group(4)['tweetText'])
    dataNegative = list(data.groupby('polarity').get_group(0)['tweetText'])
    # Count for positive polarity
    for x in dataPositive:
        for word in word_tokenize(x):
            if word.isalpha():
                word = word.lower()
                if word in pos:
                    pos[word] += 1
                else:
                    pos[word] = 1
    # Count for negative polarity
    for x in dataNegative:
        for word in word_tokenize(x):
            if word.isalpha():
                word = word.lower()
                if word in neg:
                    neg[word] += 1
                else:
                    neg[word] = 1
    # Assign positive and negative dictionary to counts dict
    counts["pos"] = pos
    counts["neg"] = neg
    return counts

In [38]:
counts = get_counts(data)
print(counts["pos"]["happy"])
print(counts["neg"]["hate"])

124
99


In [39]:
# Q2 Calculating Probability of Word
def get_word_prob(counts,word,polarity):
    if word in counts[polarity]:
        return counts[polarity][word]/sum([x for x in counts[polarity].values()])
    else:
        return 0

In [40]:
print(get_word_prob(counts, "great", "pos"))
print(get_word_prob(counts, "glad", "neg"))

0.002558903051371186
0.00012162305973212521


In [41]:
# Q3 Calculating Log odds ratio of Word
def log_odds_ratio(counts,word,polarity):
    opposite_polarity = ""
    
    if polarity=="pos":
        opposite_polarity = "neg"
    if polarity=="neg":
        opposite_polarity = "pos"
    
    current_prob = get_word_prob(counts,word,polarity)
    opposite_prob = get_word_prob(counts,word,opposite_polarity)
    
    if opposite_prob > 0:
        return log(current_prob/opposite_prob)
    else:
        return 1000

In [42]:
print(log_odds_ratio(counts, "great", "pos"))
print(log_odds_ratio(counts, "the", "neg"))

1.275701302546381
-0.09165794206814183


In [22]:
def log_odds(x):
    return x[1]

In [67]:
# Q4 Sorting log odds ratios
def sort_pos_words(data):
    counts = get_counts(data)
    totalwords = 0
    lst = []
    
    # create list of tuples
    for word, count in counts["pos"].items():
        if word in counts["pos"]:
            totalwords = totalwords + counts["pos"][word]
        if word in counts["neg"]:
            totalwords = totalwords + counts["neg"][word]
        if totalwords >= 10:
            lst.append(tuple((word, log_odds_ratio(counts,word,"pos"))))
            
    # sort based on polarity
    lst.sort(key = log_odds,reverse=True)
    return lst

In [69]:
lst = sort_pos_words(data)
print(lst[:10])
print(lst[-10:])

[('strokes', 1000), ('experienced', 1000), ('soma', 1000), ('maddy', 1000), ('sounding', 1000), ('boheme', 1000), ('nsfw', 1000), ('wntd', 1000), ('bikeindia', 1000), ('bunnies', 1000)]
[('expensive', -2.508004655425329), ('bus', -2.5821126275790505), ('hates', -2.651105499066002), ('throat', -2.651105499066002), ('tummy', -2.715644020203573), ('sad', -2.8052561788932606), ('missing', -2.987577735687215), ('died', -3.121109128311738), ('headache', -3.4694158225799536), ('hurts', -3.8348755960744185)]
