In [224]:
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn
import numpy as np
import re
from collections import Counter
from nltk.corpus import stopwords 
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from xlwt import Workbook 
from math import log
import csv

In [225]:
# Read the labelled training reviews, decoding the file as ISO-8859-1.
reviews = pd.read_csv('train.csv', encoding="ISO-8859-1")
labels = reviews['class']

# Review texts overall and split by class label.
all_reviews = reviews['text']
positive_reviews = reviews.loc[labels == 'positive', 'text']
negative_reviews = reviews.loc[labels == 'negative', 'text']

# Number of reviews in each group.
all_sum, positive_sum, negative_sum = (
    len(all_reviews),
    len(positive_reviews),
    len(negative_reviews),
)

In [226]:
def get_pos(word):
    """Return the most frequent WordNet part of speech for `word`.

    Every synset of the word votes for its POS tag, and the tag with the most
    votes wins. Satellite adjectives, which WordNet reports as "s", are
    counted as adjectives ("a"); otherwise most adjective senses would be
    ignored and adjectives would often be mislabelled.

    Args:
        word: the word to look up in WordNet.

    Returns:
        One of "n", "v", "a" or "r". Ties are resolved in that order, so a
        word with no synsets at all is reported as "n".
    """
    # Insertion order (n, v, a, r) is what breaks ties below.
    pos_counts = Counter({"n": 0, "v": 0, "a": 0, "r": 0})

    for synset in wordnet.synsets(word):
        tag = synset.pos()
        if tag == "s":
            # Satellite adjective: still an adjective for tagging purposes.
            tag = "a"
        if tag in pos_counts:
            pos_counts[tag] += 1

    return pos_counts.most_common(1)[0][0]

In [227]:
# NLTK's English stop-word list, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Document frequency over all reviews: a word is counted once per review,
# no matter how often it repeats inside that review.
all_counter = Counter()

for review in all_reviews:
    # Replace every run of characters that are neither word characters nor
    # spaces with a single space, then lowercase and split on whitespace.
    cleaned = re.sub(r'[^\w ]+', ' ', review).strip().lower()
    distinct = set(tok for tok in cleaned.split() if len(tok) > 2 and tok not in stop_words)
    all_counter.update(distinct)

In [228]:
# Document frequency among positive reviews: each distinct word counts once
# per review that contains it.
positive_counter = Counter()

for review in positive_reviews:
    # Same tokenisation as for the overall counter: strip non-word characters,
    # lowercase, split, then drop stop words and tokens of length <= 2.
    cleaned = re.sub(r'[^\w ]+', ' ', review).strip().lower()
    distinct = set(tok for tok in cleaned.split() if len(tok) > 2 and tok not in stop_words)
    positive_counter.update(distinct)

In [229]:
# Document frequency among negative reviews: each distinct word counts once
# per review that contains it.
negative_counter = Counter()

for review in negative_reviews:
    # Same tokenisation as for the overall counter: strip non-word characters,
    # lowercase, split, then drop stop words and tokens of length <= 2.
    cleaned = re.sub(r'[^\w ]+', ' ', review).strip().lower()
    distinct = set(tok for tok in cleaned.split() if len(tok) > 2 and tok not in stop_words)
    negative_counter.update(distinct)

In [236]:
# Rows exported per class. The original loop was range(0, 399), i.e. 399 rows;
# that count is kept unchanged here.
TOP_N = 399


def _class_pmi(word_in_class, word_total, class_total, n_reviews):
    """Return the base-2 pointwise mutual information of a word and a class.

    Args:
        word_in_class: number of reviews of the class that contain the word.
        word_total: number of positive plus negative reviews containing the word.
        class_total: number of reviews in the class.
        n_reviews: number of reviews overall.

    Returns:
        log2(n_reviews * word_in_class / (word_total * class_total)), or 0
        when the word never occurs in the class (the logarithm would be
        undefined there).
    """
    if word_in_class == 0:
        return 0
    return log(n_reviews * word_in_class / (word_total * class_total), 2)


def _write_top_rows(path, count_header, rows):
    """Write a header line and the first TOP_N rows to the CSV file `path`."""
    # newline='' lets the csv module control line endings itself; without it
    # some platforms emit an empty line after every row.
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Word", "Mutual Information", count_header])
        # Slicing instead of indexing 0..TOP_N-1 tolerates vocabularies with
        # fewer than TOP_N words.
        writer.writerows(rows[:TOP_N])


all_positive_mi = []
all_negative_mi = []

for word in all_counter:
    word_positive = positive_counter[word]
    word_negative = negative_counter[word]
    word_total = word_positive + word_negative

    # Each score is computed independently, so both are always defined for
    # the current word and no value carries over from the previous iteration.
    positive_mi = _class_pmi(word_positive, word_total, positive_sum, all_sum)
    negative_mi = _class_pmi(word_negative, word_total, negative_sum, all_sum)

    all_positive_mi.append([word, positive_mi, word_positive])
    all_negative_mi.append([word, negative_mi, word_negative])

# Highest score first; sorted() is stable, so ties keep their counter order.
all_positive_mi = sorted(all_positive_mi, key=lambda elements: elements[1], reverse=True)
all_negative_mi = sorted(all_negative_mi, key=lambda elements: elements[1], reverse=True)

_write_top_rows('positive_mi.csv', "Positive Counts", all_positive_mi)
_write_top_rows('negative_mi.csv', "Negative Counts", all_negative_mi)

In [187]:
# Workbook with one master sheet (every word, its count and POS tag) and one
# sheet per part of speech listing only the words carrying that tag.
wb = Workbook()

positive_sheet = wb.add_sheet('positive_sheet')
for col, title in enumerate(('Word', 'Counts', 'Tag')):
    positive_sheet.write(0, col, title)

# POS tag -> sheet name, in the order the sheets appear in the workbook.
tag_sheets = {}
for pos_tag, sheet_name in (('a', 'positive_adj'),
                            ('v', 'positive_verb'),
                            ('r', 'positive_adv'),
                            ('n', 'positive_noun')):
    sheet = wb.add_sheet(sheet_name)
    sheet.write(0, 0, 'Word')
    sheet.write(0, 1, 'Counts')
    tag_sheets[pos_tag] = sheet

positive_adj = tag_sheets['a']
positive_verb = tag_sheets['v']
positive_adv = tag_sheets['r']
positive_noun = tag_sheets['n']

# Next free row on each per-tag sheet; row 0 holds the headers.
next_row = dict.fromkeys(tag_sheets, 1)

# Words are written from most to least frequent.
for row, (word, counts) in enumerate(positive_counter.most_common(), start=1):
    tag = get_pos(word)
    for col, value in enumerate((word, counts, tag)):
        positive_sheet.write(row, col, value)

    sheet = tag_sheets.get(tag)
    if sheet is not None:
        sheet.write(next_row[tag], 0, word)
        sheet.write(next_row[tag], 1, counts)
        next_row[tag] += 1

wb.save('positive_words_extraction.xls')

In [188]:
# Workbook with one master sheet (every word, its count and POS tag) and one
# sheet per part of speech listing only the words carrying that tag.
wb = Workbook()

negative_sheet = wb.add_sheet('negative_sheet')
for col, title in enumerate(('Word', 'Counts', 'Tag')):
    negative_sheet.write(0, col, title)

# POS tag -> sheet name, in the order the sheets appear in the workbook.
tag_sheets = {}
for pos_tag, sheet_name in (('a', 'negative_adj'),
                            ('v', 'negative_verb'),
                            ('r', 'negative_adv'),
                            ('n', 'negative_noun')):
    sheet = wb.add_sheet(sheet_name)
    sheet.write(0, 0, 'Word')
    sheet.write(0, 1, 'Counts')
    tag_sheets[pos_tag] = sheet

negative_adj = tag_sheets['a']
negative_verb = tag_sheets['v']
negative_adv = tag_sheets['r']
negative_noun = tag_sheets['n']

# Next free row on each per-tag sheet; row 0 holds the headers.
next_row = dict.fromkeys(tag_sheets, 1)

# Words are written from most to least frequent.
for row, (word, counts) in enumerate(negative_counter.most_common(), start=1):
    tag = get_pos(word)
    for col, value in enumerate((word, counts, tag)):
        negative_sheet.write(row, col, value)

    sheet = tag_sheets.get(tag)
    if sheet is not None:
        sheet.write(next_row[tag], 0, word)
        sheet.write(next_row[tag], 1, counts)
        next_row[tag] += 1

wb.save('negative_words_extraction.xls')