In [1]:
import os
import pandas as pd
import numpy as np
from nltk import pos_tag
import textblob as tb

from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from nltk.corpus import sentiwordnet as swn

from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, LabelSet, FactorRange, HoverTool

# Configure Bokeh so that show() renders figures inside the notebook.
output_notebook()

# Data directory, resolved relative to the current working directory.
DATA_FOLDER = os.path.join(os.getcwd(), "../data")

# NLTK's English stop-word list; the text helpers below test tokens against it.
eng_stopwords = stopwords.words('english')

def stem(doc):
    """Return a generator over the stems of the non-stop-word tokens in ``doc``.

    ``doc`` is tokenised with ``analyzer``; tokens found in ``eng_stopwords``
    are skipped and every remaining token is passed through
    ``porter_stemmer.stem``.  The result is lazy: stems are produced only as
    the generator is consumed.

    NOTE(review): ``porter_stemmer`` and ``analyzer`` are not defined in the
    visible part of this notebook -- confirm where they come from before
    calling this helper.
    """
    tokens = analyzer(doc)
    return (porter_stemmer.stem(tok) for tok in tokens if tok not in eng_stopwords)

def stem_re(doc):
    """Return the list of stems of ``doc`` after removing every match of ``rex``.

    All matches of the pattern ``rex`` are deleted from ``doc``, the remaining
    text is tokenised with ``analyzer``, tokens found in ``eng_stopwords`` are
    dropped, and the surviving tokens are stemmed with ``porter_stemmer.stem``.

    NOTE(review): ``re``, ``rex``, ``porter_stemmer`` and ``analyzer`` are not
    imported or defined in the visible part of this notebook -- confirm where
    they come from before calling this helper.
    """
    cleaned = re.sub(rex, "", doc)
    return [
        porter_stemmer.stem(tok)
        for tok in analyzer(cleaned)
        if tok not in eng_stopwords
    ]

ModuleNotFoundError: No module named 'textblob'

In [None]:
# Load only the columns this analysis needs, indexed by review id.
train_fp = os.path.join(DATA_FOLDER, "csv/train.csv")
reviews = pd.read_csv(train_fp, index_col="review_id", usecols=['text', 'stars', 'review_id'])

# Work on a random subset to keep the tagging step fast.
# - A fixed seed makes the subset (and everything derived from it) identical
#   on every Restart & Run All.
# - Capping at len(reviews) avoids the ValueError that DataFrame.sample raises
#   when asked for more rows than exist (sampling is without replacement).
N_SAMPLE_REVIEWS = 10000
SAMPLE_SEED = 42
reviews = reviews.sample(n=min(N_SAMPLE_REVIEWS, len(reviews)), random_state=SAMPLE_SEED)

In [4]:
def remove_stopwords(doc, stop_words=None):
    """Lower-case ``doc`` and drop its stop words.

    The text is split on single spaces, each token is lower-cased, and tokens
    found in the stop-word collection are discarded.  Lower-casing happens
    *before* the membership test so that capitalised stop words ("The",
    "And", "I", ...) are removed too; the stop-word entries are expected to
    be lower-case, as in NLTK's English list.

    Parameters
    ----------
    doc : str
        Raw text to filter.
    stop_words : iterable of str, optional
        Words to remove.  Defaults to the notebook-level ``eng_stopwords``.

    Returns
    -------
    str
        The kept tokens in their original order, each followed by a single
        space (so a non-empty result ends with a trailing space).
    """
    if stop_words is None:
        stop_words = eng_stopwords
    # A set gives constant-time membership tests instead of scanning the list per token.
    stop_set = frozenset(stop_words)
    lowered = (token.lower() for token in doc.split(" "))
    # join() builds the result in one pass rather than by repeated concatenation.
    return "".join(token + " " for token in lowered if token not in stop_set)

In [5]:
# Filter stop words out of every review; this overwrites the 'text' column with
# the lower-cased, filtered text returned by remove_stopwords.
reviews['text'] = reviews['text'].apply(remove_stopwords)

In [6]:
# Tag each review with TextBlob and keep the resulting `.tags` value in a new
# 'pos' column (convert_tags later reads each entry as a (token, tag) pair).
reviews['pos'] = reviews['text'].apply(lambda x : tb.TextBlob(x).tags)

In [7]:
def convert_tags(tags):
    """Turn tagged tokens into "<token>.<pos>.1" identifier strings.

    Parameters
    ----------
    tags : iterable
        Items whose element 0 is the token and element 1 is its tag.

    Returns
    -------
    list of str
        One "<token>.<pos>.1" string per item whose tag has a mapping, in the
        original order, where <pos> is the single letter 'a', 'v', 'n' or 'r'
        assigned to the tag.  Items with any other tag are skipped.
    """
    # Tag -> single-letter part-of-speech code, grouped by target letter.
    tags_map = {
        **dict.fromkeys(('JJ', 'JJR', 'JJS'), 'a'),
        **dict.fromkeys(('MD', 'POS', 'PR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'), 'v'),
        **dict.fromkeys(('NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$'), 'n'),
        **dict.fromkeys(('RB', 'RBR', 'RBS'), 'r'),
    }
    return [
        str(pair[0]) + "." + str(tags_map[pair[1]]) + ".1"
        for pair in tags
        if pair[1] in tags_map
    ]

In [8]:
# Replace the raw tag pairs with the "<token>.<pos>.1" strings built by
# convert_tags; this overwrites the 'pos' column.
reviews['pos'] = reviews['pos'].apply(convert_tags)

In [9]:
def dummy_analyzer(doc):
    """Return ``doc`` unchanged.

    Used as the ``analyzer`` of a CountVectorizer so that documents which are
    already lists of tokens are counted as they are.
    """
    return doc

In [10]:
# dummy_analyzer returns its input untouched, so each review's pre-built token
# list is what gets counted.
cv = CountVectorizer(analyzer=dummy_analyzer)
# Learn the vocabulary and build the per-review term-count matrix.
bow = cv.fit_transform(reviews['pos'])

In [11]:
# Start from an empty frame that has only the 'word' and 'freq' columns.
word_freq = pd.DataFrame(columns=['word', 'freq'])

In [12]:
# Sum the count matrix over its rows: one total per vocabulary column.
freqs = bow.sum(axis=0)

In [13]:
# Build the word -> total-count table in a single pass.
# cv.vocabulary_ maps each term to its 0-based column index in `bow`, so that
# index addresses `freqs` directly.  Subtracting 1 from it would pair every word
# with the count of the preceding column (and column 0 with the last column).
# Constructing the frame once also avoids growing it row by row.
word_freq = pd.DataFrame(
    [(term, freqs[0, column]) for term, column in cv.vocabulary_.items()],
    columns=['word', 'freq'],
)

In [14]:
def get_pos_score(word, pos=True):
    """Look up a SentiWordNet polarity score for ``word``.

    Parameters
    ----------
    word : str
        Identifier passed straight to ``swn.senti_synset`` (in this notebook,
        the "<token>.<pos>.1" strings produced by ``convert_tags``).
    pos : bool, default True
        ``True`` returns the positive score, ``False`` the negative score.

    Returns
    -------
    float
        The requested score, or ``np.nan`` when the lookup fails for any
        reason (unknown or malformed identifier, no entry found, ...).
    """
    try:
        entry = swn.senti_synset(word)
        return entry.pos_score() if pos else entry.neg_score()
    except Exception:
        # Best-effort lookup: a failed word becomes NaN and is filtered out
        # later.  Catching Exception rather than using a bare `except` lets
        # KeyboardInterrupt / SystemExit propagate, so a long-running apply
        # can still be interrupted.
        return np.nan
        
def negate(x):
    """Return ``x`` with a non-positive sign.

    Strictly positive values are negated; every other value (zero, negative,
    or anything for which ``x > 0`` is false) is returned as-is.
    """
    return -x if x > 0 else x

In [15]:
# Score every word twice: Series.apply forwards the `pos` keyword to
# get_pos_score, selecting the positive or the negative score.
word_freq['pos_score'] = word_freq['word'].apply(get_pos_score, pos=True)
word_freq['neg_score'] = word_freq['word'].apply(get_pos_score, pos=False)

In [16]:
# Drop every row with a NaN in any column (get_pos_score returns NaN when a
# lookup fails).
word_freq = word_freq.dropna(axis=0, how='any')

In [17]:
# Keep only words whose positive or negative score is non-zero.
word_freq = word_freq[(word_freq['pos_score'] != 0) | (word_freq['neg_score'] != 0)]

In [18]:
# Sort descending by frequency, breaking ties by positive score and then by
# negative score (all three descending).
word_freq = word_freq.sort_values(by=['freq', 'pos_score', 'neg_score'], ascending=False)

In [19]:
# Make every negative score <= 0 so it is drawn on the opposite side of zero
# from the positive score; negate leaves already non-positive values unchanged.
word_freq['neg_score'] = word_freq['neg_score'].apply(negate)

In [24]:
def plot_pos_neg_scores(df):
    """Build a horizontal bar chart of positive and negative word scores.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns 'word', 'pos_score' and 'neg_score'.

    Returns
    -------
    The Bokeh figure: one categorical row per entry of ``df['word']`` on the
    y axis, an x axis fixed to (-1, 1), blue bars reaching to 'pos_score' and
    red bars reaching to 'neg_score'.
    """
    words = df['word']
    chart = figure(y_range=words, x_range=(-1, 1))
    # Same glyph for both series; positive is drawn first, then negative.
    for score_column, bar_colour in (('pos_score', 'blue'), ('neg_score', 'red')):
        chart.hbar(y=words, height=0.25, right=df[score_column], color=bar_colour)
    return chart

In [25]:
# Take an independent copy of the first 35 rows (the most frequent words once
# word_freq has been sorted by descending frequency).
to_plot_df = word_freq.iloc[0:35, :].copy()

In [26]:
# Build the score chart for the selected words.
fig = plot_pos_neg_scores(to_plot_df)

In [27]:
# Render the figure.
show(fig)