#### Reference: https://nlpforhackers.io/sentiment-analysis-intro/

In [1]:
import pandas as pd       

In [2]:
# Load the tab-separated movie-review sample. quoting=3 is csv.QUOTE_NONE, so
# quote characters inside the reviews are kept as literal text.
data = pd.read_csv(
    "labeledTrainData_sample.tsv",
    sep="\t",
    header=0,
    quoting=3,
)
data.shape  # (5003, 3)

(5003, 3)

In [3]:
# Preview the first rows to check how the id, sentiment and review columns were parsed.
data.head()

Unnamed: 0,id,sentiment,review
0,"""525_1""",0,"""I looked over the other comments and was thor..."
1,"""6846_1""",0,"""i am a big fan of karishma Kapoor and Govinda..."
2,"""2828_3""",0,"""There seem to have been any number of films l..."
3,"""3862_4""",0,"""I just watched it. A couple of laughs, but no..."
4,"""674_10""",1,"""While to most people watching the movie, this..."


#### Separate train and test data

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the reviews for evaluation. A fixed random_state makes the
# split, and therefore the accuracy reported at the end, reproducible across
# kernel restarts.
train, test = train_test_split(data, test_size=0.2, random_state=42)

#### Sentiment mining with SentiWordNet

In [22]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

# Single lemmatizer instance, reused for every token scored in swn_polarity.
lemmatizer = WordNetLemmatizer()

In [23]:
def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    Tags starting with 'J', 'N', 'R' and 'V' map to wn.ADJ, wn.NOUN, wn.ADV
    and wn.VERB respectively. Any other tag yields None.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            # Resolve the WordNet constant only once a prefix has matched.
            return getattr(wn, attr_name)
    return None

In [30]:
def clean_text(text):
    """Return `text` with every HTML line break ("<br />") replaced by a single space."""
    return " ".join(text.split("<br />"))

In [31]:
def swn_polarity(text):
    """
    Classify a text as positive or negative using SentiWordNet scores.

    The text is cleaned with clean_text, split into sentences, tokenized and
    POS-tagged. Each noun, adjective or adverb that has at least one WordNet
    synset adds (pos_score - neg_score) of its first synset to a running
    total. Verbs and all other tags are ignored.

    Parameters
    ----------
    text : str
        Raw review text; "<br />" markup is replaced by spaces before scoring.

    Returns
    -------
    int
        1 (positive) when at least one token was scored and the total is >= 0
        (so a tie at exactly 0 counts as positive);
        0 (negative) when the total is below 0 or no token could be scored.
    """
    sentiment = 0.0
    tokens_count = 0

    for raw_sentence in sent_tokenize(clean_text(text)):
        for word, tag in pos_tag(word_tokenize(raw_sentence)):
            wn_tag = penn_to_wn(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue

            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue

            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue

            # Take the first sense, the most common
            swn_synset = swn.senti_synset(synsets[0].name())
            # senti_synset may return None when the synset has no SentiWordNet
            # entry; skip such tokens instead of failing on .pos_score().
            if swn_synset is None:
                continue

            sentiment += swn_synset.pos_score() - swn_synset.neg_score()
            tokens_count += 1

    # Judgment call: with nothing to score, default to negative.
    if not tokens_count:
        return 0

    # Total >= 0 => positive sentiment, otherwise negative.
    return 1 if sentiment >= 0 else 0

In [36]:
# Show the column order; the spot-check cell indexes columns by position with iloc.
train.columns

Index(['id', 'sentiment', 'review'], dtype='object')

In [35]:
# Spot-check the first 10 training rows: predicted polarity next to the true label.
# Positional columns: 1 = sentiment (positive/negative), 2 = review text.
for row in range(10):
    review_text = train.iloc[row, 2]
    true_label = train.iloc[row, 1]
    print(swn_polarity(review_text), true_label)

1 1
0 0
0 1
1 1
0 0
0 0
1 1
0 0
0 1
1 1


In [37]:
from sklearn.metrics import accuracy_score

# Predict a 0/1 polarity for every held-out review.
pred_y = list(map(swn_polarity, test['review']))

In [40]:
# Fraction of held-out reviews whose predicted polarity matches the labelled sentiment.
accuracy_score(y_true=test['sentiment'], y_pred=pred_y)

0.6423576423576424