In [1]:
import pandas as pd

review_df = pd.read_csv('labeledTrainData.tsv', sep='\t', quoting=3)

review_df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [2]:
print(review_df['review'][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [3]:
import re

review_df['review'] = review_df['review'].str.replace('<br />', ' ')
review_df['review'] = review_df['review'].apply(lambda x: re.sub('[^a-zA-z]', ' ', x))

In [4]:
from sklearn.model_selection import train_test_split

class_df = review_df['sentiment']
feature_df = review_df.drop(['id', 'sentiment'], axis=1, inplace=False)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(feature_df, class_df, test_size=0.3, random_state=156)

X_train.shape, X_test.shape

((17500, 1), (7500, 1))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

pipeline = Pipeline([
    ('cnt_vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression(C=10))
])

pipeline.fit(X_train['review'], y_train)
pred = pipeline.predict(X_test['review'])
pred_probs = pipeline.predict_proba(X_test['review'])[:, 1]

print(accuracy_score(y_test, pred))
print(roc_auc_score(y_test, pred_probs))


In [None]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression(C=10))
])

pipeline.fit(X_train['review'], y_train)
pred = pipeline.predict(X_test['review'])
pred_probs = pipeline.predict_proba(X_test['review'])[:, 1]

print(accuracy_score(y_test, pred))
print(roc_auc_score(y_test, pred_probs))


0.8934666666666666
0.9597850273161849


In [None]:
import nltk

nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/f2hard3/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/f2hard3/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/f2hard3/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/f2hard3/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/f2hard3/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /Users/f2hard3/nl

True

In [8]:
from nltk.corpus import wordnet as wn

term = 'present'

synsets =  wn.synsets(term)

print('type(synsets): ', type(synsets))
print('len(synsets): ', len(synsets))
print(synsets)

type(synsets):  <class 'list'>
len(synsets):  18
[Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]


In [10]:
for synset in synsets:
    print('synset.name(): ', synset.name())
    print('synset.lexname(): ', synset.lexname())
    print('synset.definition(): ', synset.definition())
    print('synset.lemma_names(): ', synset.lemma_names())

synset.name():  present.n.01
synset.lexname():  noun.time
synset.definition():  the period of time that is happening now; any continuous stretch of time including the moment of speech
synset.lemma_names():  ['present', 'nowadays']
synset.name():  present.n.02
synset.lexname():  noun.possession
synset.definition():  something presented as a gift
synset.lemma_names():  ['present']
synset.name():  present.n.03
synset.lexname():  noun.communication
synset.definition():  a verb tense that expresses actions or states at the time of speaking
synset.lemma_names():  ['present', 'present_tense']
synset.name():  show.v.01
synset.lexname():  verb.perception
synset.definition():  give an exhibition of to an interested audience
synset.lemma_names():  ['show', 'demo', 'exhibit', 'present', 'demonstrate']
synset.name():  present.v.02
synset.lexname():  verb.communication
synset.definition():  bring forward and present to the mind
synset.lemma_names():  ['present', 'represent', 'lay_out']
synset.name()

In [11]:
tree = wn.synset('tree.n.01')
lion = wn.synset('lion.n.01')
tiger = wn.synset('tiger.n.02')
cat = wn.synset('cat.n.01')
dog = wn.synset('dog.n.01')

entities = [tree, lion, tiger, cat, dog]
similarities = []

entity_names = [entity.name().split('.')[0] for entity in entities]

for entity in entities:
    similarity = [round(entity.path_similarity(compared_entity), 2) for compared_entity in entities]
    similarities.append(similarity)

similarity_df = pd.DataFrame(similarities, columns=entity_names, index=entity_names)
similarity_df

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.07,0.07,0.08,0.12
lion,0.07,1.0,0.33,0.25,0.17
tiger,0.07,0.33,1.0,0.25,0.17
cat,0.08,0.25,0.25,1.0,0.2
dog,0.12,0.17,0.17,0.2,1.0


In [15]:
import nltk
from nltk.corpus import sentiwordnet as swn

senti_synsets = list(swn.senti_synsets('slow'))
print('type(senti_synsets):', type(senti_synsets))
print('len(senti_synsets): ', len(senti_synsets))
print('senti_synsets:', senti_synsets)

type(senti_synsets): <class 'list'>
len(senti_synsets):  11
senti_synsets: [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]


In [16]:
father = swn.senti_synset('father.n.01')

print('father.pos_score(): ', father.pos_score())
print('father.neg_score(): ', father.neg_score())
print('father.obj_score(): ', father.obj_score())
print()

fabulous = swn.senti_synset('fabulous.a.01')

print('fabulous.pos_score(): ', fabulous.pos_score())
print('fabulous.neg_score(): ', fabulous.neg_score())
print('fabulous.obj_score(): ', fabulous.obj_score())

father.pos_score():  0.0
father.neg_score():  0.0
father.obj_score():  1.0

fabulous.pos_score():  0.875
fabulous.neg_score():  0.125
fabulous.obj_score():  0.0


In [17]:
def penn_to_wn(tag):
    if tag.startswith('J'): return wn.ADJ
    elif tag.startswith('N'): return wn.NOUN
    elif tag.startswith('R'): return wn.ADV
    elif tag.startswith('V'): return wn.VERB

In [18]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

def swn_polarity(text):
    sentiment = 0.0
    tokens_count = 0

    lemmatizer = WordNetLemmatizer()
    raw_sentences = sent_tokenize(text)

    for raw_sentence in raw_sentences:
        tagged_sentence = pos_tag(word_tokenize(raw_sentence))

        for word, tag in tagged_sentence:
            wn_tag = penn_to_wn(tag)            
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue
            
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)            
            if not lemma:
                continue
            
            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue

            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())
            sentiment += (swn_synset.pos_score() - swn_synset.neg_score())
            tokens_count += 1
    
    if not tokens_count: 
        return 0

    if sentiment >= 0:
        return 1
    
    return 0


In [19]:
review_df['preds'] = review_df['review'].apply(lambda x: swn_polarity(x))
y_target = review_df['sentiment'].values

preds = review_df['preds'].values

In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, roc_auc_score

print('confusion_matrix(y_target, preds): \n', confusion_matrix(y_target, preds))
print('accuracy_score(y_target, preds): ', accuracy_score(y_target, preds))
print('precision_score(y_target, preds): ', precision_score(y_target, preds))
print('recall_score(y_target, preds): ', recall_score(y_target, preds))

confusion_matrix(y_target, preds): 
 [[7685 4815]
 [3621 8879]]
accuracy_score(y_target, preds):  0.66256
precision_score(y_target, preds):  0.6483861545202279
recall_score(y_target, preds):  0.71032


In [22]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

senti_analyzer = SentimentIntensityAnalyzer()
senti_scores = senti_analyzer.polarity_scores(review_df['review'][0])
print(senti_scores)

{'neg': 0.13, 'neu': 0.743, 'pos': 0.127, 'compound': -0.7943}


In [23]:
def vader_polarity(review, threshold=0.1):
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(review)

    agg_score = scores['compound']
    final_sentiment = 1 if agg_score >= threshold else 0

    return final_sentiment

review_df['vader_preds'] = review_df['review'].apply(lambda x: vader_polarity(x))
y_target = review_df['sentiment'].values
vader_preds = review_df['vader_preds'].values

print('confusion_matrix(y_target, vader_preds): \n', confusion_matrix(y_target, vader_preds))
print('accuracy_score(y_target, vader_preds): ', accuracy_score(y_target, vader_preds))
print('precision_score(y_target, vader_preds): ', precision_score(y_target, vader_preds))
print('recall_score(y_target, vader_preds): ', recall_score(y_target, vader_preds))

confusion_matrix(y_target, vader_preds): 
 [[ 6762  5738]
 [ 1842 10658]]
accuracy_score(y_target, vader_preds):  0.6968
precision_score(y_target, vader_preds):  0.6500365942912906
recall_score(y_target, vader_preds):  0.85264
