In [None]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [None]:
data=pd.read_csv("Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv")
data.head(3)

In [None]:
sentiment_data = data[['reviews.rating', 'reviews.text', 'reviews.title']]
sentiment_data.head()

In [None]:
## prepare data
def sentiment_labeler(score):
    if (score==5) or (score==4):
        return "Positive"
    elif (score==3):
        return "Neutral"
    else:
        return "Negative"
    
sentiment_data["sentiment_label"]=sentiment_data["reviews.rating"].apply(sentiment_labeler)
sentiment_data.head()

### prepare train, test data


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

split_data=StratifiedShuffleSplit(n_splits=5, test_size=0.2)
for tr_indx, ts_indx in split_data.split(sentiment_data, sentiment_data["reviews.rating"]):
    train_data=sentiment_data.reindex(tr_indx)
    test_data=sentiment_data.reindex(ts_indx)

In [None]:
## set up stanfordcorenlp

In [None]:
from pycorenlp import StanfordCoreNLP
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import numpy as np
import string

In [None]:
## helper function to extract pos tag features

def extract_POS(statements):
    print('Extracting POS Tags')
    pos_tags = POS_tagging(statements,return_word_tag_pairs=False)
    bigrams_pos = POS_groupping(pos_tags, grams=2)
    trigrams_pos =POS_groupping(pos_tags, grams=3)
    print('Finished')
    return pos_tags,bigrams_pos,trigrams_pos

##
def POS_tagging(statements, return_word_tag_pairs = False):
    core_nlp = StanfordCoreNLP('http://localhost:9000')
    print("NLP_Task ready to use.")
    POS_tags = list()
    for statement in statements:
        statement_tags = list()
        annotations = core_nlp.annotate(statement, properties={
            'annotators': 'tokenize,pos',
            'outputFormat': 'json'
            })
        for output in annotations['sentences']:
            statement_tags.append('<s>')
            previous = ''
            for token in output['tokens']:
                if return_word_tag_pairs:
                    statement_tags.append(token['word']+'/'+token['pos'])
                else:
                    statement_tags.append(token['pos'])

        POS_tags.append(statement_tags)
    return POS_tags

## 
def POS_groupping(sentences_pos,grams=1):
    result = list()
    for sentence_tags in sentences_pos:
        tag_group = list()
        for index, each_tag in enumerate(sentence_tags):
            if index < len(sentence_tags)-grams and len(sentence_tags)>=grams:
                format_str = str()
                for i in range(0,grams):
                    format_str += sentence_tags[index+i]
                    if i<grams-1:
                        format_str += '_'
                tag_group.append(format_str)
        result.append(tag_group)
    return result

##
def RemoveConsecutiveTags(list_to_remove, postags,ignore_punctuation=False):
    withoutConsecutiveTags = list()
    for each_tag in postags:
        removed = list()
        previous = ''
        for tt in each_tag:
            if tt != previous:
                if not ignore_punctuation:
                    removed.append(tt)
                elif tt not in string.punctuation:
                    removed.append(tt)
                previous = tt
            elif tt not in list_to_remove:
                removed.append(tt)
                previous = tt
        withoutConsecutiveTags.append(removed)
    return withoutConsecutiveTags

In [None]:
## extracting pos-tag features
unigram_pos_tr, bigrams_pos_tr, trigram_pos_tr = extract_POS(statements=train_data['reviews.text'])
unigram_pos_ts, bigrams_pos_ts, trigram_pos_ts = extract_POS(statements=test_data['reviews.text'])

In [None]:
## remove duplicated pos tag

# For review data
list_to_remove = ['NNP','CD']

removed_pos_tr =RemoveConsecutiveTags(list_to_remove,unigram_pos)
removed_pos_bigrams_tr = POS_groupping(grams=2,sentences_pos=removed_pos_tr)
removed_pos_trigrams_tr = POS_groupping(grams=3,sentences_pos=removed_pos_tr)

removed_pos_ts =RemoveConsecutiveTags(list_to_remove,unigram_pos)
removed_pos_bigrams_ts = POS_groupping(grams=2,sentences_pos=removed_pos_ts)
removed_pos_trigrams_ts = POS_groupping(grams=3,sentences_pos=removed_pos_ts)

In [None]:
sentiment_feats= pd.DataFrame()
sentiment_feats['reviews'] = sentiment_data['reviews.text']

list_to_remove = ['NNP','CD']
sentiment_feats['pos_unig_tr'] = [" ".join(x).replace('<s>','').replace('$','dollar').strip() for x in removed_pos_tr]
sentiment_feats['pos_big_tr'] = [" ".join(x).replace('$','dollar').strip() for x in removed_pos_bigrams_tr]
sentiment_feats['pos_trig_tr'] = [" ".join(x).replace('$','dollar').strip() for x in removed_pos_trigrams_tr]
sentiment_feats.head()

In [None]:
###
def GetFeaturesFromPOS(training_data, user_defined_vocabulary=None):
    user_defined_vocabulary = [x.lower().replace('$','dollar') for x in user_defined_vocabulary]

    # making string of the data
    training_str = [" ".join(x) for x in training_data]

    #replace $ by dollar
    training_str = [x.replace('$', 'dollar').replace('<s>','sos') for x in training_str]

    # features using binary iformation
    oneHotVectorizer = CountVectorizer(vocabulary=user_defined_vocabulary,binary=True)
    tr_onehot = oneHotVectorizer.fit_transform(training_str).toarray()
    print(oneHotVectorizer.vocabulary_)

    # features using no-binary information (counting)
    countVectorizer = CountVectorizer(vocabulary=user_defined_vocabulary,binary=True)
    tr_count = countVectorizer.fit_transform(training_str).toarray()

    # features using tf-idf vectors
    tfIdfVectorizer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
    tr_tfidf = tfIdfVectorizer.fit_transform(tr_count)

    return tr_onehot, tr_count, tr_tfidf

In [None]:
pos_relevant_unigrams =  ['VBZ', 'DT', 'NNPS', 'VBP', 'JJ', 'IN', 'WRB', 'VBD', 'PRP', 'RP', 'WDT', 'VB', 'NNP', 'VBG', 'PRP$', 'VBN', 'CD', 'RB', 'WP', 'JJS', 'JJR', 'EX', 'RBS', 'FW', 'LS'] 
amazonRev_onehot_unigram_tr, amazonRev_count_unigram_tr, amazonRev_tfidf_unigram_tr = GetFeaturesFromPOS(training_data=removed_pos_tr, user_defined_vocabulary=pos_relevant_unigrams)

pos_relevant_bigrams = ['NNPS_VBP', 'VB_NNP', 'IN_DT', 'VB_JJ', 'JJ_CD', 'CD_NNS', 'DT_JJS', 'JJR_IN', 'IN_CD', 'CC_IN', 'RB_VBD', 'CD_NN', 'NN_TO', 'JJR_JJ', 'VB_CD'] 
amazonRev_onehot_bigram_tr, amazonRev_count_bigram_tr, amazonRev_tfidf_bigram_tr = GetFeaturesFromPOS(training_data=removed_pos_bigrams_tr, user_defined_vocabulary=pos_relevant_bigrams)

pos_relevant_trigrams = ['VBD_VBN_IN', 'IN_DT_JJ', 'CD_NN_IN', 'IN_CD_NNS', 'IN_DT_NN', 'DT_JJ_CD', 'MD_VB_IN', 'JJS_JJ_NN', 'CC_JJ_NNS', 'JJ_NNS_VBP', 'VBP_CD_NN', 'sos_JJR_IN', 'IN_DT_NNS','JJ_NN_MD']
amazonRev_onehot_trigram_tr, amazonRev_count_trigram_tr, amazonRev_tfidf_trigram_tr = GetFeaturesFromPOS(training_data=removed_pos_trigrams_tr, user_defined_vocabulary=pos_relevant_trigrams)

In [None]:
sum(amazonRev_onehot_unigram)

In [None]:
sum(amazonRev_onehot_bigram)

In [None]:
sum(amazonRev_onehot_trigram)

In [None]:
sentiment_feats['pos_unigrams_1hot_tr'] =  [str(x) for x in amazonRev_onehot_unigram_tr]
sentiment_feats['pos_bigrams_1hot_tr'] = [str(x) for x in amazonRev_onehot_bigram_tr]
sentiment_feats['pos_trigrams_1hot_tr'] = [str(x) for x in amazonRev_onehot_trigram_tr]

sentiment_feats['pos_unigrams_count_tr'] =  [str(x) for x in amazonRev_count_unigram_tr]
sentiment_feats['pos_bigrams_count_tr'] = [str(x) for x in amazonRev_count_bigram_tr]
sentiment_feats['pos_trigrams_count_tr'] = [str(x) for x in amazonRev_count_trigram_tr]

sentiment_feats['pos_unigrams_tfidf_tr'] =  [str(x) for x in amazonRev_tfidf_unigram_tr]
sentiment_feats['pos_bigrams_tfidf_tr'] = [str(x) for x in amazonRev_tfidf_bigram_tr]
sentiment_feats['pos_trigrams_tfidf_tr'] = [str(x) for x in amazonRev_tfidf_trigram_tr]

sentiment_feats.head()

In [None]:
### train ML classifier

In [None]:
## use different features

unigram_pos_str = [" ".join(x) for x in unigram_pos]
bigram_pos_str = [" ".join(x) for x in bigrams_pos]
trigram_pos_str = [" ".join(x) for x in trigram_pos]

cv_uni = CountVectorizer()
pos_uni_feats = cv_uni.fit_transform(unigram_pos_str).toarray()

## Sentiment Analysis with TextBlob

In [None]:
from textblob import TextBlob
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.np_extractors import ConllExtractor

import nltk
nltk.download('movie_reviews')
nltk.download('conll2000')

In [None]:
def extractTextBlobFeatures(corpus):
    extractor = ConllExtractor()
    text_blob_features = np.zeros((len(corpus),4))
    blob_sentiment_analyzer = Blobber(analyzer=NaiveBayesAnalyzer())
    for i,each_text in enumerate(corpus):
        text_blob_features[i,0]=blob_sentiment_analyzer(each_text).sentiment[1]
        text_blob_features[i,1]=blob_sentiment_analyzer(each_text).sentiment[2]
        text_blob_features[i,2]= TextBlob(each_text).subjectivity
        noun_phrase_extractor = TextBlob(each_text, np_extractor=extractor)
        text_blob_features[i,3]= len(noun_phrase_extractor.noun_phrases)
    return text_blob_features

In [None]:
X_train=train_data['reviews.text']
y_train=train_data["sentiment_label"]
X_test=test_data["reviews.text"]
y_test=test_data["sentiment_label"]

In [120]:
Xtr_tb_features = extractTextBlobFeatures(X_train)
Xte_tb_features = extractTextBlobFeatures(X_test)
ytr_tb_features = extractTextBlobFeatures(y_train)
yte_tb_features = extractTextBlobFeatures(y_test)

(5667, 4)


In [123]:
Xtr_tb_features

array([[7.88051110e-01, 2.11948890e-01, 8.75000000e-01, 2.00000000e+00],
       [2.52887668e-01, 7.47112332e-01, 6.00000000e-01, 0.00000000e+00],
       [9.97855794e-01, 2.14420630e-03, 4.88333333e-01, 9.00000000e+00],
       ...,
       [9.96032882e-01, 3.96711790e-03, 6.95833333e-01, 2.00000000e+00],
       [8.79760031e-01, 1.20239969e-01, 5.26666667e-01, 4.00000000e+00],
       [4.52667396e-01, 5.47332604e-01, 5.00000000e-01, 2.00000000e+00]])

In [132]:
# Xtr= np.array(Xtr_tb_features.ravel(),)
# ytr=np.array(ytr_tb_features.ravel(),)

# Xte = np.array(Xte_tb_features.ravel(),)
# yte = np.array(yte_tb_features.ravel(),)

In [134]:
# ## use multinomial NB classifier
# clf_multiNB_pipe = Pipeline([("vect", CountVectorizer()), ("tfidf", TfidfTransformer()), ("clf_nominalNB", MultinomialNB())])
# clf_multiNB_pipe.fit(Xtr, ytr)
# predicted_nb = clf_multiNB_pipe.predict(Xte)
# print("accuracy metrics for training naive bayes classifier:\n",metrics.classification_report(yte, predicted_nb, target_names = ['1','0','-1']))
# print("======================================================================================")

In [125]:
Xte_tb_features

array([[0.24118805, 0.75881195, 0.34821429, 1.        ],
       [0.6738867 , 0.3261133 , 0.75      , 1.        ],
       [0.7739195 , 0.2260805 , 0.42954545, 4.        ],
       ...,
       [0.65462319, 0.34537681, 0.        , 0.        ],
       [0.91263215, 0.08736785, 0.74166667, 2.        ],
       [0.93831843, 0.06168157, 0.625     , 0.        ]])

In [126]:
ytr_tb_features

array([[0.45238095, 0.54761905, 0.54545455, 0.        ],
       [0.45238095, 0.54761905, 0.54545455, 0.        ],
       [0.45238095, 0.54761905, 0.54545455, 0.        ],
       ...,
       [0.45238095, 0.54761905, 0.54545455, 0.        ],
       [0.45238095, 0.54761905, 0.54545455, 0.        ],
       [0.45238095, 0.54761905, 0.54545455, 0.        ]])