<a href="https://colab.research.google.com/github/unt-iialab/INFO5731_Spring2020/blob/master/Interesting_Code/Lesson_nine_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Sentiment analysis for movie reviews**

In [1]:
# Load and prepare the dataset
import nltk
from nltk.corpus import movie_reviews
import random

nltk.download('movie_reviews')

documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [2]:
# Define the feature extractor

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [3]:
# Train Naive Bayes classifier
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [4]:
# Test the classifier
print(nltk.classify.accuracy(classifier, test_set))

0.85


In [5]:
# Show the most important features as interpreted by Naive Bayes
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.2 : 1.0
         contains(mulan) = True              pos : neg    =      8.4 : 1.0
        contains(seagal) = True              neg : pos    =      7.8 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.5 : 1.0
         contains(damon) = True              pos : neg    =      5.7 : 1.0


# **2. Sentiment Analysis for Twitter Data**

In [19]:
import re 
import tweepy 
from tweepy import OAuthHandler 
from textblob import TextBlob 

class TwitterClient(object): 
    ''' 
    Generic Twitter Class for sentiment analysis. 
    '''
    def __init__(self): 
        ''' 
        Class constructor or initialization method. 
        '''
        # keys and tokens from the Twitter Dev Console 
        consumer_key = 'u7L1lnR7HN85dn1qnTFO1cegb'
        consumer_secret = 'QN1JrEmit2To46ZcwWAT4aI5QGWZXWRDDUPnMCWV5M66SFc8wT'
        access_token = '1144377060036620294-BSEicX3zH7hIhksbNZV9mrWFwa07cO'
        access_token_secret = 'gxWMOodDq1nQAjix9mHEOUSAtgE7XH5ctHInm0XRslJce'

        # attempt authentication 
        try: 
            # create OAuthHandler object 
            self.auth = OAuthHandler(consumer_key, consumer_secret) 
            # set access token and secret 
            self.auth.set_access_token(access_token, access_token_secret) 
            # create tweepy API object to fetch tweets 
            self.api = tweepy.API(self.auth) 
        except: 
            print("Error: Authentication Failed") 

    def clean_tweet(self, tweet): 
        ''' 
        Utility function to clean tweet text by removing links, special characters 
        using simple regex statements. 
        '''
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split()) 

    def get_tweet_sentiment(self, tweet): 
        ''' 
        Utility function to classify sentiment of passed tweet 
        using textblob's sentiment method 
        '''
        # create TextBlob object of passed tweet text 
        analysis = TextBlob(self.clean_tweet(tweet)) 
        # set sentiment 
        if analysis.sentiment.polarity > 0: 
            return 'positive'
        elif analysis.sentiment.polarity == 0: 
            return 'neutral'
        else: 
            return 'negative'

    def get_tweets(self, query, count = 10): 
        ''' 
        Main function to fetch tweets and parse them. 
        '''
        # empty list to store parsed tweets 
        tweets = [] 

        try: 
            # call twitter api to fetch tweets 
            fetched_tweets = self.api.search(q = query, count = count) 

            # parsing tweets one by one 
            for tweet in fetched_tweets: 
                # empty dictionary to store required params of a tweet 
                parsed_tweet = {} 

                # saving text of tweet 
                parsed_tweet['text'] = tweet.text 
                # saving sentiment of tweet 
                parsed_tweet['sentiment'] = self.get_tweet_sentiment(tweet.text) 

                # appending parsed tweet to tweets list 
                if tweet.retweet_count > 0: 
                    # if tweet has retweets, ensure that it is appended only once 
                    if parsed_tweet not in tweets: 
                        tweets.append(parsed_tweet) 
                else: 
                    tweets.append(parsed_tweet) 

            # return parsed tweets 
            return tweets 

        except AttributeError as e: 
            # print error (if any) 
            print("Error : " + str(e)) 

def main(): 
    # creating object of TwitterClient Class 
    api = TwitterClient() 
    # calling function to get tweets 
    tweets = api.get_tweets(query = 'Donald Trump', count = 200) 
    #print(tweets)
    # picking positive tweets from tweets 
    ptweets = [tweet for tweet in tweets if tweet['sentiment'] == 'positive'] 
    # percentage of positive tweets 
    print("Positive tweets percentage: {} %".format(100*len(ptweets)/len(tweets))) 
    # picking negative tweets from tweets 
    ntweets = [tweet for tweet in tweets if tweet['sentiment'] == 'negative'] 
    # percentage of negative tweets 
    print("Negative tweets percentage: {} %".format(100*len(ntweets)/len(tweets))) 
    # percentage of neutral tweets 
    print("Neutral tweets percentage: {} %".format(100*(len(tweets) - len(ntweets) - len(ptweets))/len(tweets))) 

    # printing first 5 positive tweets 
    print("\n\nPositive tweets:") 
    for tweet in ptweets[:10]: 
        print(tweet['text']) 

    # printing first 5 negative tweets 
    print("\n\nNegative tweets:") 
    for tweet in ntweets[:10]: 
        print(tweet['text']) 

if __name__ == "__main__": 
    # calling main function 
    main() 

Positive tweets percentage: 30.379746835443036 %
Negative tweets percentage: 22.78481012658228 %
Neutral tweets percentage: 46.835443037974684 %


Positive tweets:
RT @mehdirhasan: That Donald Trump feels able to run for president *again* after everything, after *everything*, is a massive indictment of…
RT @kjoerwin: https://t.co/pDgYmQ1CqD

Finally some good news!! 🤦🏻‍♀️
RT @mmpadellan: Many people are saying that Ron DeSantis has bigger crowds than Donald trump.
RT @votevets: From the greatest generation to the latest generation, Veterans will continue to protect our freedom. That includes one of th…
RT @StevenBeschloss: This would be a fine day for Merrick Garland to indict Donald J. Trump.
RT @daddy38456: 60% of Ron DeSantis’ donations have come from donations of $50,000 or more. 54% of Donald Trump’s donations have come from…
RT @ProjectLincoln: Donald Trump is back. We stand ready to help lead the fight against him, partnering with any and all pro-democracy, pat…
RT @OccupyDemocr

In [7]:
!pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 5.2 MB/s 
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [8]:
!pip install wordcloud

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **3. Sentiment Analysis for Amazon Review**

In [9]:
# importing all the required Libraries
import glob
import json
import csv
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import string
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
import warnings
warnings.filterwarnings("ignore")

In [12]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [13]:
# Data download link:
# https://drive.google.com/drive/folders/0B4Hj2axlpCcxWldiajctWmY0NG8
file=glob.glob('/content/gdrive/My Drive/INFO 5731 TA/Datasets/ReviewSample.json')

In [14]:
file

['/content/gdrive/My Drive/INFO 5731 TA/Datasets/ReviewSample.json']

In [15]:
# Reading a multiple json files from a single json file 'ReviewSample.json'.
review=[]
with open(file[0]) as data_file:
    data=data_file.read()
    for i in data.split('\n'):
        review.append(i)
        
# Making a list of Tuples containg all the data of json files.
reviewDataframe=[]
for x in review:
    try:
        jdata=json.loads(x)
        reviewDataframe.append((jdata['reviewerID'],jdata['asin'],jdata['reviewerName'],jdata['helpful'][0],jdata['helpful'][1],jdata['reviewText'],jdata['overall'],jdata['summary'],jdata['unixReviewTime'],jdata['reviewTime'])) 
    except:
        pass        
    
# Creating a dataframe using the list of Tuples got in the previous step.    
dataset=pd.DataFrame(reviewDataframe,columns=['Reviewer_ID','Asin','Reviewer_Name','helpful_UpVote','Total_Votes','Review_Text','Rating','Summary','Unix_Review_Time','Review_Time'])

In [16]:
# Function to calculate sentiments using Naive Bayes Analyzer

def NaiveBaiyes_Sentimental(sentence):
    blob = TextBlob(sentence, analyzer=NaiveBayesAnalyzer())
    NaiveBayes_SentimentScore=blob.sentiment.classification
    return NaiveBayes_SentimentScore

In [17]:
# Function to calculate sentiments using Vader Sentiment Analyzer

# VADER sentiment analysis tool for getting Compound score.
def sentimental(sentence):
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(sentence)
    score=vs['compound']
    return score

# VADER sentiment analysis tool for getting pos, neg and neu.
def sentimental_Score(sentence):
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(sentence)
    score=vs['compound']
    if score >= 0.5:
        return 'pos'
    elif (score > -0.5) and (score < 0.5):
        return 'neu'
    elif score <= -0.5:
        return 'neg'

In [18]:
# sentiment calculation by our data as input
Selected_Rows=dataset.head(10)
Selected_Rows['Sentiment_Score']=Selected_Rows['Review_Text'].apply(lambda x: sentimental_Score(x))
pos = Selected_Rows.loc[Selected_Rows['Sentiment_Score'] == 'pos']
print(pos)
neg = Selected_Rows.loc[Selected_Rows['Sentiment_Score'] == 'neg']
print(neg)

      Reviewer_ID        Asin                      Reviewer_Name  \
0  A2XVJBSRI3SWDI  0000031887                            abigail   
1  A2G0LNLN79Q6HR  0000031887                      aj_18 "Aj_18"   
2  A2R3K1KX09QBYP  0000031887                     alert consumer   
3   A19PBP93OF896  0000031887  Alinna Satake "Can't Stop Eating"   
5  A3Q6CTO56DJ8UZ  0000031887                     Amazing Amazon   
6  A1KLRMWW2FWPL4  0000031887        Amazon Customer "cameramom"   
7  A1GQPAM8Y45QN7  0000031887                    Amazon Customer   
8  A2G5TCU2WDFZ65  0000031887                    Amazon Customer   
9   AEAN37KUOYSX4  0000031887                    Amazon Customer   

   helpful_UpVote  Total_Votes  \
0               0            0   
1               1            1   
2               1            1   
3               0            1   
5               3            4   
6               0            0   
7               0            0   
8               0            0   
9            

# **4. Aspect Based sentiment Analysis**

In [38]:
pip install pyabsa==1.9.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyabsa==1.9.3
  Downloading pyabsa-1.9.3-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 5.1 MB/s 
[?25hCollecting torch<1.11.0,>1.0.0
  Downloading torch-1.10.2-cp37-cp37m-manylinux1_x86_64.whl (881.9 MB)
[K     |██████████████████████████████▎ | 834.1 MB 8.8 MB/s eta 0:00:06tcmalloc: large alloc 1147494400 bytes == 0x65850000 @  0x7f55e0415615 0x58ead6 0x4f355e 0x4d222f 0x51041f 0x5b4ee6 0x58ff2e 0x510325 0x5b4ee6 0x58ff2e 0x50d482 0x4d00fb 0x50cb8d 0x4d00fb 0x50cb8d 0x4d00fb 0x50cb8d 0x4bac0a 0x538a76 0x590ae5 0x510280 0x5b4ee6 0x58ff2e 0x50d482 0x5b4ee6 0x58ff2e 0x50c4fc 0x58fd37 0x50ca37 0x5b4ee6 0x58ff2e
[K     |████████████████████████████████| 881.9 MB 1.7 kB/s 
Installing collected packages: torch, pyabsa
  Attempting uninstall: torch
    Found existing installation: torch 1.12.1+cu113
    Uninstalling torch-1.12.1+cu113:
      Successful

  for line in open(toplevel):
  for line in open(toplevel):


In [1]:
# Find Available Checkpoints For Current Version

from pyabsa import available_checkpoints
checkpoint_map = available_checkpoints()

  return f(*args, **kwds)


No CUDA GPU found in your device


  from collections import Mapping


Remote ABSADataset version: 2022.10.25 Local ABSADatasets version: N.A.
Unknown local version for ABSADatasets, please check the latest version of ABSADatasets at https://github.com/yangheng95/ABSADatasets
Version 1.9.3 of pyabsa is outdated. Version 2.0.2 was released Tuesday November 08, 2022.
check release notes at https://github.com/yangheng95/PyABSA/blob/release/release-note.json


Downloading...
From: https://drive.google.com/uc?id=1CBVGPA3xdQqdkFFwzO5T2Q4reFtzFIJZ
To: /content/checkpoints.json
100%|██████████| 8.17k/8.17k [00:00<00:00, 7.03MB/s]

********** Available APC model checkpoints for Version:1.9.3 (this version) **********
----------------------------------------------------------------------------------------------------
id: https://drive.google.com/file/d/1JIFhaAdoCeZI5CQqfOix3pnrYa6_Mf9S/view?usp=sharing
Training Model: FAST-LSA-S
Training Dataset: English
Language: English
Description: Trained on RTX3090
Available Version: 1.6.3+
Checkpoint File: fast_lsa_s_acc_84.9_f1_82.11.zip
Author: H, Yang (yangheng@m.scnu.edu.cn)
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
id: https://drive.google.com/file/d/1B0RHazOCm2eOWLWExQkeapHr9d3OiZl7/view?usp=sharing
Training Model: FAST-LCF-MDeBERTa
Training Dataset: Chinese
Language: Chinese
Description: Trained on RTX3090
Available Version: 1.8.2+
Checkpoint File: fast_lcf_bert_Chinese_acc_97.11_f1_96.54.zip
Author: H, Yang (y


  checkpoint_map = json.load(open('./checkpoints.json', 'r'))


In [2]:
from pyabsa import ATEPCCheckpointManager

aspect_extractor = ATEPCCheckpointManager.get_aspect_extractor(checkpoint='english',
                                   auto_device=True  # False means load model on CPU
                                   )

Downloading...
From: https://drive.google.com/uc?id=1CBVGPA3xdQqdkFFwzO5T2Q4reFtzFIJZ
To: /content/checkpoints.json
100%|██████████| 8.17k/8.17k [00:00<00:00, 9.32MB/s]


********** Available ATEPC model checkpoints for Version:1.9.3 (this version) **********
----------------------------------------------------------------------------------------------------
id: https://drive.google.com/file/d/1_oBCLi_bjs4CxmEXfVIw8qZCmbJvr-PE/view?usp=sharing
Training Model: FAST-LCFS-ATEPC
Training Dataset: English
Language: English
Description: Trained on RTX3090, this checkpoint use bert-spc in ATEPC training
Available Version: 1.8.4+
Checkpoint File: fast_lcf_atepc_English_cdw_apcacc_85.03_apcf1_82.76_atef1_84.8.zip
Author: H, Yang (yangheng@m.scnu.edu.cn)
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
id: https://drive.google.com/file/d/1wHlEeKbQg51LEgr-J353HQhyPgPDEMrp/view?usp=sharing
Training Model: FAST-LCF-ATEPC
Training Dataset: Chinese
Language: Chinese
Description: Trained on RTX3090 BERT-BASE-CHINESE
Av

Downloading...
From: https://drive.google.com/uc?id=1_oBCLi_bjs4CxmEXfVIw8qZCmbJvr-PE
To: /content/checkpoints/ATEPC_ENGLISH_CHECKPOINT/any_model.zip
100%|██████████| 596M/596M [00:05<00:00, 109MB/s]

Find zipped checkpoint: ./checkpoints/ATEPC_ENGLISH_CHECKPOINT/any_model.zip, unzipping...





Done.
Google Drive applies a restriction on public large file downloading, if you find the checkpoint downloaded is None or small, please download it via browser: 1_oBCLi_bjs4CxmEXfVIw8qZCmbJvr-PE 
Load aspect extractor from ./checkpoints/ATEPC_ENGLISH_CHECKPOINT
config: ./checkpoints/ATEPC_ENGLISH_CHECKPOINT/any_model/fast_lcf_atepc_English_cdw_apcacc_85.03_apcf1_82.76_atef1_84.8/fast_lcf_atepc.config
state_dict: ./checkpoints/ATEPC_ENGLISH_CHECKPOINT/any_model/fast_lcf_atepc_English_cdw_apcacc_85.03_apcf1_82.76_atef1_84.8/fast_lcf_atepc.state_dict
model: None
tokenizer: ./checkpoints/ATEPC_ENGLISH_CHECKPOINT/any_model/fast_lcf_atepc_English_cdw_apcacc_85.03_apcf1_82.76_atef1_84.8/fast_lcf_atepc.tokenizer


  self.opt = pickle.load(open(config_path, mode='rb'))


Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Config used in Training:
ABSADatasetsVersion:2022.03.22	-->	Calling Count:0
IOB_label_to_index:{'B-ASP': 1, 'I-ASP': 2, 'O': 3, '[CLS]': 4, '[SEP]': 5}	-->	Calling Count:1
MV:<metric_visualizer.metric_visualizer.MetricVisualizer object at 0x7f2197239750>	-->	Calling Count:0
PyABSAVersion:1.14.0	-->	Calling Count:0
SRD:3	-->	Calling Count:75166
TorchVersion:1.10.1+cuda11.3	-->	Calling Count:0
TransformersVersion:4.18.0	-->	Calling Count:0
auto_device:True	-->	Calling Count:32901
batch_size:16	-->	Calling Count:5
cache_dataset:False	-->	Calling Count:1
dataset_file:{'train': ['integrated_datasets/atepc_datasets/110.SemEval/113.laptop14/Laptops_Train.xml.seg.atepc', 'integrated_datasets/atepc_datasets/110.SemEval/114.restaurant14/Restaurants_Train.xml.seg.atepc', 'integrated_datasets/atepc_datasets/110.SemEval/116.restaurant16/restaurant_train.raw.atepc', 'integrated_datasets/atepc_datasets/101.ACL_Twitter/acl-14-short-data/train.raw.atepc', 'integrated_datasets/atepc_datasets/109.MAMS/tr

In [3]:
examples = ['Staff was very rude but food was delicious']
inference_source = examples
atepc_result = aspect_extractor.extract_aspect(inference_source=inference_source,  #
                          pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
                          )

  lcf_cdm_vec = torch.tensor([f.lcf_cdm_vec for f in infer_features], dtype=torch.float32)


The results of aspect term extraction have been saved in /content/atepc_inference.result.json
{'sentence': 'staff was very rude but food was delicious', 'IOB': ['B-ASP', 'O', 'O', 'O', 'O', 'B-ASP', 'O', 'O'], 'tokens': ['staff', 'was', 'very', 'rude', 'but', 'food', 'was', 'delicious'], 'aspect': ['staff', 'food'], 'position': [[1], [6]], 'sentiment': ['Negative', 'Positive']}


  json.dump(json.JSONEncoder().encode({'results': results}), open(save_path, 'w'), ensure_ascii=False)


In [4]:
atepc_result

[{'sentence': 'staff was very rude but food was delicious',
  'IOB': ['B-ASP', 'O', 'O', 'O', 'O', 'B-ASP', 'O', 'O'],
  'tokens': ['staff',
   'was',
   'very',
   'rude',
   'but',
   'food',
   'was',
   'delicious'],
  'aspect': ['staff', 'food'],
  'position': [[1], [6]],
  'sentiment': ['Negative', 'Positive']}]

In [5]:
examples = ['Camera quality is very good but battery drains fast']
inference_source = examples
atepc_result = aspect_extractor.extract_aspect(inference_source=inference_source,  #
                          pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
                          )

The results of aspect term extraction have been saved in /content/atepc_inference.result.json
{'sentence': 'camera quality is very good but battery drains fast', 'IOB': ['B-ASP', 'I-ASP', 'O', 'O', 'O', 'O', 'B-ASP', 'O', 'O'], 'tokens': ['camera', 'quality', 'is', 'very', 'good', 'but', 'battery', 'drains', 'fast'], 'aspect': ['camera quality', 'battery'], 'position': [[1, 2], [7]], 'sentiment': ['Positive', 'Negative']}


In [6]:
atepc_result

[{'sentence': 'camera quality is very good but battery drains fast',
  'IOB': ['B-ASP', 'I-ASP', 'O', 'O', 'O', 'O', 'B-ASP', 'O', 'O'],
  'tokens': ['camera',
   'quality',
   'is',
   'very',
   'good',
   'but',
   'battery',
   'drains',
   'fast'],
  'aspect': ['camera quality', 'battery'],
  'position': [[1, 2], [7]],
  'sentiment': ['Positive', 'Negative']}]