# This notebook is about feature engineering with sentiment analysis and classification

#### Load useful libraries and df

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


from tqdm import tqdm

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /Users/janice/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/janice/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
df_long = pd.read_csv(
    "../data/long/chatgpt_clean_combined_en.csv",
    encoding="utf-8",
)


  df_long = pd.read_csv(


In [3]:
df_long.shape

(64984, 24)

In [4]:
from transformers import pipeline
#import torch

In [5]:
classifier = pipeline("zero-shot-classification")

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [6]:
#candidate_labels = ["feature evaluation", "praise", "bug report", "feature request", "performance", "usage"]
candidate_labels = ["positive", "neutral", "negative"]
hypothesis_template = "The sentiment of this review is {}."

In [7]:
#candidate_labels = list(category_map.values())
predictedCategories = []
ScoreNegative = []
ScoreNeutral = []
ScorePositive = []

for i in tqdm(range(len(df_long))): # len(df_long)
    text = df_long.iloc[i,]['sentence']
    res = classifier(text, candidate_labels, hypothesis_template=hypothesis_template, multi_label=True)
    labels = res['labels'] 
    scores = res['scores'] #extracting the scores associated with the labels
    res_dict = {label : score for label,score in zip(labels, scores)}
    sorted_dict = dict(sorted(res_dict.items(), key=lambda x:x[1],reverse = True)) #sorting the dictionary of labels in descending order based on their score
    categories  = next(k for i, (k,v) in enumerate(sorted_dict.items()))

    ScorePositive.append(sorted_dict['positive'])
    ScoreNeutral.append(sorted_dict['neutral'])
    ScoreNegative.append(sorted_dict['negative'])
    predictedCategories.append(categories)

100%|██████████| 64984/64984 [11:44:06<00:00,  1.54it/s]  


In [8]:
df_long_zero_prep = pd.DataFrame({  'category': predictedCategories, 
                                    'positive_score': ScorePositive, 
                                    'neutral_score': ScoreNeutral, 
                                    'negative_score': ScoreNegative})
df_long_zero = pd.merge(df_long, df_long_zero_prep, left_index=True, right_index=True)

In [9]:
df_long_zero[['sentence', 'positive_score', 'neutral_score', 'negative_score']]

Unnamed: 0,sentence,positive_score,neutral_score,negative_score
0,the best,0.998887,0.040153,0.000454
1,the best help you cant get you just need to be...,0.731961,0.081856,0.802337
2,can't edit a question,0.097518,0.183290,0.307526
3,can't edit a question like in the browser,0.275136,0.287585,0.597605
4,also i like the complete black background but ...,0.975314,0.191318,0.001431
...,...,...,...,...
64979,first review,0.816886,0.671854,0.675763
64980,first downloaded,0.888082,0.590217,0.632305
64981,usually app,0.846591,0.450016,0.367502
64982,hell yeah,0.998828,0.054262,0.000806


In [10]:
df_long_zero.groupby('category').mean('category')

Unnamed: 0_level_0,Unnamed: 0,score,thumbsUpCount,reply,Reviews,at_q,at_w,positive_score,neutral_score,negative_score
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
negative,7840.81306,2.732202,4.083561,0.035596,1.0,3.001768,32.604962,0.081251,0.325203,0.839485
neutral,7657.566598,3.793545,6.690169,0.017418,1.0,3.021004,32.902152,0.288165,0.623093,0.298541
positive,8911.69792,4.673975,3.496448,0.005775,1.0,3.110933,34.078586,0.927021,0.175289,0.054836


In [11]:
df_long_zero.to_csv("../data/long/ChatGPT-play-reviews_sentiment.csv", index=False)

In [12]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt


#Confusion matrix using pandas crosstab
#conf_matrix = confusion_matrix(df_long_zero.score_cat, df_long_zero.category)
#sns.heatmap(conf_matrix, annot=True, cmap=plt.cm.Oranges, fmt='g'); 

## VADER (Valence Aware Dictionary for Sentiment Reasoning)

Good to know:

VADER (Valence Aware Dictionary for Sentiment Reasoning) is a model used for text sentiment analysis that is sensitive to both polarity (positive/negative) and intensity (strength) of emotion. It is available in the NLTK package and can be applied directly to unlabeled text data.

VADER sentimental analysis relies on a dictionary that maps lexical features to emotion intensities known as sentiment scores. The sentiment score of a text can be obtained by summing up the intensity of each word in the text.

For example, Words like ‘love’, ‘enjoy’, ‘happy’, ‘like’ all convey a positive sentiment. Also VADER is intelligent enough to understand the basic context of these words, such as “did not love” as a negative statement. It also understands the emphasis of capitalization and punctuation, such as “ENJOY”

Following code was inspired by the VADER documentation on Git:
https://github.com/cjhutto/vaderSentiment#code-examples

In [17]:
df_long_zero = pd.read_csv("../data/long/ChatGPT-play-reviews_sentiment.csv",
                            encoding="utf-8",
                        )

df_long_zero.shape

  df_long_zero = pd.read_csv("../data/long/ChatGPT-play-reviews_sentiment.csv",


(64984, 28)

In [18]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    #note: depending on how you installed (e.g., using source code download versus pip install), you may need to import like this:
    #from vaderSentiment import SentimentIntensityAnalyzer

In [19]:
from nltk import tokenize

#candidate_labels = list(category_map.values())
vad_cat = []
vad_neg = []
vad_neu = []
vad_pos = []
vad_comp = []

for sentence in tqdm(df_long_zero['sentence']):
     sid = SentimentIntensityAnalyzer()
     ss = sid.polarity_scores(sentence)
     for k in sorted(ss):
         vad_cat.append(max(ss, key=ss.get))
         vad_neg.append(ss['neg'])
         vad_neu.append(ss['neu'])
         vad_pos.append(ss['pos'])
         vad_comp.append(ss['compound'])

100%|██████████| 64984/64984 [09:31<00:00, 113.66it/s]


In [20]:
df_long_vader_prep = pd.DataFrame({ 'vader_cat': vad_cat, 
                                    'vader_neg': vad_neg, 
                                    'vader_neu': vad_neu, 
                                    'vader_pos': vad_pos,
                                    'vader_cmp': vad_comp, })
df_long_vader= pd.merge(df_long_zero, df_long_vader_prep, left_index=True, right_index=True)

In [21]:
df_long_vader.to_csv("../data/long/ChatGPT-play-reviews_sentiment.csv", index=False)

## Sentiment Analysis using TextBlob

In [22]:
from textblob import TextBlob

Polarity determines the sentiment of the text. Its values lie in [-1,1] where -1 denotes a highly negative sentiment and 1 denotes a highly positive sentiment.

Subjectivity determines whether a text input is factual information or a personal opinion. Its value lies between [0,1] where a value closer to 0 denotes a piece of factual information and a value closer to 1 denotes a personal opinion.

In [23]:
#candidate_labels = list(category_map.values())
blob_polarity = []
blob_subjectivity = []

for sentence in tqdm(df_long['sentence']):
     blob_polarity.append(TextBlob(sentence).sentiment.polarity)
     blob_subjectivity.append(TextBlob(sentence).sentiment.subjectivity)

100%|██████████| 64984/64984 [00:14<00:00, 4485.65it/s]


In [24]:
df_long_blob_prep = pd.DataFrame({ 'blob_polarity': blob_polarity, 
                                    'blob_subjectivity': blob_subjectivity})
df_long_blob= pd.merge(df_long_vader, df_long_blob_prep, left_index=True, right_index=True)

In [25]:
df_long_blob.to_csv("../data/long/ChatGPT-play-reviews_sentiment.csv", index=False)

## Sentiment Analysis using Transformer-Based Models

In [26]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [27]:
#candidate_labels = list(category_map.values())
pipe_cat = []
pipe_neg = []
pipe_pos = []

for sentence in tqdm(df_long['sentence']):
     res = sentiment_pipeline(sentence)
     if res[0]['label'] == 'POSITIVE':
          pipe_cat.append('POSITIVE')
          pipe_neg.append(1-res[0]['score'])
          pipe_pos.append(res[0]['score'])
     elif res[0]['label'] == 'NEGATIVE':
          pipe_cat.append('NEGATIVE')
          pipe_neg.append(res[0]['score'])
          pipe_pos.append(1-res[0]['score'])

  0%|          | 0/64984 [00:00<?, ?it/s]

100%|██████████| 64984/64984 [46:16<00:00, 23.40it/s]  


In [54]:
df_long_pipe_prep = pd.DataFrame({ 'pipe_cat': pipe_cat, 
                                    'pipe_neg': pipe_neg, 
                                    'pipe_pos': pipe_pos})

# Load df
df_long_blob = pd.read_csv(
    "../data/long/ChatGPT-play-reviews_sentiment.csv",
    low_memory=False
)

df_long_pipe = pd.merge(df_long_blob, df_long_pipe_prep, left_index=True, right_index=True)

In [55]:
df_long_pipe

Unnamed: 0.1,Unnamed: 0,at,score,isEdited,userName,content,Source,reviewId,thumbsUpCount,replyContent,...,vader_cat,vader_neg,vader_neu,vader_pos,vader_cmp,blob_polarity,blob_subjectivity,pipe_cat,pipe_neg,pipe_pos
0,0,2023-10-30 21:26:19,5,False,Andrea Pepkolaj,the best. the best help you cant get you just ...,Apple,,,,...,pos,0.000,0.192,0.808,0.6369,1.000000,0.300000,POSITIVE,0.000167,0.999833
1,0,2023-10-30 21:26:19,5,False,Andrea Pepkolaj,the best. the best help you cant get you just ...,Apple,,,,...,pos,0.000,0.192,0.808,0.6369,1.000000,0.300000,NEGATIVE,0.985608,0.014392
2,1,2023-05-29 22:57:48,4,False,gjoncari.jurgen,can’t edit a question. can’t edit a question l...,Apple,,,,...,pos,0.000,0.192,0.808,0.6369,0.000000,0.000000,NEGATIVE,0.998232,0.001768
3,1,2023-05-29 22:57:48,4,False,gjoncari.jurgen,can’t edit a question. can’t edit a question l...,Apple,,,,...,pos,0.000,0.192,0.808,0.6369,0.000000,0.000000,NEGATIVE,0.994691,0.005309
4,1,2023-05-29 22:57:48,4,False,gjoncari.jurgen,can’t edit a question. can’t edit a question l...,Apple,,,,...,compound,0.000,0.653,0.347,0.7845,0.031111,0.457778,NEGATIVE,0.994367,0.005633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64979,30912,2023-07-25 17:47:09,5,,Yeamin Hossain (Limon),first review,Google,6bed36cc-bc58-479e-b395-6644d15adeb8,0.0,,...,neu,0.000,0.909,0.091,0.4215,0.250000,0.333333,POSITIVE,0.003007,0.996993
64980,30916,2023-07-25 17:08:33,5,,Sarvesh Soni,first downloader,Google,2ecc7803-920c-4f95-8b15-db9c7b1caa8a,0.0,,...,neu,0.156,0.844,0.000,-0.3415,0.250000,0.333333,POSITIVE,0.110960,0.889040
64981,30918,2023-07-25 17:15:19,5,,mostafijur rahman,usually app,Google,0530373c-1bfc-45d2-9dec-9fc0bb9cff4d,0.0,,...,neu,0.156,0.844,0.000,-0.3415,-0.250000,0.250000,POSITIVE,0.028938,0.971062
64982,30920,2023-07-27 17:53:37,5,,asac schrader,hell yeah,Google,09b3f8d3-cd42-4adf-8e61-ca70865fc853,0.0,,...,neu,0.156,0.844,0.000,-0.3415,0.000000,0.000000,POSITIVE,0.010897,0.989103


In [56]:
df_long_pipe.drop(['Unnamed: 0'], axis='columns', inplace=True, errors='ignore')
df_long_pipe.to_csv("../data/long/ChatGPT-play-reviews_sentiment.csv", index=False)

In [None]:
# df_long_zero = pd.get_dummies(df_long_zero, columns=["category"], dtype=int)

In [None]:

# df_long_zero
# df_long_zero.to_csv("/Users/janice/Documents/Bootcamp/Git/Capstone/capstone_chat-gpt/data/long/ChatGPT-play-reviews_long-zero.csv", index=False)
# df_zero_agg = df_long_zero.groupby(['index', 'score', 'score_cat']).agg( 
#     negative_mean=('negative_score', 'mean'),
#     neutral_mean=('neutral_score', 'mean'),
#     positive_mean=('positive_score', 'mean'),
#     negative_max=('negative_score', 'max'),
#     neutral_max=('neutral_score', 'max'),
#     positive_max=('positive_score', 'max'),
#     negative_sum=('category_negative', 'sum'),
#     neutral_sum=('category_neutral', 'sum'),
#     positive_sum=('category_positive', 'sum')).reset_index()

Aggregating on review level

In [None]:
# df_long_zero = pd.get_dummies(df_long_zero, columns=["category"], dtype=int)
# df_long_zero
# df_long_zero.to_csv("../data/long/ChatGPT-play-reviews_long-zero.csv", index=False)

In [39]:
# Load df
df_long_pipe = pd.read_csv(
    "../data/long/ChatGPT-play-reviews_sentiment.csv",
    low_memory=False
)

In [40]:
df_long_pipe

Unnamed: 0.1,Unnamed: 0,at,score,isEdited,userName,content,Source,reviewId,thumbsUpCount,replyContent,...,positive_score,neutral_score,negative_score,vader_cat,vader_neg,vader_neu,vader_pos,vader_cmp,blob_polarity,blob_subjectivity
0,0,2023-10-30 21:26:19,5,False,Andrea Pepkolaj,the best. the best help you cant get you just ...,Apple,,,,...,0.998887,0.040153,0.000454,pos,0.000,0.192,0.808,0.6369,1.000000,0.300000
1,0,2023-10-30 21:26:19,5,False,Andrea Pepkolaj,the best. the best help you cant get you just ...,Apple,,,,...,0.731961,0.081856,0.802337,pos,0.000,0.192,0.808,0.6369,1.000000,0.300000
2,1,2023-05-29 22:57:48,4,False,gjoncari.jurgen,can’t edit a question. can’t edit a question l...,Apple,,,,...,0.097518,0.183290,0.307526,pos,0.000,0.192,0.808,0.6369,0.000000,0.000000
3,1,2023-05-29 22:57:48,4,False,gjoncari.jurgen,can’t edit a question. can’t edit a question l...,Apple,,,,...,0.275136,0.287585,0.597605,pos,0.000,0.192,0.808,0.6369,0.000000,0.000000
4,1,2023-05-29 22:57:48,4,False,gjoncari.jurgen,can’t edit a question. can’t edit a question l...,Apple,,,,...,0.975314,0.191318,0.001431,compound,0.000,0.653,0.347,0.7845,0.031111,0.457778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64979,30912,2023-07-25 17:47:09,5,,Yeamin Hossain (Limon),first review,Google,6bed36cc-bc58-479e-b395-6644d15adeb8,0.0,,...,0.816886,0.671854,0.675763,neu,0.000,0.909,0.091,0.4215,0.250000,0.333333
64980,30916,2023-07-25 17:08:33,5,,Sarvesh Soni,first downloader,Google,2ecc7803-920c-4f95-8b15-db9c7b1caa8a,0.0,,...,0.888082,0.590217,0.632305,neu,0.156,0.844,0.000,-0.3415,0.250000,0.333333
64981,30918,2023-07-25 17:15:19,5,,mostafijur rahman,usually app,Google,0530373c-1bfc-45d2-9dec-9fc0bb9cff4d,0.0,,...,0.846591,0.450016,0.367502,neu,0.156,0.844,0.000,-0.3415,-0.250000,0.250000
64982,30920,2023-07-27 17:53:37,5,,asac schrader,hell yeah,Google,09b3f8d3-cd42-4adf-8e61-ca70865fc853,0.0,,...,0.998828,0.054262,0.000806,neu,0.156,0.844,0.000,-0.3415,0.000000,0.000000


In [None]:
df_long_pipe.drop(['Unnamed: 0'], axis='columns', errors='ignore')

Unnamed: 0,index,sentence,score,score_cat,detected_language,positive_score,neutral_score,negative_score,category_negative,category_neutral,category_positive,vader_cat,vader_neg,vader_neu,vader_pos,blob_polarity,blob_subjectivity,pipe_cat,pipe_neg,pipe_pos
0,0,chatgpt on android is a solid app with seamles...,4,neutral,en,0.999643,0.394674,0.000585,0,0,1,neu,0.0,0.778,0.222,0.166667,0.233333,POSITIVE,0.000832,0.999168
1,0,however it falls behind its apple counterpart ...,4,neutral,en,0.009318,0.634563,0.971451,1,0,0,neu,0.0,0.778,0.222,-0.400000,0.700000,NEGATIVE,0.785155,0.214845
2,0,the voice input can be prematurely triggered b...,4,neutral,en,0.304566,0.164207,0.708694,1,0,0,neu,0.0,0.778,0.222,0.000000,0.000000,NEGATIVE,0.999622,0.000378
3,0,additionally the lack of a search function for...,4,neutral,en,0.000512,0.088586,0.954502,1,0,0,neu,0.0,0.778,0.222,-0.166667,0.166667,NEGATIVE,0.999488,0.000512
4,0,despite these it remains a commendable app des...,4,neutral,en,0.998531,0.012075,0.001319,0,0,1,neu,0.0,1.000,0.000,0.600000,0.800000,POSITIVE,0.001492,0.998508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34299,30912,first review,5,positive,en,0.816886,0.671854,0.675763,0,0,1,compound,0.0,0.734,0.266,0.250000,0.333333,POSITIVE,0.003007,0.996993
34300,30916,first downloaded,5,positive,en,0.888082,0.590217,0.632305,0,0,1,compound,0.0,0.802,0.198,0.250000,0.333333,POSITIVE,0.110960,0.889040
34301,30918,usually app,5,positive,en,0.846591,0.450016,0.367502,0,0,1,compound,0.0,0.802,0.198,-0.250000,0.250000,POSITIVE,0.028938,0.971062
34302,30921,first comment,5,positive,en,0.815224,0.392924,0.659263,0,0,1,compound,0.0,0.802,0.198,0.250000,0.333333,POSITIVE,0.049670,0.950330


In [None]:
df_long_plus = pd.merge(
    df,
    df_long_pipe.rename(columns={"sentence": "sentence"}).drop(['Unnamed: 0'], axis='columns'),
    right_on='index',
    left_on='Unnamed: 0'
)

In [None]:
df_long_plus[['index','sentence', 'sentence']]

Unnamed: 0,index,sentence,content
0,0,chatgpt on android is a solid app with seamles...,chatgpt on android is a solid app with seamles...
1,0,however it falls behind its apple counterpart ...,chatgpt on android is a solid app with seamles...
2,0,the voice input can be prematurely triggered b...,chatgpt on android is a solid app with seamles...
3,0,additionally the lack of a search function for...,chatgpt on android is a solid app with seamles...
4,0,despite these it remains a commendable app des...,chatgpt on android is a solid app with seamles...
...,...,...,...
34299,30912,first review,first review
34300,30916,first downloaded,first downloaded
34301,30918,usually app,usually app
34302,30921,first comment,first comment


In [None]:
pd.to_datetime(df_long_plus['at']).dt.date.min()

datetime.date(2023, 7, 25)

In [None]:
# df_long_plus.to_csv("../data/long/ChatGPT-play-reviews_sentiment.csv", index=False)

In [None]:
df_long_plus.columns

Index(['Unnamed: 0', 'reviewId', 'userName', 'content', 'score_x',
       'thumbsUpCount', 'at', 'replyContent', 'repliedAt', 'appVersion',
       'at_ymd', 'at_q', 'at_ym', 'at_m', 'at_wd', 'score_cat_x',
       'detected_language_x', 'index', 'sentence', 'score_y', 'score_cat_y',
       'detected_language_y', 'positive_score', 'neutral_score',
       'negative_score', 'category_negative', 'category_neutral',
       'category_positive', 'vader_cat', 'vader_neg', 'vader_neu', 'vader_pos',
       'blob_polarity', 'blob_subjectivity', 'pipe_cat', 'pipe_neg',
       'pipe_pos'],
      dtype='object')