# This notebook is about feature engineering with sentiment analysis and classification

#### Load useful libraries and df

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


from tqdm import tqdm

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /Users/janice/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/janice/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Load df
df = pd.read_csv(
    "../data/chatgpt_short_clean_en.csv",
    encoding="utf-8",
)

df_long = pd.read_csv(
    "../data/chatgpt_long_clean_en.csv",
    encoding="utf-8",
)


In [4]:
nltk.download('punkt')  # Download the punkt tokenizer data

# Function to split text into sentences
def split_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

# Split sentences and create a new DataFrame in long format
new_rows = []
for index, row in df.iterrows():
    sentences = split_sentences(row['content'])
    score = row['score']
    score_cat = row['score_cat']
    for sentence in sentences:
        new_rows.append({'reviewId': row['reviewId'], 
                         'index': index, 
                         'content': sentence})

df_long = pd.DataFrame(new_rows)

# Print the resulting DataFrame in long format
print(df_long[df_long['index'] == 5])

[nltk_data] Downloading package punkt to /Users/janice/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                               reviewId  index  \
6  0fc0b22a-4bc6-4fc0-a590-5fb6948e4d49      5   

                                             content  
6  seems to work now app seems nice but has two i...  


In [20]:
# Looking at a sample of our df
df_long.sample(10)

Unnamed: 0,reviewId,index,content
605,296fc8ab-b672-4341-a9e8-24c0f9d28824,512,the powered assistance is remarkably accurate ...
18454,a6ed8a13-2da3-415a-ab28-cd073b4673da,17220,the website is great i expect this to be great to
2544,9443c9dd-fc19-43f7-a296-d31ac395060f,2155,great and easy some times language auto detect...
16733,d79e0e66-c9eb-44a3-87b1-9fdb70343287,15560,best of all
19318,8cc923e5-7475-48a1-be4c-d766d46e30b9,18068,this is amazing i love it
5288,28d1ced2-29ca-4a14-bd60-60cd93bf8412,4632,initially good has all ans of que with logic .
8115,c0e4fedb-c05e-4997-bb5f-db93e0cb5149,7285,i must appreciate your efforts to make android...
4163,088cdb20-c28e-4b05-92b4-50a83c03835a,3604,great app however i would like to disable that...
9134,c1338028-4b1d-4dfc-a955-460e44bdf8cd,8249,good and so funny and so creative app
16113,4db3a726-fff0-4470-8c4a-325096a5fc57,14956,i love this app


In [6]:
from transformers import pipeline
#import torch

In [7]:
classifier = pipeline("zero-shot-classification")

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [8]:
#candidate_labels = ["feature evaluation", "praise", "bug report", "feature request", "performance", "usage"]
candidate_labels = ["positive", "neutral", "negative"]
hypothesis_template = "The sentiment of this review is {}."

In [9]:
#candidate_labels = list(category_map.values())
predictedCategories = []
ScoreNegative = []
ScoreNeutral = []
ScorePositive = []

for i in tqdm(range(len(df_long))): # len(df_long)
    text = df_long.iloc[i,]['content']
    res = classifier(text, candidate_labels, hypothesis_template=hypothesis_template, multi_label=True)
    labels = res['labels'] 
    scores = res['scores'] #extracting the scores associated with the labels
    res_dict = {label : score for label,score in zip(labels, scores)}
    sorted_dict = dict(sorted(res_dict.items(), key=lambda x:x[1],reverse = True)) #sorting the dictionary of labels in descending order based on their score
    categories  = next(k for i, (k,v) in enumerate(sorted_dict.items()))

    ScorePositive.append(sorted_dict['positive'])
    ScoreNeutral.append(sorted_dict['neutral'])
    ScoreNegative.append(sorted_dict['negative'])
    predictedCategories.append(categories)

  0%|          | 0/22651 [00:00<?, ?it/s]

100%|██████████| 22651/22651 [19:02:20<00:00,  3.03s/it]    


In [10]:
df_long_zero_prep = pd.DataFrame({  'category': predictedCategories, 
                                    'positive_score': ScorePositive, 
                                    'neutral_score': ScoreNeutral, 
                                    'negative_score': ScoreNegative})
df_long_zero = pd.merge(df_long, df_long_zero_prep, left_index=True, right_index=True)

In [11]:
df_long_zero[['content', 'positive_score', 'neutral_score', 'negative_score']]

Unnamed: 0,content,positive_score,neutral_score,negative_score
0,chatgpt on android is a solid app with seamles...,0.993198,0.068540,0.002369
1,i've been using chatgpt for a while but i've j...,0.994487,0.061399,0.000494
2,the chatgpt android app has completely blown m...,0.999619,0.043285,0.000417
3,no subscription free and accurate unbiased ans...,0.998958,0.952186,0.000676
4,i use this app for learning languages which ch...,0.006705,0.055749,0.153228
...,...,...,...,...
22646,first review,0.816886,0.671854,0.675763
22647,first downloaded,0.888082,0.590217,0.632305
22648,usually app,0.846591,0.450016,0.367502
22649,first comment,0.815224,0.392924,0.659263


In [12]:
df_long_zero

Unnamed: 0,reviewId,index,content,category,positive_score,neutral_score,negative_score
0,36b7f28e-151d-4b98-8a13-41bd017e0d25,0,chatgpt on android is a solid app with seamles...,positive,0.993198,0.068540,0.002369
1,2bc253b6-c804-47e9-b6f2-3a21027efab4,1,i've been using chatgpt for a while but i've j...,positive,0.994487,0.061399,0.000494
2,5f084727-ab85-40b3-bd42-a7a49502fc1f,2,the chatgpt android app has completely blown m...,positive,0.999619,0.043285,0.000417
3,5df90de5-b8e2-4dc2-b6ff-520aa3a25eae,3,no subscription free and accurate unbiased ans...,positive,0.998958,0.952186,0.000676
4,bb66c666-865d-4a31-b27f-4933df3ff829,4,i use this app for learning languages which ch...,negative,0.006705,0.055749,0.153228
...,...,...,...,...,...,...,...
22646,6bed36cc-bc58-479e-b395-6644d15adeb8,21347,first review,positive,0.816886,0.671854,0.675763
22647,2ecc7803-920c-4f95-8b15-db9c7b1caa8a,21348,first downloaded,positive,0.888082,0.590217,0.632305
22648,0530373c-1bfc-45d2-9dec-9fc0bb9cff4d,21349,usually app,positive,0.846591,0.450016,0.367502
22649,4775c835-38dd-48b8-8bf0-c3f38fe8794d,21350,first comment,positive,0.815224,0.392924,0.659263


In [16]:
df_long_zero.groupby('category').mean('category')

Unnamed: 0_level_0,index,positive_score,neutral_score,negative_score
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
negative,9039.328821,0.077446,0.324903,0.863499
neutral,9399.814126,0.304198,0.636833,0.252305
positive,10792.655991,0.957798,0.170101,0.033501


In [17]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt


#Confusion matrix using pandas crosstab
#conf_matrix = confusion_matrix(df_long_zero.score_cat, df_long_zero.category)
#sns.heatmap(conf_matrix, annot=True, cmap=plt.cm.Oranges, fmt='g'); 

Aggregating on review level

In [18]:
df_long_zero = pd.get_dummies(df_long_zero, columns=["category"], dtype=int)
df_long_zero
df_long_zero.to_csv("/Users/janice/Documents/Bootcamp/Git/Capstone/capstone_chat-gpt/data/ChatGPT-play-reviews_long-zero.csv")

In [22]:
df_zero_agg = df_long_zero.groupby(['index', 'reviewId']).agg( 
    negative_mean=('negative_score', 'mean'),
    neutral_mean=('neutral_score', 'mean'),
    positive_mean=('positive_score', 'mean'),
    negative_max=('negative_score', 'max'),
    neutral_max=('neutral_score', 'max'),
    positive_max=('positive_score', 'max'),
    negative_sum=('category_negative', 'sum'),
    neutral_sum=('category_neutral', 'sum'),
    positive_sum=('category_positive', 'sum')).reset_index()

In [23]:
df_zero_agg
df_zero_agg.to_csv("/Users/janice/Documents/Bootcamp/Git/Capstone/capstone_chat-gpt/data/ChatGPT-play-reviews_zero.csv", index=False)

## VADER (Valence Aware Dictionary for Sentiment Reasoning)

Good to know:

VADER (Valence Aware Dictionary for Sentiment Reasoning) is a model used for text sentiment analysis that is sensitive to both polarity (positive/negative) and intensity (strength) of emotion. It is available in the NLTK package and can be applied directly to unlabeled text data.

VADER sentimental analysis relies on a dictionary that maps lexical features to emotion intensities known as sentiment scores. The sentiment score of a text can be obtained by summing up the intensity of each word in the text.

For example, Words like ‘love’, ‘enjoy’, ‘happy’, ‘like’ all convey a positive sentiment. Also VADER is intelligent enough to understand the basic context of these words, such as “did not love” as a negative statement. It also understands the emphasis of capitalization and punctuation, such as “ENJOY”

Following code was inspired by the VADER documentation on Git:
https://github.com/cjhutto/vaderSentiment#code-examples

In [24]:
df_long_zero = pd.read_csv("../data/ChatGPT-play-reviews_long-zero.csv",
                            encoding="utf-8",
                        )

df_long_zero.shape

(22651, 10)

In [25]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    #note: depending on how you installed (e.g., using source code download versus pip install), you may need to import like this:
    #from vaderSentiment import SentimentIntensityAnalyzer

In [26]:
from nltk import tokenize

#candidate_labels = list(category_map.values())
vad_cat = []
vad_neg = []
vad_neu = []
vad_pos = []

for sentence in tqdm(df_long_zero['content']):
     sid = SentimentIntensityAnalyzer()
     ss = sid.polarity_scores(sentence)
     for k in sorted(ss):
         vad_cat.append(max(ss, key=ss.get))
         vad_neg.append(ss['neg'])
         vad_neu.append(ss['neu'])
         vad_pos.append(ss['pos'])

100%|██████████| 22651/22651 [02:34<00:00, 146.46it/s]


In [27]:
df_long_vader_prep = pd.DataFrame({ 'vader_cat': vad_cat, 
                                    'vader_neg': vad_neg, 
                                    'vader_neu': vad_neu, 
                                    'vader_pos': vad_pos})
df_long_vader= pd.merge(df_long_zero, df_long_vader_prep, left_index=True, right_index=True)

In [28]:
df_long_vader.to_csv("/Users/janice/Documents/Bootcamp/Git/Capstone/capstone_chat-gpt/data/ChatGPT-play-reviews_long-zero_vader.csv", index=False)

## Sentiment Analysis using TextBlob

In [29]:
from textblob import TextBlob

Polarity determines the sentiment of the text. Its values lie in [-1,1] where -1 denotes a highly negative sentiment and 1 denotes a highly positive sentiment.

Subjectivity determines whether a text input is factual information or a personal opinion. Its value lies between [0,1] where a value closer to 0 denotes a piece of factual information and a value closer to 1 denotes a personal opinion.

In [30]:
#candidate_labels = list(category_map.values())
blob_polarity = []
blob_subjectivity = []

for sentence in tqdm(df_long['content']):
     blob_polarity.append(TextBlob(sentence).sentiment.polarity)
     blob_subjectivity.append(TextBlob(sentence).sentiment.subjectivity)

100%|██████████| 22651/22651 [00:05<00:00, 4311.75it/s]


In [31]:
df_long_blob_prep = pd.DataFrame({ 'blob_polarity': blob_polarity, 
                                    'blob_subjectivity': blob_subjectivity})
df_long_blob= pd.merge(df_long_vader, df_long_blob_prep, left_index=True, right_index=True)

In [32]:
df_long_blob.to_csv("../data/ChatGPT-play-reviews_long-sentiment.csv", index=False)

## Sentiment Analysis using Transformer-Based Models

In [33]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [34]:
#candidate_labels = list(category_map.values())
pipe_cat = []
pipe_neg = []
pipe_pos = []

for sentence in tqdm(df_long['content']):
     res = sentiment_pipeline(sentence)
     if res[0]['label'] == 'POSITIVE':
          pipe_cat.append('POSITIVE')
          pipe_neg.append(1-res[0]['score'])
          pipe_pos.append(res[0]['score'])
     elif res[0]['label'] == 'NEGATIVE':
          pipe_cat.append('NEGATIVE')
          pipe_neg.append(res[0]['score'])
          pipe_pos.append(1-res[0]['score'])

100%|██████████| 22651/22651 [09:42<00:00, 38.88it/s]


In [35]:
df_long_pipe_prep = pd.DataFrame({ 'pipe_cat': pipe_cat, 
                                    'pipe_neg': pipe_neg, 
                                    'pipe_pos': pipe_pos})

# Load df
df_long_blob = pd.read_csv(
    "../data/ChatGPT-play-reviews_long-sentiment.csv"
)

df_long_pipe= pd.merge(df_long_blob, df_long_pipe_prep, left_index=True, right_index=True)

In [36]:
df_long_blob.shape

(22651, 16)

In [37]:
df_long_pipe.to_csv("../data/ChatGPT-play-reviews_long-sentiment.csv", index=False)

In [38]:
# df_long_zero = pd.get_dummies(df_long_zero, columns=["category"], dtype=int)

In [39]:

# df_long_zero
# df_long_zero.to_csv("/Users/janice/Documents/Bootcamp/Git/Capstone/capstone_chat-gpt/data/ChatGPT-play-reviews_long-zero.csv", index=False)
# df_zero_agg = df_long_zero.groupby(['index', 'score', 'score_cat']).agg( 
#     negative_mean=('negative_score', 'mean'),
#     neutral_mean=('neutral_score', 'mean'),
#     positive_mean=('positive_score', 'mean'),
#     negative_max=('negative_score', 'max'),
#     neutral_max=('neutral_score', 'max'),
#     positive_max=('positive_score', 'max'),
#     negative_sum=('category_negative', 'sum'),
#     neutral_sum=('category_neutral', 'sum'),
#     positive_sum=('category_positive', 'sum')).reset_index()

In [40]:
# Load df
df_long_pipe = pd.read_csv(
    "../data/ChatGPT-play-reviews_long-sentiment.csv"
)

In [41]:
df_long_pipe.shape

(22651, 19)

In [43]:
df_long_pipe.rename(columns={"content": "sentence"}).drop(['Unnamed: 0'], axis='columns')

Unnamed: 0,reviewId,index,sentence,positive_score,neutral_score,negative_score,category_negative,category_neutral,category_positive,vader_cat,vader_neg,vader_neu,vader_pos,blob_polarity,blob_subjectivity,pipe_cat,pipe_neg,pipe_pos
0,36b7f28e-151d-4b98-8a13-41bd017e0d25,0,chatgpt on android is a solid app with seamles...,0.993198,0.068540,0.002369,0,0,1,neu,0.030,0.891,0.079,0.069048,0.366667,NEGATIVE,0.745451,0.254549
1,2bc253b6-c804-47e9-b6f2-3a21027efab4,1,i've been using chatgpt for a while but i've j...,0.994487,0.061399,0.000494,0,0,1,neu,0.030,0.891,0.079,0.317063,0.618452,NEGATIVE,0.997383,0.002617
2,5f084727-ab85-40b3-bd42-a7a49502fc1f,2,the chatgpt android app has completely blown m...,0.999619,0.043285,0.000417,0,0,1,neu,0.030,0.891,0.079,0.439583,0.606250,POSITIVE,0.000360,0.999640
3,5df90de5-b8e2-4dc2-b6ff-520aa3a25eae,3,no subscription free and accurate unbiased ans...,0.998958,0.952186,0.000676,0,0,1,neu,0.030,0.891,0.079,0.296528,0.552778,POSITIVE,0.000836,0.999164
4,bb66c666-865d-4a31-b27f-4933df3ff829,4,i use this app for learning languages which ch...,0.006705,0.055749,0.153228,1,0,0,compound,0.012,0.773,0.215,0.330429,0.369643,POSITIVE,0.013996,0.986004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22646,6bed36cc-bc58-479e-b395-6644d15adeb8,21347,first review,0.816886,0.671854,0.675763,0,0,1,compound,0.000,0.571,0.429,0.250000,0.333333,POSITIVE,0.003007,0.996993
22647,2ecc7803-920c-4f95-8b15-db9c7b1caa8a,21348,first downloaded,0.888082,0.590217,0.632305,0,0,1,compound,0.000,0.571,0.429,0.250000,0.333333,POSITIVE,0.110960,0.889040
22648,0530373c-1bfc-45d2-9dec-9fc0bb9cff4d,21349,usually app,0.846591,0.450016,0.367502,0,0,1,neu,0.000,0.707,0.293,-0.250000,0.250000,POSITIVE,0.028938,0.971062
22649,4775c835-38dd-48b8-8bf0-c3f38fe8794d,21350,first comment,0.815224,0.392924,0.659263,0,0,1,neu,0.000,0.707,0.293,0.250000,0.333333,POSITIVE,0.049670,0.950330


In [52]:
df_long_plus = pd.merge(
    df.drop(['Unnamed: 0'], axis='columns'),
    df_long_pipe.rename(columns={"content": "sentence"}).drop(['Unnamed: 0'], axis='columns'),
    on='reviewId',
    left_index=False,
    right_index=False
)

In [54]:
pd.to_datetime(df_long_plus['at']).dt.date.min()

datetime.date(2023, 7, 25)

In [55]:
df_long_plus.to_csv("../data/ChatGPT-play-reviews_long-sentiment_plus.csv", index=False)

In [56]:
df_long_plus.columns

Index(['reviewId', 'userName', 'content', 'score', 'thumbsUpCount', 'at',
       'replyContent', 'repliedAt', 'appVersion', 'at_ymd', 'at_q', 'at_ym',
       'at_m', 'at_wd', 'score_cat', 'detected_language', 'index', 'sentence',
       'positive_score', 'neutral_score', 'negative_score',
       'category_negative', 'category_neutral', 'category_positive',
       'vader_cat', 'vader_neg', 'vader_neu', 'vader_pos', 'blob_polarity',
       'blob_subjectivity', 'pipe_cat', 'pipe_neg', 'pipe_pos'],
      dtype='object')

In [57]:
df_long_plus

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,at,replyContent,repliedAt,appVersion,at_ymd,...,category_positive,vader_cat,vader_neg,vader_neu,vader_pos,blob_polarity,blob_subjectivity,pipe_cat,pipe_neg,pipe_pos
0,36b7f28e-151d-4b98-8a13-41bd017e0d25,Lin Cheng,chatgpt on android is a solid app with seamles...,4,5,2023-10-19 19:26:19,,,1.2023.284,10/19/23,...,1,neu,0.030,0.891,0.079,0.069048,0.366667,NEGATIVE,0.745451,0.254549
1,2bc253b6-c804-47e9-b6f2-3a21027efab4,Alim,i've been using chatgpt for a while but i've j...,5,139,2023-09-29 20:24:38,,,1.2023.263,09/29/23,...,1,neu,0.030,0.891,0.079,0.317063,0.618452,NEGATIVE,0.997383,0.002617
2,5f084727-ab85-40b3-bd42-a7a49502fc1f,Theo Healy,the chatgpt android app has completely blown m...,4,247,2023-07-28 10:29:10,,,1.0.0023,07/28/23,...,1,neu,0.030,0.891,0.079,0.439583,0.606250,POSITIVE,0.000360,0.999640
3,5df90de5-b8e2-4dc2-b6ff-520aa3a25eae,Elliot Limberg,no subscription free and accurate unbiased ans...,5,272,2023-07-30 19:38:37,,,1.0.0023,07/30/23,...,1,neu,0.030,0.891,0.079,0.296528,0.552778,POSITIVE,0.000836,0.999164
4,bb66c666-865d-4a31-b27f-4933df3ff829,Phoebe Moraes,i use this app for learning languages which ch...,4,126,2023-08-09 18:23:33,,,1.0.0030,08/09/23,...,0,compound,0.012,0.773,0.215,0.330429,0.369643,POSITIVE,0.013996,0.986004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22646,6bed36cc-bc58-479e-b395-6644d15adeb8,Yeamin Hossain (Limon),first review,5,0,2023-07-25 17:47:09,,,,07/25/23,...,1,compound,0.000,0.571,0.429,0.250000,0.333333,POSITIVE,0.003007,0.996993
22647,2ecc7803-920c-4f95-8b15-db9c7b1caa8a,Sarvesh Soni,first downloaded,5,0,2023-07-25 17:08:33,,,,07/25/23,...,1,compound,0.000,0.571,0.429,0.250000,0.333333,POSITIVE,0.110960,0.889040
22648,0530373c-1bfc-45d2-9dec-9fc0bb9cff4d,mostafijur rahman,usually app,5,0,2023-07-25 17:15:19,,,,07/25/23,...,1,neu,0.000,0.707,0.293,-0.250000,0.250000,POSITIVE,0.028938,0.971062
22649,4775c835-38dd-48b8-8bf0-c3f38fe8794d,Carter Gledhill,first comment,5,0,2023-07-25 21:05:55,,,,07/25/23,...,1,neu,0.000,0.707,0.293,0.250000,0.333333,POSITIVE,0.049670,0.950330
