In [198]:
!pip install pyspellchecker
!pip install langdetect
!pip install demoji



In [199]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from spellchecker import SpellChecker
from langdetect import detect


from tqdm import tqdm

import nltk
import re
import demoji
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt') 

[nltk_data] Downloading package wordnet to /Users/janice/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/janice/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/janice/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [228]:
df_google = pd.read_csv("../data/ChatGPT-play-reviews.csv", 
                 parse_dates=["at", "repliedAt"])

#df_apple_1 = pd.read_csv("../data/all_apple_reviews_2023-11-09.csv", parse_dates=["date"])
df_apple_1 = pd.read_csv("../data/all_apple_reviews_janice.csv", parse_dates=["date"])
df_apple_2 = pd.read_csv("../data/all_apple_reviews_09_11_2023.csv", parse_dates=["date"])

df_apple = pd.concat([df_apple_1, df_apple_2])

df_apple["content"] = df_apple["title"] + ". " + df_apple["review"]
df_apple["Source"] = "Apple"

df_google = df_google.drop('reviewCreatedVersion', axis=1)
df_google["Source"] = "Google"

In [229]:
print(df_apple.columns, df_apple.shape)
print(df_google.columns, df_google.shape)

Index(['Unnamed: 0', 'date', 'review', 'rating', 'isEdited', 'userName',
       'title', 'country', 'content', 'Source'],
      dtype='object') (22698, 10)
Index(['reviewId', 'userName', 'content', 'score', 'thumbsUpCount', 'at',
       'replyContent', 'repliedAt', 'appVersion', 'Source'],
      dtype='object') (30956, 10)


In [230]:
df_apple.groupby('country').count()

Unnamed: 0_level_0,Unnamed: 0,date,review,rating,isEdited,userName,title,content,Source
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AE,111,111,111,111,111,111,111,111,111
AL,8,8,8,8,8,8,8,8,8
AM,2,2,2,2,2,2,2,2,2
AO,14,14,14,14,14,14,14,14,14
AR,84,84,84,84,84,84,84,84,84
...,...,...,...,...,...,...,...,...,...
pk,74,74,74,74,74,74,74,74,74
ru,1,1,1,1,1,1,1,1,1
tr,297,297,297,297,297,297,297,297,297
us,9678,9678,9678,9678,9678,9678,9678,9678,9678


In [231]:
df_apple = df_apple.drop(
    columns=['Unnamed: 0', 'isEdited', 'country', 'title', 'review']).rename(
    {'date': 'at', 'review': 'content', 'rating': 'score'}, axis='columns')

In [232]:
df = pd.concat([df_apple, df_google], axis = 0)

In [233]:
# var for review received response
df["reply"] = np.where(df.replyContent.isnull(), 0, 1)
df['score'] = df['score'].astype(int)
df['Reviews'] = 1
df['Original content'] = df['content']
df["score_cat"] = np.where(df.score == 5, "positive", np.where(df.score == 4, "neutral", "negative"))

In [234]:
# generate new date features from at
df['at_ymd'] = df['at'].dt.strftime('%D')
# Create new column for year-quarter
df['at_q'] = df['at'].dt.quarter
# Create new column for year-month
df['at_ym'] = df['at'].dt.strftime('%Y-%m')
# Create new column for month
df['at_m'] = df['at'].dt.strftime('%B')
# Create new column for year-month
df['at_wd'] = df['at'].dt.strftime('%A')
df['at_w'] = df['at'].dt.isocalendar().week
#df['at'] = df['at'].dt.date

In [235]:
# display number of missing values per column
df.isna().sum()

at                      0
score                   0
userName                1
content                 0
Source                  0
reviewId            22698
thumbsUpCount       22698
replyContent        53222
repliedAt           53222
appVersion          27612
reply                   0
Reviews                 0
Original content        0
at_ymd                  0
at_q                    0
at_ym                   0
at_m                    0
at_wd                   0
at_w                    0
dtype: int64

Cleaning Costumer Reviews: 
Remove URLs, emails, phone numbers & punctuations.
Remove tags, emojis, symbols & pictographs.
Remove stop words.
Convert to lowercase and lemmatization.
Duplicates removal.
Spell checking.
Non-English reviews removal.
Remove stop words. 

### Remove duplicates

In [237]:
count_duplicates = df.duplicated().sum()
count_duplicates

8592

In [None]:
df.drop_duplicates(inplace=True)

### Remove NAs

In [238]:
df.isna().sum()

at                      0
score                   0
userName                1
content                 0
Source                  0
reviewId            22698
thumbsUpCount       22698
replyContent        53222
repliedAt           53222
appVersion          27612
reply                   0
Reviews                 0
Original content        0
at_ymd                  0
at_q                    0
at_ym                   0
at_m                    0
at_wd                   0
at_w                    0
dtype: int64

### Remove emojis and symbols, standardize mentions of ChatGPT and OpenAI

In [212]:
def pre_process(text):

    # remove emojis
    emoji_pattern = re.compile("["
         u"\U00002700-\U000027BF"  # Dingbats
         u"\U0001F600-\U0001F64F"  # Emoticons
         u"\U00002600-\U000026FF"  # Miscellaneous Symbols
         u"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols And Pictographs
         u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
         u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
         u"\U0001F680-\U0001F6FF"  # Transport and Map Symbols
                       "]+", re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    dem = demoji.findall(text)
    for item in dem.keys():
        text = text.replace(item, '')
    
    # remove all characters that are not alphanumeric
    #text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # remove symbols
    symbol_pattern = re.compile(r'[@#$%^&*()_+{}\[\]"\<>,/\\|`~]+')
    text = symbol_pattern.sub(r'', text)

    # remove - 
    dash_pattern = re.compile(r'-+')
    text = dash_pattern.sub(r'', text)

    #split the string into separate tokens
    tokens = re.split(r"\s+",text)

    # normalise all words into lowercase
    text = " ".join([t.lower() for t in tokens])

    # standardize
    text = text.replace("chat gpt", "chatgpt")
    text = text.replace("open ai", "openai")

    # return final list of tokens
    return text

SyntaxError: invalid syntax (4251488716.py, line 1)

In [213]:
tqdm.pandas()
df['content'] = df['content'].progress_apply(pre_process)

100%|██████████| 39576/39576 [00:17<00:00, 2323.65it/s] 


In [214]:
df['content'].tail(90)

30866         yo tengo
30867    thank chatgpt
30868       1st review
30869         just wow
30870     تطبيق احتراف
             ...      
30951             ️️️️
30952             ️️️️
30953             ️️️️
30954                ️
30955                5
Name: content, Length: 90, dtype: object

In [162]:
# test_text = "Amaznig and extremely handy app for many uses.... 🤍 it's like an extension of one's fingers ⭐️⭐️. #ChatGPT Chat GPT OpenAI Open AI HTML Google"

In [163]:
#corrected_text = pre_process(test_text)
#print(test_text)
#print(corrected_text)

In [164]:
# def remove_hashtags(text): 
#     hashtag_pattern = re.compile(r'#\S+')
#     return hashtag_pattern.sub('', text)
# df['content'] = df['content'].apply(remove_hashtags)

### Detect language 

In [217]:
def detect_language(comment):
    try:
        return detect(comment)
    except:
        return 'unknown' 

In [218]:
tqdm.pandas()
df['detected_language'] = df['content'].progress_apply(detect_language)

100%|██████████| 39576/39576 [02:16<00:00, 290.23it/s]


In [219]:
df['detected_language'].unique()

array(['en', 'nl', 'fr', 'no', 'af', 'da', 'cy', 'it', 'tl', 'ro', 'et',
       'sv', 'ca', 'hr', 'sl', 'sw', 'so', 'hu', 'pl', 'de', 'unknown',
       'id', 'sq', 'sk', 'fi', 'zh-cn', 'ko', 'pt', 'es', 'lt', 'ru',
       'lv', 'vi', 'ar', 'tr', 'ja', 'ur', 'uk', 'fa', 'cs', 'ml', 'ta',
       'hi', 'bn', 'ne', 'el', 'te', 'gu', 'mr', 'th', 'kn', 'he', 'bg',
       'mk'], dtype=object)

In [220]:
lang = df[df['detected_language'] == "en"]
lang.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27949 entries, 0 to 30923
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   at                 27949 non-null  object        
 1   score              27949 non-null  int64         
 2   userName           27948 non-null  object        
 3   content            27949 non-null  object        
 4   Source             27949 non-null  object        
 5   reviewId           21370 non-null  object        
 6   thumbsUpCount      21370 non-null  float64       
 7   replyContent       387 non-null    object        
 8   repliedAt          387 non-null    datetime64[ns]
 9   appVersion         18095 non-null  object        
 10  reply              27949 non-null  int64         
 11  Reviews            27949 non-null  int64         
 12  Original content   27949 non-null  object        
 13  at_ymd             27949 non-null  object        
 14  at_q       

In [171]:
#df.to_csv("../data/chatgpt_short_clean_all_languages.csv")

### Split into short (review-wise) and long format (sentence-wise)

In [172]:
df_en = df[df['detected_language'] == "en"]

In [221]:
df_en_long = df_en

# Function to apply nltk.sent_tokenize() to a text column and create a new row for each sentence
def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

# Apply the function to the 'text_column' and explode the list of rows into individual rows
tqdm.pandas()
df_en_long['sentence'] = df_en_long.content.progress_apply(tokenize_sentences)
df_en_long = df_en_long.explode('sentence')

100%|██████████| 39576/39576 [00:00<00:00, 44548.17it/s]


### Spell checking

In [173]:
spell = SpellChecker()
exceptions = ["chatgpt", "openai", "gpt", "html", "css", "javascript", "microsoft", "elon"]

In [174]:
def correct_spelling(text): 
    words = text.split()
    corrected_text = []
    for word in words:
        if word in exceptions:
            corrected_text.append(word)
        else: 
            corrected_word = spell.correction(word)
            if corrected_word is not None: 
                corrected_text.append(corrected_word)
            else:
                corrected_text.append(word)
    corrected_text = " ".join(corrected_text)
    return corrected_text

In [175]:
# corrected_text2 = correct_spelling(corrected_text)
# print(test_text)
# print(corrected_text)
# print(corrected_text2)

In [176]:
tqdm.pandas()
df_en['content'] = df_en['content'].progress_apply(correct_spelling)
df_en_long['sentence'] = df_en_long['sentence'].progress_apply(correct_spelling)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_en['content'] = df_en['content'].apply(correct_spelling)


### Save dataframes

In [182]:
df_en.to_csv("../data/chatgpt_short_clean_combined_en.csv")
df_en_long.to_csv("../data/long/chatgpt_clean_combined_en.csv")

In [197]:
df_en_long

Unnamed: 0,at,score,userName,content,Source,reviewId,thumbsUpCount,replyContent,repliedAt,appVersion,...,Original content,at_ymd,at_q,at_ym,at_m,at_wd,at_w,score_cat,detected_language,sentence
0,2023-09-12,5,Tj77883344,love it been using it since last years very much love the app way better than google very realistic once it's fed enough data i use it mostly for psychological questions essentially for advice on emotions situations anything of that sort i've done this for long enough that it is very human like it follows a set of rules i've ingrained in it it's spoken very blunt truths on my topics and what its observations are i won't lie once you've spoken to it enough made it clear what you want and have...,Apple,,,,NaT,,...,"Love it, been using it since last year. Very much love the app. Way better than Google, very realistic once it’s fed enough data. I use it mostly for psychological questions, essentially for advice on emotions, situations, anything of that sort. I’ve done this for long enough that it is very human like. It follows a set of rules I’ve engrained in it. It’s spoken very blunt truths on my topics and what its observations are. I won’t lie, once you’ve spoken to it enough, made it clear what you ...",09/12/23,3,2023-09,September,Tuesday,37,positive,en,love it been using it since last years very much love the app way better than google very realistic once it's fed enough data i use it mostly for psychological questions essentially for advice on emotions situations anything of that sort i've done this for long enough that it is very human like it follows a set of rules i've ingrained in it it's spoken very blunt truths on my topics and what its observations are i won't lie once you've spoken to it enough made it clear what you want and have...
1,2023-09-10,4,PDXJavaJunkie,much more accessible for blind users than the web version up to this point i've mostly been using chatgpt on my windows desktop using google chrome while it's doable screen reader navigation is pretty difficult on the desktop site and you really have to be an advanced user to find your way through it i have submitted numerous feedback to openai about this but nothing has changed on that front well the good news i the is app pretty much addresses all of those problems the i seems really clean...,Apple,,,,NaT,,...,"Much more accessible for blind users than the web version. Up to this point I’ve mostly been using ChatGPT on my windows desktop using Google Chrome. While it’s doable, screen reader navigation is pretty difficult on the desktop site and you really have to be an advanced user to find your way through it. I have submitted numerous feedbacks to open AI about this but nothing has changed on that front. Well, the good news – the iOS app pretty much addresses all of those problems. The UI seems r...",09/10/23,3,2023-09,September,Sunday,36,neutral,en,much more accessible for blind users than the web version up to this point i've mostly been using chatgpt on my windows desktop using google chrome while it's doable screen reader navigation is pretty difficult on the desktop site and you really have to be an advanced user to find your way through it i have submitted numerous feedback to openai about this but nothing has changed on that front well the good news i the is app pretty much addresses all of those problems the i seems really clean...
2,2023-07-11,4,Tinny tower addicted,much anticipated wasn't let down i've been a user since it's initial roll out and have been waiting for a mobile application ever since using the web app for reference i'm a software engineering student while working in it full time i have to say gpt is an crucial tools it takes far less time to get information quickly that you'd otherwise have to source from stackoverflow various redhats articles bunt articles searching through software documentation microsoft documentation eat typically ch...,Apple,,,,NaT,,...,"Much anticipated, wasn’t let down.. I’ve been a user since it’s initial roll out and have been waiting for a mobile application ever since using the web app. For reference I’m a software engineering student while working in IT full time. I have to say GPT is an crucial tool. It takes far less time to get information quickly that you’d otherwise have to source from stack-overflow, various red-hat articles, Ubuntu articles, searching through software documentation, Microsoft documentation ect...",07/11/23,3,2023-07,July,Tuesday,28,neutral,en,much anticipated wasn't let down i've been a user since it's initial roll out and have been waiting for a mobile application ever since using the web app for reference i'm a software engineering student while working in it full time i have to say gpt is an crucial tools it takes far less time to get information quickly that you'd otherwise have to source from stackoverflow various redhats articles bunt articles searching through software documentation microsoft documentation eat typically ch...
3,2023-05-27,4,Would Po,4.5 stars here's why i recently downloaded the app and overall it's a great platform with excellent potential however i did encounter a couple of issues with logging in that i feel need to be addressed firstly the login process was somewhat cumbersome it took me a few attempts to successfully log in as the app didn't always recognize my credentials right away this could be improved by streamlining the login flow and ensuring a smoother user experience secondly the app occasionally experience...,Apple,,,,NaT,,...,"4.5 stars, here’s why. I recently downloaded the app and overall, it's a great platform with excellent potential. However, I did encounter a couple of issues with logging in that I feel need to be addressed. Firstly, the login process was somewhat cumbersome. It took me a few attempts to successfully log in, as the app didn't always recognize my credentials right away. This could be improved by streamlining the login flow and ensuring a smoother user experience. Secondly, the app occasionall...",05/27/23,2,2023-05,May,Saturday,21,neutral,en,4.5 stars here's why i recently downloaded the app and overall it's a great platform with excellent potential however i did encounter a couple of issues with logging in that i feel need to be addressed firstly the login process was somewhat cumbersome it took me a few attempts to successfully log in as the app didn't always recognize my credentials right away this could be improved by streamlining the login flow and ensuring a smoother user experience secondly the app occasionally experience...
4,2023-09-20,5,MORE FREE PAINTINGS!!!,always quick oh my gosh i cannot explain how much i've use this app it's almost like a faster google and more accurate the amount of things this app can do is amazing you can tell her to write a story about anything and it will write that story ask for healthy recipes it'll give it to you even like math problems it is so accurate it's insane like i was sending stories to my friend and she was like oh my gosh how are you doing that how are you doing that and i like chatgpt it's like the best ...,Apple,,,,NaT,,...,"Always quick!. Oh my gosh, I cannot explain how much I’ve use this app. It’s almost like a faster Google and more accurate. The amount of things this app can do is amazing. You can tell her to write a story about anything and it will write that story. Ask for healthy recipes it’ll give it to you even like math problems. It is so accurate it’s insane like I was sending stories to my friend and she was like oh my gosh how are you doing that? How are you doing that and I like ChatGPT it’s like ...",09/20/23,3,2023-09,September,Wednesday,38,positive,en,always quick oh my gosh i cannot explain how much i've use this app it's almost like a faster google and more accurate the amount of things this app can do is amazing you can tell her to write a story about anything and it will write that story ask for healthy recipes it'll give it to you even like math problems it is so accurate it's insane like i was sending stories to my friend and she was like oh my gosh how are you doing that how are you doing that and i like chatgpt it's like the best ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30912,2023-07-25,5,Yeamin Hossain (Limon),first review,Google,6bed36cc-bc58-479e-b395-6644d15adeb8,0.0,,NaT,,...,First Review,07/25/23,3,2023-07,July,Tuesday,30,positive,en,first review
30916,2023-07-25,5,Sarvesh Soni,first downloaded,Google,2ecc7803-920c-4f95-8b15-db9c7b1caa8a,0.0,,NaT,,...,First downloader,07/25/23,3,2023-07,July,Tuesday,30,positive,en,first downloaded
30918,2023-07-25,5,mostafijur rahman,usually app,Google,0530373c-1bfc-45d2-9dec-9fc0bb9cff4d,0.0,,NaT,,...,Usually app,07/25/23,3,2023-07,July,Tuesday,30,positive,en,usually app
30921,2023-07-25,5,Carter Gledhill,first comment,Google,4775c835-38dd-48b8-8bf0-c3f38fe8794d,0.0,,NaT,,...,First comment,07/25/23,3,2023-07,July,Tuesday,30,positive,en,first comment
