In [1]:
!pip install pyspellchecker
!pip install langdetect
!pip install demoji



In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from spellchecker import SpellChecker
from langdetect import detect


from tqdm import tqdm

import nltk
import re
import demoji
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt') 

[nltk_data] Downloading package wordnet to /Users/janice/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/janice/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/janice/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df_google = pd.read_csv("../data/ChatGPT-play-reviews.csv", 
                 parse_dates=["at", "repliedAt"])

df_apple = pd.read_csv("../data/all_apple_reviews_janice.csv", parse_dates=["date"])

df_apple["content"] = df_apple["title"] + ". " + df_apple["review"]
df_apple["Source"] = "Apple"

df_google = df_google.drop('reviewCreatedVersion', axis=1)
df_google["Source"] = "Google"

In [4]:
print(df_apple.columns, df_apple.shape)
print(df_google.columns, df_google.shape)

Index(['Unnamed: 0', 'date', 'review', 'rating', 'isEdited', 'userName',
       'title', 'country', 'content', 'Source'],
      dtype='object') (8620, 10)
Index(['reviewId', 'userName', 'content', 'score', 'thumbsUpCount', 'at',
       'replyContent', 'repliedAt', 'appVersion', 'Source'],
      dtype='object') (30956, 10)


In [5]:
df_apple = df_apple.drop(
    columns=['Unnamed: 0', 'isEdited', 'country', 'title', 'review']).rename(
    {'date': 'at', 'review': 'content', 'rating': 'score'}, axis='columns')

In [45]:
df = pd.concat([df_apple, df_google], axis = 0)

In [46]:
# var for review received response
df["reply"] = np.where(df.replyContent.isnull(), 0, 1)
df['score'] = df['score'].astype(int)
df['Reviews'] = 1

In [47]:
# generate new date features from at
df['at_ymd'] = df['at'].dt.strftime('%D')
# Create new column for year-quarter
df['at_q'] = df['at'].dt.quarter
# Create new column for year-month
df['at_ym'] = df['at'].dt.strftime('%Y-%m')
# Create new column for month
df['at_m'] = df['at'].dt.strftime('%B')
# Create new column for year-month
df['at_wd'] = df['at'].dt.strftime('%A')
df['at_w'] = df['at'].dt.isocalendar().week
df['at'] = df['at'].dt.date

In [48]:
# display number of missing values per column
df.isna().sum()

at                   0
score                0
userName             1
content              0
Source               0
reviewId          8620
thumbsUpCount     8620
replyContent     39144
repliedAt        39144
appVersion       13534
reply                0
Reviews              0
at_ymd               0
at_q                 0
at_ym                0
at_m                 0
at_wd                0
at_w                 0
dtype: int64

Cleaning Costumer Reviews: 
Remove URLs, emails, phone numbers & punctuations.
Remove tags, emojis, symbols & pictographs.
Remove stop words.
Convert to lowercase and lemmatization.
Duplicates removal.
Spell checking.
Non-English reviews removal.
Remove stop words. 

In [49]:
df["score_cat"] = np.where(df.score == 5, "positive", np.where(df.score == 4, "neutral", "negative"))

### Remove emojis and symbols, standardize mentions of ChatGPT and OpenAI

In [50]:
def pre_process(text):

    # remove emojis
    emoji_pattern = re.compile("["
         u"\U00002700-\U000027BF"  # Dingbats
         u"\U0001F600-\U0001F64F"  # Emoticons
         u"\U00002600-\U000026FF"  # Miscellaneous Symbols
         u"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols And Pictographs
         u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
         u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
         u"\U0001F680-\U0001F6FF"  # Transport and Map Symbols
                       "]+", re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    dem = demoji.findall(text)
    for item in dem.keys():
        text = text.replace(item, '')
    
    # remove all characters that are not alphanumeric
    #text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # remove symbols
    symbol_pattern = re.compile(r'[@#$%^&*()_+{}\[\]"\<>,/\\|`~]+')
    text = symbol_pattern.sub(r'', text)

    # remove - 
    dash_pattern = re.compile(r'-+')
    text = dash_pattern.sub(r'', text)

    #split the string into separate tokens
    tokens = re.split(r"\s+",text)

    # normalise all words into lowercase
    text = " ".join([t.lower() for t in tokens])

    # standardize
    text = text.replace("chat gpt", "chatgpt")
    text = text.replace("open ai", "openai")

    # return final list of tokens
    return text

In [51]:
df['content'] = df['content'].apply(pre_process)

In [52]:
df['content'].tail(90)

30866         yo tengo
30867    thank chatgpt
30868       1st review
30869         just wow
30870     تطبيق احتراف
             ...      
30951             ️️️️
30952             ️️️️
30953             ️️️️
30954                ️
30955                5
Name: content, Length: 90, dtype: object

In [53]:
# test_text = "Amaznig and extremely handy app for many uses.... 🤍 it's like an extension of one's fingers ⭐️⭐️. #ChatGPT Chat GPT OpenAI Open AI HTML Google"

In [54]:
#corrected_text = pre_process(test_text)
#print(test_text)
#print(corrected_text)

In [55]:
# def remove_hashtags(text): 
#     hashtag_pattern = re.compile(r'#\S+')
#     return hashtag_pattern.sub('', text)
# df['content'] = df['content'].apply(remove_hashtags)

### Remove duplicates

In [56]:
count_duplicates = df.duplicated().sum()
count_duplicates

0

### Remove NAs

In [57]:
df.isna().sum()

at                   0
score                0
userName             1
content              0
Source               0
reviewId          8620
thumbsUpCount     8620
replyContent     39144
repliedAt        39144
appVersion       13534
reply                0
Reviews              0
at_ymd               0
at_q                 0
at_ym                0
at_m                 0
at_wd                0
at_w                 0
score_cat            0
dtype: int64

### Detect language 

In [58]:
def detect_language(comment):
    try:
        return detect(comment)
    except:
        return 'unknown' 

In [59]:
df['detected_language'] = df['content'].apply(detect_language)

In [60]:
df['detected_language'].unique()

array(['en', 'af', 'fr', 'no', 'nl', 'da', 'cy', 'it', 'et', 'tl', 'ro',
       'sv', 'ca', 'hr', 'sw', 'sl', 'so', 'hu', 'pl', 'cs', 'de',
       'unknown', 'id', 'sk', 'fi', 'zh-cn', 'pt', 'es', 'lt', 'ru', 'lv',
       'vi', 'sq', 'ar', 'tr', 'ja', 'ko', 'ur', 'uk', 'fa', 'ml', 'ta',
       'hi', 'bn', 'ne', 'el', 'te', 'gu', 'mr', 'th', 'kn', 'he', 'bg',
       'mk'], dtype=object)

In [61]:
lang = df[df['detected_language'] == "en"]
lang.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27922 entries, 0 to 30923
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   at                 27922 non-null  object        
 1   score              27922 non-null  int64         
 2   userName           27921 non-null  object        
 3   content            27922 non-null  object        
 4   Source             27922 non-null  object        
 5   reviewId           21344 non-null  object        
 6   thumbsUpCount      21344 non-null  float64       
 7   replyContent       388 non-null    object        
 8   repliedAt          388 non-null    datetime64[ns]
 9   appVersion         18092 non-null  object        
 10  reply              27922 non-null  int64         
 11  Reviews            27922 non-null  int64         
 12  at_ymd             27922 non-null  object        
 13  at_q               27922 non-null  int32         
 14  at_ym      

In [62]:
#df.to_csv("../data/chatgpt_short_clean_all_languages.csv")

### Split into short (review-wise) and long format (sentence-wise)

In [63]:
# Function to split text into sentences
def split_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

# Split sentences and create a new DataFrame in long format
new_rows = []
for index, row in df.iterrows():
    sentences = split_sentences(row['content'])
    score = row['score']
    score_cat = row['score_cat']
    detected_language = row['detected_language']
    for sentence in sentences:
        new_rows.append({'index': index, 'content': sentence, 'score': score, 'score_cat': score_cat, 'detected_language': detected_language})

df_long = pd.DataFrame(new_rows)

# Print the resulting DataFrame in long format
print(df_long[df_long['index'] == 5])

       index                                            content  score  \
75         5                absolute game changer… astounding!.      5   
76         5  once you understand how to input and work with...      5   
77         5     i find myself using it more than i use google.      5   
78         5  it has given me back hours an hour of time tha...      5   
79         5  i can’t even share an effective analogy… maybe...      5   
80         5                      the difference is staggering.      5   
81         5  the only thing i would say is you don’t use ch...      5   
82         5              the thinking is completely different.      5   
83         5  it is a literal conversation that continues wh...      5   
84         5  you start carving away and refining and refini...      5   
85         5  i’m at the point where i can do this in a litt...      5   
86         5  when i’m working with chatgpt it feels like i’...      5   
28588      5                          

### Spell checking

In [64]:
df_en = df[df['detected_language'] == "en"]
df_long_en = df_long[df_long['detected_language'] == "en"]

In [65]:
spell = SpellChecker()
exceptions = ["chatgpt", "openai", "gpt", "html", "css", "javascript", "microsoft"]

In [66]:
def correct_spelling(text): 
    words = text.split()
    corrected_text = []
    for word in words:
        if word in exceptions:
            corrected_text.append(word)
        else: 
            corrected_word = spell.correction(word)
            if corrected_word is not None: 
                corrected_text.append(corrected_word)
            else:
                corrected_text.append(word)
    corrected_text = " ".join(corrected_text)
    return corrected_text

In [67]:
# corrected_text2 = correct_spelling(corrected_text)
# print(test_text)
# print(corrected_text)
# print(corrected_text2)

In [68]:
df_en['content'] = df_en['content'].apply(correct_spelling)
df_long_en['content'] = df_long_en['content'].apply(correct_spelling)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_en['content'] = df_en['content'].apply(correct_spelling)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_long_en['content'] = df_long_en['content'].apply(correct_spelling)


### Save dataframes

In [69]:
df_en.to_csv("../data/chatgpt_short_clean_combined_en.csv")
df_long_en.to_csv("../data/chatgpt_long_clean_combined_en.csv")