In [242]:
!pip install spellchecker
!pip install langdetect
!pip install demoji



In [243]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from spellchecker import SpellChecker
from langdetect import detect


from tqdm import tqdm

import nltk
import re
import demoji
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt') 

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/martjebuss/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/martjebuss/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [244]:
# Load df
df = pd.read_csv(
    "../data/ChatGPT-play-reviews.csv",
    encoding="utf-8", parse_dates=["at", "repliedAt"]
)

In [245]:
df = df.drop('reviewCreatedVersion', axis=1)

In [246]:
# generate new date features from at
df['at_ymd'] = df['at'].dt.strftime('%D')
# Create new column for year-quarter
df['at_q'] = df['at'].dt.quarter
# Create new column for year-month
df['at_ym'] = df['at'].dt.strftime('%Y-%m')
# Create new column for month
df['at_m'] = df['at'].dt.strftime('%B')
# Create new column for year-month
df['at_wd'] = df['at'].dt.strftime('%A')

In [221]:
# display number of missing values per column
df.isna().sum()

reviewId             0
userName             1
content              0
score                0
thumbsUpCount        0
at                   0
replyContent     30524
repliedAt        30524
appVersion        4914
at_ymd               0
at_q                 0
at_ym                0
at_m                 0
at_wd                0
dtype: int64

Cleaning Costumer Reviews: 
Remove URLs, emails, phone numbers & punctuations.
Remove tags, emojis, symbols & pictographs.
Remove stop words.
Convert to lowercase and lemmatization.
Duplicates removal.
Spell checking.
Non-English reviews removal.
Remove stop words. 

In [247]:
df["score_cat"] = np.where(df.score == 5, "positive", np.where(df.score == 4, "neutral", "negative"))

### Remove emojis and symbols, standardize mentions of ChatGPT and OpenAI

In [248]:
def pre_process(text):

    # remove emojis
    emoji_pattern = re.compile("["
         u"\U00002700-\U000027BF"  # Dingbats
         u"\U0001F600-\U0001F64F"  # Emoticons
         u"\U00002600-\U000026FF"  # Miscellaneous Symbols
         u"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols And Pictographs
         u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
         u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
         u"\U0001F680-\U0001F6FF"  # Transport and Map Symbols
                       "]+", re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    dem = demoji.findall(text)
    for item in dem.keys():
        text = text.replace(item, '')
    
    # remove all characters that are not alphanumeric
    #text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # remove symbols
    symbol_pattern = re.compile(r'[@#$%^&*()_+{}\[\]"\<>,/\\|`~]+')
    text = symbol_pattern.sub(r'', text)

    # remove - 
    dash_pattern = re.compile(r'-+')
    text = dash_pattern.sub(r'', text)

    #split the string into separate tokens
    tokens = re.split(r"\s+",text)

    # normalise all words into lowercase
    text = " ".join([t.lower() for t in tokens])

    # standardize
    text = text.replace("chat gpt", "chatgpt")
    text = text.replace("open ai", "openai")

    # return final list of tokens
    return text

In [249]:
df['content'] = df['content'].apply(pre_process)

In [250]:
df['content'].tail(90)

30866         yo tengo
30867    thank chatgpt
30868       1st review
30869         just wow
30870     تطبيق احتراف
             ...      
30951             ️️️️
30952             ️️️️
30953             ️️️️
30954                ️
30955                5
Name: content, Length: 90, dtype: object

In [207]:
# test_text = "Amaznig and extremely handy app for many uses.... 🤍 it's like an extension of one's fingers ⭐️⭐️. #ChatGPT Chat GPT OpenAI Open AI HTML Google"

In [215]:
#corrected_text = pre_process(test_text)
#print(test_text)
#print(corrected_text)

Amaznig and extremely handy app for many uses.... 🤍 it's like an extension of one's fingers ⭐️⭐️. #ChatGPT Chat GPT OpenAI Open AI HTML Google
amaznig and extremely handy app for many uses.... it's like an extension of one's fingers ️️. chatgpt chatgpt openai openai html google


In [25]:
# def remove_hashtags(text): 
#     hashtag_pattern = re.compile(r'#\S+')
#     return hashtag_pattern.sub('', text)
# df['content'] = df['content'].apply(remove_hashtags)

### Remove duplicates

In [234]:
count_duplicates = df.duplicated().sum()
count_duplicates

0

### Remove NAs

In [254]:
df.isna().sum()

reviewId             0
userName             1
content              0
score                0
thumbsUpCount        0
at                   0
replyContent     30524
repliedAt        30524
appVersion        4914
at_ymd               0
at_q                 0
at_ym                0
at_m                 0
at_wd                0
score_cat            0
dtype: int64

### Detect language 

In [268]:
def detect_language(comment):
    try:
        return detect(comment)
    except:
        return 'unknown' 

In [269]:
df['detected_language'] = df['content'].apply(detect_language)

In [270]:
df['detected_language'].unique()

array(['en', 'ca', 'it', 'nl', 'fr', 'no', 'ro', 'af', 'da', 'sv', 'hr',
       'so', 'et', 'pt', 'sq', 'tl', 'id', 'hu', 'sw', 'de', 'cy', 'fi',
       'pl', 'es', 'sl', 'cs', 'lv', 'ml', 'unknown', 'sk', 'ta', 'tr',
       'lt', 'vi', 'ar', 'hi', 'fa', 'ur', 'ru', 'bn', 'ne', 'ja', 'uk',
       'el', 'zh-cn', 'ko', 'te', 'gu', 'mr', 'th', 'kn', 'he', 'bg',
       'mk'], dtype=object)

In [279]:
lang = df[df['detected_language'] == "en"]
lang.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21352 entries, 0 to 30923
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   reviewId           21352 non-null  object        
 1   userName           21351 non-null  object        
 2   content            21352 non-null  object        
 3   score              21352 non-null  int64         
 4   thumbsUpCount      21352 non-null  int64         
 5   at                 21352 non-null  datetime64[ns]
 6   replyContent       386 non-null    object        
 7   repliedAt          386 non-null    datetime64[ns]
 8   appVersion         18088 non-null  object        
 9   at_ymd             21352 non-null  object        
 10  at_q               21352 non-null  int32         
 11  at_ym              21352 non-null  object        
 12  at_m               21352 non-null  object        
 13  at_wd              21352 non-null  object        
 14  score_cat  

In [273]:
#df.to_csv("../data/chatgpt_short_clean_all_languages.csv")

### Split into short (review-wise) and long format (sentence-wise)

In [280]:
# Function to split text into sentences
def split_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

# Split sentences and create a new DataFrame in long format
new_rows = []
for index, row in df.iterrows():
    sentences = split_sentences(row['content'])
    score = row['score']
    score_cat = row['score_cat']
    detected_language = row['detected_language']
    for sentence in sentences:
        new_rows.append({'index': index, 'content': sentence, 'score': score, 'score_cat': score_cat, 'detected_language': detected_language})

df_long = pd.DataFrame(new_rows)

# Print the resulting DataFrame in long format
print(df_long[df_long['index'] == 5])

    index                                            content  score score_cat  \
34      5                                 seems to work now.      3  negative   
35      5                 app seems nice but has two issues.      3  negative   
36      5  the website mentions that users can enable voi...      3  negative   
37      5  also when using voice dictation to compose a p...      3  negative   

   detected_language  
34                en  
35                en  
36                en  
37                en  


### Spell checking

In [281]:
df_en = df[df['detected_language'] == "en"]
df_long_en = df_long[df_long['detected_language'] == "en"]

In [282]:
spell = SpellChecker()
exceptions = ["chatgpt", "openai", "gpt", "html", "css", "javascript"]

In [283]:
def correct_spelling(text): 
    words = text.split()
    corrected_text = []
    for word in words:
        if word in exceptions:
            corrected_text.append(word)
        else: 
            corrected_word = spell.correction(word)
            if corrected_word is not None: 
                corrected_text.append(corrected_word)
            else:
                corrected_text.append(word)
    corrected_text = " ".join(corrected_text)
    return corrected_text

In [199]:
# corrected_text2 = correct_spelling(corrected_text)
# print(test_text)
# print(corrected_text)
# print(corrected_text2)

In [284]:
df_en['content'] = df_en['content'].apply(correct_spelling)
df_long_en['content'] = df_long_en['content'].apply(correct_spelling)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_en['content'] = df_en['content'].apply(correct_spelling)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_long_en['content'] = df_long_en['content'].apply(correct_spelling)


### Save dataframes

In [None]:
df_en.to_csv("../data/chatgpt_short_clean_en.csv")
df_long_en.to_csv("../data/chatgpt_long_clean_en.csv")