In [69]:
# !pip install pyspellchecker
# !pip install langdetect
# !pip install demoji

In [70]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from spellchecker import SpellChecker
from langdetect import detect


from tqdm import tqdm

import nltk
import re
import demoji
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt') 

[nltk_data] Downloading package wordnet to /Users/janice/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/janice/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/janice/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [71]:
df_google = pd.read_csv("../data/ChatGPT-play-reviews.csv", 
                 parse_dates=["at", "repliedAt"])

df_apple_1 = pd.read_csv("../data/all_apple_reviews-1_2023-11-09.csv", parse_dates=["date"])
df_apple_2 = pd.read_csv("../data/all_apple_reviews-2_2023-11-09.csv", parse_dates=["date"])

df_apple = pd.concat([df_apple_1, df_apple_2], join='inner')

df_apple["content"] = df_apple["title"] + ". " + df_apple["review"]
df_apple["Source"] = "Apple"

df_google = df_google.drop('reviewCreatedVersion', axis=1)
df_google["Source"] = "Google"

In [72]:
print(df_apple.columns, df_apple.shape)
print(df_google.columns, df_google.shape)

Index(['country', 'date', 'review', 'rating', 'isEdited', 'userName', 'title',
       'content', 'Source'],
      dtype='object') (28207, 9)
Index(['reviewId', 'userName', 'content', 'score', 'thumbsUpCount', 'at',
       'replyContent', 'repliedAt', 'appVersion', 'Source'],
      dtype='object') (30956, 10)


In [73]:
df_apple.groupby('country').count()

Unnamed: 0_level_0,date,review,rating,isEdited,userName,title,content,Source
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AE,111,111,111,111,111,111,111,111
AL,8,8,8,8,8,8,8,8
AM,2,2,2,2,2,2,2,2
AO,14,14,14,14,14,14,14,14
AR,84,84,84,84,84,84,84,84
...,...,...,...,...,...,...,...,...
us,10931,10931,10931,10931,10931,10931,10931,10931
uy,13,13,13,13,13,13,13,13
vn,40,40,40,40,40,40,40,40
za,94,94,94,94,94,94,94,94


In [74]:
df_apple = df_apple.drop(
    columns=['country', 'title', 'review']).rename(
    {'date': 'at', 'review': 'content', 'rating': 'score'}, axis='columns')

In [75]:
df = pd.concat([df_apple, df_google], axis = 0)

In [76]:
# var for review received response
df["reply"] = np.where(df.replyContent.isnull(), 0, 1)
df['score'] = df['score'].astype(int)
df['Reviews'] = 1
df['Original content'] = df['content']
df["score_cat"] = np.where(df.score == 5, "positive", np.where(df.score == 4, "neutral", "negative"))

In [77]:
# generate new date features from at
df['at_ymd'] = df['at'].dt.strftime('%D')
# Create new column for year-quarter
df['at_q'] = df['at'].dt.quarter
# Create new column for year-month
df['at_ym'] = df['at'].dt.strftime('%Y-%m')
# Create new column for month
df['at_m'] = df['at'].dt.strftime('%B')
# Create new column for year-month
df['at_wd'] = df['at'].dt.strftime('%A')
df['at_w'] = df['at'].dt.isocalendar().week
#df['at'] = df['at'].dt.date

In [78]:
# display number of missing values per column
df.isna().sum()

at                      0
score                   0
isEdited            30956
userName                1
content                 0
Source                  0
reviewId            28207
thumbsUpCount       28207
replyContent        58731
repliedAt           58731
appVersion          33121
reply                   0
Reviews                 0
Original content        0
score_cat               0
at_ymd                  0
at_q                    0
at_ym                   0
at_m                    0
at_wd                   0
at_w                    0
dtype: int64

Cleaning Costumer Reviews: 
Remove URLs, emails, phone numbers & punctuations.
Remove tags, emojis, symbols & pictographs.
Remove stop words.
Convert to lowercase and lemmatization.
Duplicates removal.
Spell checking.
Non-English reviews removal.
Remove stop words. 

### Remove duplicates

In [79]:
df.shape

(59163, 21)

In [80]:
count_duplicates = df.duplicated().sum()
count_duplicates

14075

In [81]:
df.sort_values('at', ascending=False).drop_duplicates(inplace=True, keep='first')
df.shape

(59163, 21)

In [82]:
count_duplicates = df.duplicated(subset=['content', 'userName']).sum()
count_duplicates

14078

In [83]:
df[df.duplicated(keep=False) == True].sort_values('content')

Unnamed: 0,at,score,isEdited,userName,content,Source,reviewId,thumbsUpCount,replyContent,repliedAt,...,reply,Reviews,Original content,score_cat,at_ymd,at_q,at_ym,at_m,at_wd,at_w
7851,2023-09-27 08:53:58,5,False,w.valera,"!!. самый лучший чат очень помогает, вы самые лучшие и очень круто что чат бесплатный, спасибо вам большое !!!",Apple,,,,NaT,...,0,1,"!!. самый лучший чат очень помогает, вы самые лучшие и очень круто что чат бесплатный, спасибо вам большое !!!",positive,09/27/23,3,2023-09,September,Wednesday,39
13275,2023-09-27 08:53:58,5,False,w.valera,"!!. самый лучший чат очень помогает, вы самые лучшие и очень круто что чат бесплатный, спасибо вам большое !!!",Apple,,,,NaT,...,0,1,"!!. самый лучший чат очень помогает, вы самые лучшие и очень круто что чат бесплатный, спасибо вам большое !!!",positive,09/27/23,3,2023-09,September,Wednesday,39
11748,2023-10-14 07:35:35,1,False,BrunnenKai,"!$&# Update. App used to work wonderfully. No issues at all. Now, after yesterdays update to “fix bugs and crashes”the app no longer works. Great job, guys.",Apple,,,,NaT,...,0,1,"!$&# Update. App used to work wonderfully. No issues at all. Now, after yesterdays update to “fix bugs and crashes”the app no longer works. Great job, guys.",negative,10/14/23,4,2023-10,October,Saturday,41
3115,2023-10-14 07:35:35,1,False,BrunnenKai,"!$&# Update. App used to work wonderfully. No issues at all. Now, after yesterdays update to “fix bugs and crashes”the app no longer works. Great job, guys.",Apple,,,,NaT,...,0,1,"!$&# Update. App used to work wonderfully. No issues at all. Now, after yesterdays update to “fix bugs and crashes”the app no longer works. Great job, guys.",negative,10/14/23,4,2023-10,October,Saturday,41
13525,2023-09-24 22:43:38,5,False,Sumaiia27.,!. Одно из лучших и полезных приложений!,Apple,,,,NaT,...,0,1,!. Одно из лучших и полезных приложений!,positive,09/24/23,3,2023-09,September,Sunday,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,2023-09-11 03:53:12,5,False,bellyr.brazl3421,🫶🏻. O App é incrível é Mt bom,Apple,,,,NaT,...,0,1,🫶🏻. O App é incrível é Mt bom,positive,09/11/23,3,2023-09,September,Monday,37
12518,2023-10-05 17:10:01,5,False,htxcNo no no no no nana nana,🫶🏻. 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻,Apple,,,,NaT,...,0,1,🫶🏻. 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻,positive,10/05/23,4,2023-10,October,Thursday,40
7089,2023-10-05 17:10:01,5,False,htxcNo no no no no nana nana,🫶🏻. 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻,Apple,,,,NaT,...,0,1,🫶🏻. 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻 🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻🫶🏻,positive,10/05/23,4,2023-10,October,Thursday,40
5511,2023-10-24 16:45:00,5,False,charlimidiosagriega,"🫶🏽. me ayuda en TODO , gracias chatGPT 💓🙏🏻",Apple,,,,NaT,...,0,1,"🫶🏽. me ayuda en TODO , gracias chatGPT 💓🙏🏻",positive,10/24/23,4,2023-10,October,Tuesday,43


In [84]:
df.drop_duplicates(subset=['content', 'userName'],inplace=True)
df.shape

(45085, 21)

### Remove NAs

In [85]:
df.isna().sum()

at                      0
score                   0
isEdited            30954
userName                1
content                 0
Source                  0
reviewId            14131
thumbsUpCount       14131
replyContent        44653
repliedAt           44653
appVersion          19045
reply                   0
Reviews                 0
Original content        0
score_cat               0
at_ymd                  0
at_q                    0
at_ym                   0
at_m                    0
at_wd                   0
at_w                    0
dtype: int64

### Remove emojis and symbols, standardize mentions of ChatGPT and OpenAI

In [86]:
def pre_process(text):

    # remove emojis
    emoji_pattern = re.compile("["
         u"\U00002700-\U000027BF"  # Dingbats
         u"\U0001F600-\U0001F64F"  # Emoticons
         u"\U00002600-\U000026FF"  # Miscellaneous Symbols
         u"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols And Pictographs
         u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
         u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
         u"\U0001F680-\U0001F6FF"  # Transport and Map Symbols
                       "]+", re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    dem = demoji.findall(text)
    for item in dem.keys():
        text = text.replace(item, '')
    
    # remove all characters that are not alphanumeric
    #text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # remove symbols
    symbol_pattern = re.compile(r'[@#$%^&*()_+{}\[\]"\<>,/\\|`~]+')
    text = symbol_pattern.sub(r'', text)

    # remove - 
    dash_pattern = re.compile(r'-+')
    text = dash_pattern.sub(r'', text)

    #split the string into separate tokens
    tokens = re.split(r"\s+",text)

    # normalise all words into lowercase
    text = " ".join([t.lower() for t in tokens])

    # standardize
    text = text.replace("chat gpt", "chatgpt")
    text = text.replace("chatgbt", "chatgpt")
    text = text.replace("cgatgpt", "chatgpt")
    text = text.replace("open ai", "openai")
    text = text.replace('1st', 'first')
    text = text.replace(',but', ', but')

    # return final list of tokens
    return text

In [87]:
tqdm.pandas()
df['content'] = df['content'].progress_apply(pre_process)

  0%|          | 0/45085 [00:00<?, ?it/s]

100%|██████████| 45085/45085 [00:20<00:00, 2228.50it/s] 


In [88]:
df['content'].tail(90)

30866         yo tengo
30867    thank chatgpt
30868     first review
30869         just wow
30870     تطبيق احتراف
             ...      
30951             ️️️️
30952             ️️️️
30953             ️️️️
30954                ️
30955                5
Name: content, Length: 90, dtype: object

In [89]:
# test_text = "Amaznig and extremely handy app for many uses.... 🤍 it's like an extension of one's fingers ⭐️⭐️. #ChatGPT Chat GPT OpenAI Open AI HTML Google"

In [90]:
#corrected_text = pre_process(test_text)
#print(test_text)
#print(corrected_text)

In [91]:
# def remove_hashtags(text): 
#     hashtag_pattern = re.compile(r'#\S+')
#     return hashtag_pattern.sub('', text)
# df['content'] = df['content'].apply(remove_hashtags)

### Detect language 

In [92]:
def detect_language(comment):
    try:
        return detect(comment)
    except:
        return 'unknown' 

In [93]:
tqdm.pandas()
df['detected_language'] = df['content'].progress_apply(detect_language)

100%|██████████| 45085/45085 [02:33<00:00, 293.57it/s]


In [94]:
df['detected_language'].unique()

array(['en', 'fr', 'pt', 'es', 'so', 'unknown', 'cy', 'sw', 'it', 'cs',
       'tr', 'ru', 'sq', 'sv', 'nl', 'tl', 'fi', 'zh-cn', 'af', 'de',
       'no', 'id', 'et', 'sl', 'ca', 'da', 'ro', 'lt', 'sk', 'lv', 'hu',
       'bg', 'vi', 'ar', 'hr', 'pl', 'th', 'ur', 'uk', 'fa', 'el', 'he',
       'ja', 'ko', 'mr', 'zh-tw', 'bn', 'ml', 'ta', 'hi', 'ne', 'te',
       'gu', 'kn', 'mk'], dtype=object)

In [95]:
lang = df[df['detected_language'] == "en"]
lang.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30313 entries, 0 to 30923
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   at                 30313 non-null  datetime64[ns]
 1   score              30313 non-null  int64         
 2   isEdited           8943 non-null   object        
 3   userName           30312 non-null  object        
 4   content            30313 non-null  object        
 5   Source             30313 non-null  object        
 6   reviewId           21370 non-null  object        
 7   thumbsUpCount      21370 non-null  float64       
 8   replyContent       389 non-null    object        
 9   repliedAt          389 non-null    datetime64[ns]
 10  appVersion         18085 non-null  object        
 11  reply              30313 non-null  int64         
 12  Reviews            30313 non-null  int64         
 13  Original content   30313 non-null  object        
 14  score_cat  

In [96]:
#df.to_csv("../data/chatgpt_short_clean_all_languages.csv")

### Split into short (review-wise) and long format (sentence-wise)

In [97]:
df_en = df[df['detected_language'] == "en"]

In [98]:
df_en_long = df_en.copy()

# Function to apply nltk.sent_tokenize() to a text column and create a new row for each sentence
def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

# Apply the function to the 'text_column' and explode the list of rows into individual rows
tqdm.pandas()
df_en_long['sentence'] = df_en_long.content.progress_apply(tokenize_sentences)
df_en_long = df_en_long.explode('sentence')

  0%|          | 0/30313 [00:00<?, ?it/s]

100%|██████████| 30313/30313 [00:01<00:00, 22324.87it/s]


In [99]:
df_en_long.sort_values('Source').sort_index(ascending=False).head(100)

Unnamed: 0,at,score,isEdited,userName,content,Source,reviewId,thumbsUpCount,replyContent,repliedAt,...,Original content,score_cat,at_ymd,at_q,at_ym,at_m,at_wd,at_w,detected_language,sentence
30923,2023-07-25 17:09:58,5,,Rohan Dafare,hum first,Google,beb0b506-732d-4d42-888b-1485a3958ab2,0.0,,NaT,...,Hum First,positive,07/25/23,3,2023-07,July,Tuesday,30,en,hum first
30921,2023-07-25 21:05:55,5,,Carter Gledhill,first comment,Google,4775c835-38dd-48b8-8bf0-c3f38fe8794d,0.0,,NaT,...,First comment,positive,07/25/23,3,2023-07,July,Tuesday,30,en,first comment
30918,2023-07-25 17:15:19,5,,mostafijur rahman,usually app,Google,0530373c-1bfc-45d2-9dec-9fc0bb9cff4d,0.0,,NaT,...,Usually app,positive,07/25/23,3,2023-07,July,Tuesday,30,en,usually app
30916,2023-07-25 17:08:33,5,,Sarvesh Soni,first downloader,Google,2ecc7803-920c-4f95-8b15-db9c7b1caa8a,0.0,,NaT,...,First downloader,positive,07/25/23,3,2023-07,July,Tuesday,30,en,first downloader
30914,2023-07-25 17:08:53,5,,Fayeem UL HQ,first review,Google,22ec6550-9fb4-4803-a49b-b6e2f7671db9,0.0,,NaT,...,1st review,positive,07/25/23,3,2023-07,July,Tuesday,30,en,first review
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30558,2023-07-25 17:09:57,5,,Md Musa Pk,first install,Google,aa0913e2-702d-479b-bceb-42b8580599a5,0.0,,NaT,...,1st install🇧🇩🇧🇩🇧🇩💕🥰🥰,positive,07/25/23,3,2023-07,July,Tuesday,30,en,first install
30557,2023-07-25 21:07:50,5,,Sourav Singh Gurkha,owsome interface,Google,5f3bb37a-c055-4016-a74f-a6a7c914ed3b,0.0,,NaT,...,Owsome interface 😍,positive,07/25/23,3,2023-07,July,Tuesday,30,en,owsome interface
30554,2023-07-25 17:08:38,5,,Ishtiyaq Abbasi,finally downloded,Google,d9caeba5-718b-40aa-a276-82be82c072df,0.0,,NaT,...,Finally downloded 😅,positive,07/25/23,3,2023-07,July,Tuesday,30,en,finally downloded
30553,2023-07-31 19:45:11,5,,Nika Khurtsidze,not first?,Google,2cf1a995-5c91-415d-9ef9-10561e4de82e,0.0,,NaT,...,not first?,positive,07/31/23,3,2023-07,July,Monday,31,en,not first?


In [100]:
df_en_long.shape

(64992, 23)

### Spell checking

In [101]:
spell = SpellChecker()
exceptions = ["chatgpt", "chatgbt", "openai", "gpt", "html", "css", "javascript", "microsoft", "elon"]

In [102]:
def correct_spelling(text): 
    words = text.split()
    corrected_text = []
    for word in words:
        if word in exceptions:
            corrected_text.append(word)
        else: 
            corrected_word = spell.correction(word)
            if corrected_word is not None: 
                corrected_text.append(corrected_word)
            else:
                corrected_text.append(word)
    corrected_text = " ".join(corrected_text)
    return corrected_text

In [103]:
# corrected_text2 = correct_spelling(corrected_text)
# print(test_text)
# print(corrected_text)
# print(corrected_text2)

In [112]:
tqdm.pandas()
#df_en['content'] = df_en['content'].progress_apply(correct_spelling)
tqdm.pandas()
df_en_long['sentence'] = df_en_long['sentence'].progress_apply(correct_spelling)

100%|██████████| 64992/64992 [55:36<00:00, 19.48it/s]   


### Save dataframes

In [113]:
df_en.to_csv("../data/chatgpt_short_clean_combined_en.csv", index=False)
df_en_long.to_csv("../data/long/chatgpt_clean_combined_en.csv", index=False)

In [110]:
pd.options.display.max_colwidth = 500
df_en[['content','Original content']][df_en['content'].str.contains('catgut') == True]

Unnamed: 0,content,Original content
211,absolutely fantastic apply have been using the chatgpt android app for a while now and i am thoroughly impressed the apps interface is userfriendly and intuitive making it incredibly easy to have natural conversations with the air the responses are remarkably accurate and relevant showcasing the impressive language capabilities of the model whether i'm seeking information engaging in creative writing or simply having a friendly catgut one thing missed main language pakistani uru .,"Absolutely Fantastic App!I have been using the ChatGPT Android app for a while now, and I am thoroughly impressed. The app's interface is user-friendly and intuitive, making it incredibly easy to have natural conversations with the AI. The responses are remarkably accurate and relevant, showcasing the impressive language capabilities of the model. Whether I'm seeking information, engaging in creative writing, or simply having a friendly chat,but one thing missed main language pakistani urdu ."
833,well it's useful but there is a problem i asked it a question related to schrödinger's catgut he missed the most important thing that if we observe the cate would require light.even a single photon will excite the electron of the radioactive atom and ultimately it's decay rate increases,"Well, it's useful but there is a problem. I asked it a question related to Schrödinger's cat,but he missed the most important thing that if we observe the cat,we would require light.Even a single photon will excite the electron of the radioactive atom and ultimately, it's decay rate increases."
