In [42]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


from tqdm import tqdm

import nltk
import re
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/martjebuss/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/martjebuss/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [43]:
# Load df
df = pd.read_csv(
    "../data/ChatGPT-play-reviews.csv",
    encoding="utf-8", parse_dates=["at", "repliedAt"]
)

In [44]:
df = df.drop('reviewCreatedVersion', axis=1)

In [45]:
# generate new date features from at
df['at_ymd'] = df['at'].dt.strftime('%D')
# Create new column for year-quarter
df['at_q'] = df['at'].dt.quarter
# Create new column for year-month
df['at_ym'] = df['at'].dt.strftime('%Y-%m')
# Create new column for month
df['at_m'] = df['at'].dt.strftime('%B')
# Create new column for year-month
df['at_wd'] = df['at'].dt.strftime('%A')

In [33]:
# display number of missing values per column
df.isna().sum()

reviewId             0
userName             1
content              0
score                0
thumbsUpCount        0
at                   0
replyContent     30524
repliedAt        30524
appVersion        4914
at_ymd               0
at_q                 0
at_ym                0
at_m                 0
at_wd                0
dtype: int64

Cleaning Costumer Reviews: 
Remove URLs, emails, phone numbers & punctuations.
Remove tags, emojis, symbols & pictographs.
Remove stop words.
Convert to lowercase and lemmatization.
Duplicates removal.
Spell checking.
Non-English reviews removal.
Remove stop words. 

### Remove URLs, emails, phone numbers, tags

In [23]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

In [34]:
def remove_hashtags(text): 
    hashtag_pattern = re.compile(r'#\S+')
    return hashtag_pattern.sub('', text)

In [37]:
def remove_mentions(text): 
    mentions_pattern = re.compile(r'@\S+')
    return mentions_pattern.sub('', text)

In [51]:
def remove_phonenumber(text):
    phone_pattern = re.compile(r'^[+]*[(]{0,1}[0-9]{1,4}[)]{0,1}[-\s\./0-9]*$')
    return phone_pattern.sub('', text)

In [55]:
def remove_email(text): 
    email_pattern = re.compile(r'^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$')
    return email_pattern.sub('', text)

In [47]:
df_copy = df.copy()

In [57]:
# Apply the remove_urls function to the 'text' column
df['content'] = df['content'].apply(remove_urls)
df['content'] = df['content'].apply(remove_hashtags)
df['content'] = df['content'].apply(remove_mentions)
df['content'] = df['content'].apply(remove_phonenumber)
#df['content'] = df['content'].apply(remove_email) not working

### Remove duplicates

In [135]:
count_duplicates = df.duplicated().sum()
count_duplicates

0

### Spell checking

In [136]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.7.2-py3-none-any.whl (3.4 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m31m17.9 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()

In [147]:
def correct_spelling(text): 
    words = text.split()
    corrected_text = []
    for word in words:
        corrected_word = spell.correction(word)
        if corrected_word is not None:
            corrected_text.append(corrected_word)
        else:
            corrected_text.append(word)
    corrected_text = " ".join(corrected_text)
    return corrected_text

In [148]:
df['content'] = df['content'].apply(correct_spelling)

In [151]:
df.to_csv("../data/chatgpt_after_datacleaning.csv")

In [None]:
# text = "Thes ar som misspleld wordds."

# words = text.split()

# corrected_text = []
# for word in words:
#     # Get the most likely correct spelling
#     corrected_word = spell.correction(word)
#     corrected_text.append(corrected_word)

# # Reconstruct the corrected text
# corrected_text = " ".join(corrected_text)

In [None]:
# corrected_text

'Thes ar som misspelled words'

### Non-English reviews removal

In [69]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m[31m9.7 MB/s[0m eta [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993224 sha256=4a15776af3a80ef8981bfa297a04c56032d9856cb458488c546e23be69e9a545
  Stored in directory: /Users/martjebuss/Library/Caches/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [70]:
from langdetect import detect

In [100]:
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

In [103]:
english_reviews = df[df['content'].apply(is_english)]

In [97]:
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')
pd.reset_option('display.max_colwidth')

In [107]:
english_reviews["content"].tail()

30923           Hum First
30926           Wooow....
30927            عالی🔥🔥🔥🔥
30946            Wow.....
30947    Noice 🤓🤓🤓🤓🤓🤓🤓👌🤓👌
Name: content, dtype: object

In [108]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m636.8/636.8 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m31m12.5 MB/s[0m eta [36m0:00:01[0m
Installing collected packages: textblob
Successfully installed textblob-0.17.1


In [132]:
from textblob import TextBlob

In [133]:
b = TextBlob("bonjour")
b.detect_language()

HTTPError: HTTP Error 400: Bad Request

In [122]:
!pip install spacy
!pip install spacy_langdetect

Collecting spacy_langdetect
  Downloading spacy_langdetect-0.1.2-py3-none-any.whl (5.0 kB)
Collecting pytest (from spacy_langdetect)
  Downloading pytest-7.4.3-py3-none-any.whl.metadata (7.9 kB)
Collecting langdetect==1.0.7 (from spacy_langdetect)
  Downloading langdetect-1.0.7.zip (998 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m998.1/998.1 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting iniconfig (from pytest->spacy_langdetect)
  Downloading iniconfig-2.0.0-py3-none-any.whl (5.9 kB)
Collecting pluggy<2.0,>=0.12 (from pytest->spacy_langdetect)
  Downloading pluggy-1.3.0-py3-none-any.whl.metadata (4.3 kB)
Downloading pytest-7.4.3-py3-none-any.whl (325 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.1/325.1 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pluggy-1.3.0-py3-none-any.whl (18 kB)
Building wheels fo

In [129]:
import spacy
from spacy_langdetect import LanguageDetector

In [127]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [130]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

ValueError: [E966] `nlp.add_pipe` now takes the string name of the registered component factory, not a callable component. Expected string, but got <spacy_langdetect.spacy_langdetect.LanguageDetector object at 0x17d00a490> (name: 'language_detector').

- If you created your component with `nlp.create_pipe('name')`: remove nlp.create_pipe and call `nlp.add_pipe('name')` instead.

- If you passed in a component like `TextCategorizer()`: call `nlp.add_pipe` with the string name instead, e.g. `nlp.add_pipe('textcat')`.

- If you're using a custom component: Add the decorator `@Language.component` (for function components) or `@Language.factory` (for class components / factories) to your custom component and assign it a name, e.g. `@Language.component('your_name')`. You can then run `nlp.add_pipe('your_name')` to add it to the pipeline.

In [115]:
text = "It was a beautiful day ."
lang = TextBlob(text)
lang.detect_language()

HTTPError: HTTP Error 400: Bad Request

In [None]:
for row in df: 
    

In [None]:
# EDA Idea: visualize content of comments with many thumbs up 

In [18]:
df.sort_values("thumbsUpCount", ascending=False).head(20)

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,at,replyContent,repliedAt,appVersion,at_ymd,at_q,at_ym,at_m,at_wd
28,43e85401-d349-4703-bebb-54b1715ef5ed,sanak chandra,I'm very impressed with my experience. Especia...,5,1067,2023-08-02 03:54:56,,NaT,1.0.0023,08/02/23,3,2023-08,August,Wednesday
138,04563d75-3fb9-4ff4-8a41-8bdc1247c78f,Isaiah Rodriguez,NO ADS SO FAR. YES!!! Very good voice recognit...,5,812,2023-07-25 17:58:02,,NaT,1.0.0016,07/25/23,3,2023-07,July,Tuesday
18,82bedff5-5220-45e7-b89a-65054268c85c,Alex Ledford,Easily the best app I've ever downloaded. High...,5,716,2023-09-21 09:33:48,,NaT,1.2023.256,09/21/23,3,2023-09,September,Thursday
66,a4b2d154-0b5c-424d-95cb-3406ee3d8540,Gautam Singh,ChatGPT has been a fantastic addition to my An...,5,700,2023-08-21 08:38:12,,NaT,1.0.0032,08/21/23,3,2023-08,August,Monday
22,fa358c69-36a4-418b-9320-e653774c24f5,Angel Khatiwada,"I've been using the ChatGPT for a while now, a...",5,631,2023-08-26 10:17:02,,NaT,1.0.0035,08/26/23,3,2023-08,August,Saturday
56,e9ff6f49-35fc-419c-82df-a7c2fda044ff,Touheed Shah,Finally it's on play store!!!! Just tried it o...,5,522,2023-07-31 19:53:34,,NaT,1.0.0023,07/31/23,3,2023-07,July,Monday
47,1bc55a8b-d922-4668-bce8-693299a41bf6,Anurag Bagh (Anush),ChatGPT is an incredibly powerful AI language ...,5,514,2023-09-12 20:04:15,,NaT,1.2023.243,09/12/23,3,2023-09,September,Tuesday
39,2cfc3e67-296c-47f5-95c4-e975858360a7,Jakeb Ricks,The ChatGPT Android app is an absolute delight...,5,473,2023-07-26 13:21:40,,NaT,1.0.0022,07/26/23,3,2023-07,July,Wednesday
33,e73bf653-5524-4fd3-8bd1-d309d2147136,Helen Hachtel,ChatGPT App is an absolute game-changer in tex...,5,462,2023-08-19 06:11:28,,NaT,1.0.0032,08/19/23,3,2023-08,August,Saturday
84,1587e389-5eab-40d2-9083-83c82d6ebad0,Muhammad Mudassir Khan,I am thoroughly impressed with ChatGPT! This a...,5,413,2023-08-09 04:45:59,,NaT,1.0.0030,08/09/23,3,2023-08,August,Wednesday
