In [None]:
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Loading dataset
# NOTE(review): this is an absolute path on a B: drive, so the cell only runs on a
# machine with that exact layout. Presumably the CSV also sits under ./dataset/ next
# to this notebook (the mask image is loaded from ./assets/) — confirm and switch to
# a relative path.
data_path = 'B:/_GITHUB/Data-Science-Projects/Tiktok_Review_analysis/dataset/tiktok_google_play_reviews.csv'
# Read the reviews CSV into a DataFrame.
data = pd.read_csv(data_path)

In [None]:
# Preview the first rows to see which columns the CSV provides.
data.head()
# Disabled alternative: the same preview rendered as a markdown table.
# print(data.head().to_markdown(index = False))

In [None]:
# Report the dataset dimensions (rows x columns) and list the column names.
print(f"The dataset contains {data.shape[0]} observations and {data.shape[1]} features")
print(f"The features are :  {', '.join(data.columns)} ")

The dataset contains 10 columns, most of which we will not use. Since we are analyzing
the reviews of TikTok, the two crucial columns here are _"score"_ and
_"content"_, so we can build a new dataset with just those columns.

In [None]:
# Keep only the review text and its rating; the remaining columns are not analysed.
data = data.loc[:, ['content', 'score']]
print(data.head(n=5))

In [None]:
# Group the reviews by rating so each score value can be summarised separately.
df_groupby_score = data.groupby(['score'])
# Other views of the grouping that were tried while exploring (left disabled):
# df_groupby_score.ngroups
# df_groupby_score.groups
# df_groupby_score.size()
# Per-score summary statistics of the remaining column(s).
df_groupby_score.describe().head()

This gives us insight into the distribution of the __score__ values: we can observe the __count, unique and frequency__ associated with each value.

In [None]:
# Count missing values per column, drop every row that has any, then count again
# to confirm nothing is left.
print('Before droping null values: \n', data.isna().sum())
data = data.dropna(axis=0, how='any')
print('After droping null values: \n', data.isna().sum())

In [None]:
# Data cleaning
# Resolve the corpus through the `nltk` package rather than the bare `stopwords`
# name: later cells rebind `stopwords` to the word-cloud stop-word set, which makes
# `stopwords.words(...)` raise AttributeError if this cell is re-run afterwards.
# Requires the NLTK "stopwords" corpus (nltk.download("stopwords")).
stopword = set(nltk.corpus.stopwords.words("english"))
# Snowball stemmer that preprocess() uses to reduce each word to its stem.
stemmer = nltk.SnowballStemmer("english")
# print(stopword)
def preprocess(text, stop_words=None, stem_fn=None):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; drop ``[...]`` spans, URLs, and ``<...>`` tags;
    strip ASCII punctuation; drop any token containing a digit; split on
    whitespace; remove stop words; stem what is left.

    Args:
        text: The raw review. Non-string values are converted with ``str``.
        stop_words: Collection of words to discard. Defaults to the notebook's
            module-level ``stopword`` set.
        stem_fn: Callable mapping a word to its stem. Defaults to
            ``stemmer.stem`` from the notebook's module-level stemmer.

    Returns:
        The cleaned text, with tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopword
    if stem_fn is None:
        stem_fn = stemmer.stem

    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() with no argument treats newlines as separators (so words on
    # adjacent lines are not glued together) and never yields empty tokens,
    # which keeps the result free of repeated spaces.
    tokens = [stem_fn(word) for word in text.split() if word not in stop_words]
    return " ".join(tokens)
# Replace every review with its preprocessed form. The raw text is overwritten, so
# re-running this cell preprocesses the already-cleaned text a second time.
data['content'] = data['content'].apply(preprocess)
# print(data['content'])

In [None]:
# Donut chart of how many reviews gave each rating.
ratings = data['score'].value_counts()
index = ratings.index
fig = px.pie(data, values=ratings, names=index, hole=0.5)
fig.update_layout(
    title='Ratings distribution',
    title_x=0.5,
    height=500,
    width=800,
)
fig.show()

In [None]:
# checking most frequently used words
# Concatenate every cleaned review into one string for the word cloud.
text = ' '.join(data['content'])
# print(text)
# NOTE: this rebinds `stopwords`, hiding the nltk corpus imported under that name.
stopwords = set(STOPWORDS)
wordcloud = WordCloud(background_color='black', stopwords=stopwords).generate(text)
plt.figure(figsize=(10, 15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

The picture above shows the most frequently used words; we can see that words such as *"good"* and *"best"* have been used to describe the app.

### **Sentiment analysis**

In the following code we perform sentiment analysis in order to sort the review comments into categories. To achieve this, we use the *__Valence Aware Dictionary for sEntiment Reasoning, or Vader__*, an NLP algorithm that blends a sentiment-lexicon approach with grammatical rules and syntactical conventions for expressing sentiment polarity and intensity. Vader is an open-source package within the Natural Language Toolkit (NLTK).
*__An example of Vader is shown in the picture below:__*
![](assets/vader.png)

In [None]:
# Trying to get the good and bad comments sorted
# Requires the VADER lexicon; uncomment on first run:
# nltk.download("vader_lexicon")
sentiments = SentimentIntensityAnalyzer()
# Score each review once: polarity_scores returns the "pos", "neg" and "neu"
# values together, so one pass feeds all three columns.
# NOTE(review): `content` has already been lowercased, stripped of punctuation and
# stemmed at this point; VADER may score raw text more accurately — confirm.
polarity = [sentiments.polarity_scores(review) for review in data["content"]]
data["Positive"] = [scores["pos"] for scores in polarity]
data["Negative"] = [scores["neg"] for scores in polarity]
data["Neutral"] = [scores["neu"] for scores in polarity]
# Keep only the text and its sentiment scores (this drops the `score` column).
data = data[['content', 'Positive', 'Negative', 'Neutral']]
data.head()

In [None]:
# Word cloud built from the reviews whose positive score exceeds their negative score.
positive_comments = ' '.join(data.loc[data['Positive'] > data['Negative'], 'content'])
# print(positive_comments)
stopwords = set(STOPWORDS)
wordcloud = WordCloud(background_color='white', stopwords=stopwords).generate(positive_comments)
plt.figure(figsize=(10, 15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from the reviews whose negative score exceeds their positive
# score, limited to the 100 most frequent words.
negative_comments = ' '.join([i for i in data['content'][data['Positive'] < data['Negative']]])
stopwords = set(STOPWORDS)
wordcloud = WordCloud(stopwords = stopwords, background_color = 'white', max_words = 100).generate(negative_comments)
plt.figure(figsize = (10,15))
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
# Render explicitly, like the other word-cloud cells; otherwise the cell's output
# is the return value of plt.axis() echoed above the figure.
plt.show()

In [None]:
# Load ./assets/tiktok.jpg as a pixel array; it is passed to WordCloud as the mask.
# (`Image` and `np` come from the import cell at the top of the notebook.)
# The context manager closes the image file as soon as the pixels have been copied.
with Image.open("./assets/tiktok.jpg") as mask_image:
    tiktok_mask = np.array(mask_image)
# tiktok_mask

In [None]:
def transform_pixel_value(pixel):
    """Return 255 when ``pixel`` equals 0; otherwise return ``pixel`` unchanged."""
    return 255 if pixel == 0 else pixel

# Copy of the mask with every 0-valued entry replaced by 255, stored as int32.
# np.where applies the same rule as transform_pixel_value to the whole array at
# once, so the result is fully initialised (np.ndarray(shape, dtype) only allocates
# memory and leaves its contents undefined). The result has the same shape as
# tiktok_mask.
# NOTE(review): the masked word cloud below is built from `tiktok_mask`, not from
# this array — confirm which one is intended.
new_tiktok_mask = np.where(tiktok_mask == 0, 255, tiktok_mask).astype(np.int32)

# new_tiktok_mask

In [None]:
# Word cloud of all reviews drawn inside the mask, with a black contour around it.
wc = WordCloud(
    stopwords=stopwords,
    max_words=100,
    mask=tiktok_mask,
    contour_width=3,
    contour_color="black",
).generate(text)
plt.figure(figsize=(10, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

### **Conclusion**
In this notebook, we analyzed what __TikTok__ users think of the app. We found good and bad comments and built a __wordcloud__
of comments.  