## Data preprocessing and Feature Engineering

In [116]:
# Importing the libraries that will be used for this project

import pandas as pd
import numpy as np



In [2]:
# Importing the data for the supervell's apps
data = pd.read_csv('df_all_2020_2021.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Making a copy of the data to retain the original
df = data.copy()

## Feature Engineering

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1298511 entries, 0 to 1298510
Data columns (total 12 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   at                    1298511 non-null  object
 1   _id                   1298511 non-null  object
 2   reviewId              1298511 non-null  object
 3   userName              1298511 non-null  object
 4   content               1298485 non-null  object
 5   score                 1298511 non-null  int64 
 6   thumbsUpCount         1298511 non-null  int64 
 7   reviewCreatedVersion  942686 non-null   object
 8   appName               1298511 non-null  object
 9   appId                 1298511 non-null  object
 10  count                 1298511 non-null  int64 
 11  Review type           1298511 non-null  object
dtypes: int64(3), object(9)
memory usage: 118.9+ MB


In [5]:
# Checking missing values
df.isnull().sum()

at                           0
_id                          0
reviewId                     0
userName                     0
content                     26
score                        0
thumbsUpCount                0
reviewCreatedVersion    355825
appName                      0
appId                        0
count                        0
Review type                  0
dtype: int64

#### Given that we only have 26 missing values on the content I think its safe to drop these values because our data size is over one million rows so 26 will not make that much change in the accuracy of the model

In [6]:
# Here is the code that will let us drop the 26 rows that we will not need because its missing the content
df = df[df['content'].notna()]



In [7]:
# Checking missing values after removing the null values
df.isnull().sum()

at                           0
_id                          0
reviewId                     0
userName                     0
content                      0
score                        0
thumbsUpCount                0
reviewCreatedVersion    355821
appName                      0
appId                        0
count                        0
Review type                  0
dtype: int64

## Feature Engineering

#### Perfect now we can create a new feature of the word length to see if that will have any impact to the model

In [8]:
# Function that returns number of words in a string
def count_words(string):
    # Split the string into words
    words = string.split()
    
    # Return the number of words
    return len(words)
    

In [9]:
# Create a new feature word_count
df['word_count'] = df['content'].apply(count_words)

In [10]:
# Checking the new feature
df.head(3)

Unnamed: 0,at,_id,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,appName,appId,count,Review type,word_count
0,2021-05-01 20:20:03,60a23abbdb692423c850ebfe,gp:AOqpTOGO6dnp27Rv8vCY2ppHTTw27o2rCkYt1FoqVOd...,Apple Sauce,I used to think of this as a 5star game (and I...,2,0,43.87,Boom Beach,Boom Beach,1,Negative,25
1,2021-05-01 14:03:39,60a23abbdb692423c850ec05,gp:AOqpTOFNqMfAngyP8SdCxwZosjvNxM7DdErLS4pywK5...,shajeedullah kaisar,I like Boom Beach,5,0,43.87,Boom Beach,Boom Beach,1,Positive,4
2,2021-05-01 21:02:00,60a23abbdb692423c850ebfc,gp:AOqpTOFP7HzBTwV7MD_yn1vsFkonLge2NCxaKbrOLW1...,ragin wi'll gamin,A verry good game fun and entertaining,5,0,43.87,Boom Beach,Boom Beach,1,Positive,7


Note: you may need to restart the kernel to use updated packages.




## Here I will create a new dataframe that include the sentiment of each content

In [65]:
# Importing and downloading nltk and vader_lexicon, this is a library used for text data
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\k-alw\AppData\Roaming\nltk_data...


True

In [101]:
# Importing the library that will be used to get the content sentiments SentimentIntensityAnalyzer

from nltk.sentiment import SentimentIntensityAnalyzer

# Initiating the model
sia = SentimentIntensityAnalyzer()

# Looping throguh each content to extract the information using SentimentIntensityAnalyzer

# Creating a list of the columns name
columns = list(df['userName'])
scores = []
for i in df['content']:
    scores.append(sia.polarity_scores(i))


[{'neg': 0.0, 'neu': 0.924, 'pos': 0.076, 'compound': 0.2023},
 {'neg': 0.0, 'neu': 0.444, 'pos': 0.556, 'compound': 0.3612},
 {'neg': 0.0, 'neu': 0.248, 'pos': 0.752, 'compound': 0.8442},
 {'neg': 0.0, 'neu': 0.734, 'pos': 0.266, 'compound': 0.4404},
 {'neg': 0.0, 'neu': 0.695, 'pos': 0.305, 'compound': 0.9806},
 {'neg': 0.127, 'neu': 0.766, 'pos': 0.107, 'compound': -0.2263},
 {'neg': 0.381, 'neu': 0.159, 'pos': 0.46, 'compound': 0.128},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.0, 'neu': 0.263, 'pos': 0.737, 'compound': 0.4215},
 {'neg': 0.0, 'neu': 0.213, 'pos': 0.787, 'compound': 0.5719},
 {'neg': 0.247, 'neu': 0.243, 'pos': 0.51, 'compound': 0.4878},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.872},
 {'neg': 0.0, 'neu': 0.601, 'pos': 0.399, 'compound': 0.9725},
 {'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404},
 {'neg': 0.0, 

In [73]:
# Creating a dataframe with the new features for each content

word_sentiment = pd.DataFrame(scores)
word_sentiment

Unnamed: 0,neg,neu,pos,compound
0,0.000,0.924,0.076,0.2023
1,0.000,0.444,0.556,0.3612
2,0.000,0.248,0.752,0.8442
3,0.000,0.734,0.266,0.4404
4,0.000,0.695,0.305,0.9806
...,...,...,...,...
1298480,0.113,0.887,0.000,-0.4215
1298481,0.000,0.556,0.444,0.4939
1298482,0.059,0.941,0.000,-0.5499
1298483,0.108,0.743,0.149,0.2958


In [1]:
# Adding the new features to the dataframe

df['positive_score'] = word_sentiment['pos']
df['neutral_score'] = word_sentiment['neu']
df['negative_score'] = word_sentiment['neg']

'''The compound score is the sum of positive, negative & neutral scores which is then normalized between 
   -1(most extreme negative) and +1 (most extreme positive).'''

df['compound_score'] = word_sentiment['compound']

NameError: name 'word_sentiment' is not defined

In [106]:
# Lets check the new dataframe and store the new dataframe into a csv_file
df.isnull().sum()

# Some of the content did not produce any valuable information which means that the content might have been non English
# We can drop the 26 missing values because we will not need them for our model
# We can pick any of these columns and see if they all have the same results


df = df[df['positive_score'].notna()]


In [115]:
# Lets check again
df.isnull().sum()



at                           0
_id                          0
reviewId                     0
userName                     0
content                      0
score                        0
thumbsUpCount                0
reviewCreatedVersion    355817
appName                      0
appId                        0
count                        0
Review type                  0
word_count                   0
positive_score               0
neutral_score                0
negative_score               0
compound_score               0
dtype: int64

#### Perfect they seem to all have had the same exact issue so dropping the null value from any of the 4 new columns will solve the problem

#### Now lets drop the reviewCreatedVersion as we will not use it for my clustring model and it also have a lot of missing values. Though there is a solution by adding the version of the previus column but in this project this is not needed

In [117]:
# Here we drop the column
df.drop(['reviewCreatedVersion'], axis =1, inplace = True)

# Lets check our dataframe again
df.isnull().sum()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


at                0
_id               0
reviewId          0
userName          0
content           0
score             0
thumbsUpCount     0
appName           0
appId             0
count             0
Review type       0
word_count        0
positive_score    0
neutral_score     0
negative_score    0
compound_score    0
dtype: int64

In [118]:
# Lets store the data and start using it for Machine learning 
df.to_csv('data_with_pos_neg_neu_count_features.csv', index = False)

## We start by using KMeans clustering model, we identify 3 cluster first

In [None]:
kmeans_cluster = KMeans(n_clusters = 3)

kmeans_cluster.fit()

In [65]:
df

Unnamed: 0,at,_id,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,appName,appId,count,Review type,word_count,readability_scores
0,2021-05-01 20:20:03,60a23abbdb692423c850ebfe,gp:AOqpTOGO6dnp27Rv8vCY2ppHTTw27o2rCkYt1FoqVOd...,Apple Sauce,0 I used to think of this as a 5star ...,2,0,43.87,Boom Beach,Boom Beach,1,Negative,25,<class 'float'>
1,2021-05-01 14:03:39,60a23abbdb692423c850ec05,gp:AOqpTOFNqMfAngyP8SdCxwZosjvNxM7DdErLS4pywK5...,shajeedullah kaisar,0 I used to think of this as a 5star ...,5,0,43.87,Boom Beach,Boom Beach,1,Positive,4,<class 'float'>
2,2021-05-01 21:02:00,60a23abbdb692423c850ebfc,gp:AOqpTOFP7HzBTwV7MD_yn1vsFkonLge2NCxaKbrOLW1...,ragin wi'll gamin,0 I used to think of this as a 5star ...,5,0,43.87,Boom Beach,Boom Beach,1,Positive,7,<class 'float'>
3,2021-05-01 16:30:45,60a23abbdb692423c850ec03,gp:AOqpTOFS3ogfbxDej800d0Rm6Ec5X3O2fGtKPZnqwgB...,good boyy,0 I used to think of this as a 5star ...,5,0,43.87,Boom Beach,Boom Beach,1,Positive,11,<class 'float'>
4,2021-05-01 12:08:12,60a23abbdb692423c850ec07,gp:AOqpTOGJBFZE3-JAdYdJYImxlnsb6617LY9jztyMDNg...,gaming and nature cod pro,0 I used to think of this as a 5star ...,5,0,43.87,Boom Beach,Boom Beach,1,Positive,73,<class 'float'>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298506,2020-04-01 03:59:56,60a247d19c7cd36fac83aa1c,gp:AOqpTOFtdxXYLWprsqCgzxbXL6u6Ae4_xfL4Yz2D_Mj...,Yaneth Melendez,0 I used to think of this as a 5star ...,1,0,1_46_149,Hay Day,Hay Day,1,Negative,24,<class 'float'>
1298507,2020-04-01 03:49:39,60a247d19c7cd36fac83aa21,gp:AOqpTOHZPdhtMz2QCzaCoWQ6mhvYNjTqiBvixVRtq1_...,Manas Vyas,0 I used to think of this as a 5star ...,3,1,1_46_149,Hay Day,Hay Day,1,Neutral,5,<class 'float'>
1298508,2020-04-01 03:08:15,60a247d19c7cd36fac83aa2e,gp:AOqpTOF9wnH6saeTwKI2QoLLDCSoDZc6qqQXt75tNrv...,Lee Choon Kang,0 I used to think of this as a 5star ...,1,0,,Hay Day,Hay Day,1,Negative,65,<class 'float'>
1298509,2020-04-01 02:30:16,60a247d19c7cd36fac83aa37,gp:AOqpTOHEiITh34Bt-iPWsNSZmnBmsNQpZW0XIsIZfE7...,Rajesh pradhan,0 I used to think of this as a 5star ...,2,0,,Hay Day,Hay Day,1,Negative,73,<class 'float'>
