In [1]:
# Importing pandas and numpy libraries to work with dataframes
import pandas as pd
import numpy as np
# Importing KMeans cluster model 
# Importing matplotlib for plotting the data
import matplotlib.pyplot as plt

# Importing seaborn for a prittier visulisation
import seaborn as sns

# Setting the seoborn
sns.set()


In [2]:
# Loading the data
data = pd.read_csv('data_with_pos_neg_neu_count_features.csv')

## Feature selection

In [3]:
# Selecting the columns that we will use to run the clustering model

# First data will be used to find clusters based on the review and scores
df_review_type = data[['score', 'thumbsUpCount', 'appName', 'Review type',
          'word_count', 'positive_score', 'neutral_score', 'negative_score',
          'compound_score']]

# The second data will be used to find the fans of super cell apps and non-fans
df_fans = data.copy()


In [4]:
# Now we drop the appId column because we have the appName and that is the same information
# We also drop the count column because we used earlier when grouping
df_fans.drop(['appId', 'count'], axis = 1, inplace = True)
df_fans.head(3)

Unnamed: 0,at,_id,reviewId,userName,content,score,thumbsUpCount,appName,Review type,word_count,positive_score,neutral_score,negative_score,compound_score
0,2021-05-01 20:20:03,60a23abbdb692423c850ebfe,gp:AOqpTOGO6dnp27Rv8vCY2ppHTTw27o2rCkYt1FoqVOd...,Apple Sauce,I used to think of this as a 5star game (and I...,2,0,Boom Beach,Negative,25,0.076,0.924,0.0,0.2023
1,2021-05-01 14:03:39,60a23abbdb692423c850ec05,gp:AOqpTOFNqMfAngyP8SdCxwZosjvNxM7DdErLS4pywK5...,shajeedullah kaisar,I like Boom Beach,5,0,Boom Beach,Positive,4,0.556,0.444,0.0,0.3612
2,2021-05-01 21:02:00,60a23abbdb692423c850ebfc,gp:AOqpTOFP7HzBTwV7MD_yn1vsFkonLge2NCxaKbrOLW1...,ragin wi'll gamin,A verry good game fun and entertaining,5,0,Boom Beach,Positive,7,0.752,0.248,0.0,0.8442


### For the first dataset df review type:

#### I decided to remove the ID, date, and user name from the data that will be used for the clustering model. The ID and user name will not help me because it doesnt have any added value to the model, it will just cause the model to only work with this data set because these unique ID cannot be generalised with other data.

#### I have also removed the content because we already got numerical interpretation of the content and that what we need for this model

#### I have removed the date because it might mislead the model by choosing clusters based on the date and thats not the aim of this dataset. The aim of this dataset is to find the cluster of review types based on the scores and content. This will help us later to identify how positive and how negative the review is.

## KMeans Clustering

In [5]:
# First data will be used to find clusters based on the review and scores
df_review_score = data[['score', 'compound_score']]

### First we create a new column to devine the target for a classification prediction, we will create 8 categories based on the score and content compound score.  The compound score measure how positive is the content, given, +1 is very positive and -1 is very negative:

#### 1 = Super negative,
#### 2 = Very Negative
#### 3 = Negative
#### 4 = Neutral Negative
#### 5 = Neutral Postitive
#### 6 = Postitive
#### 7 = Very Positive 
#### 8 = Super Postive



In [6]:
# First we create a list of our conditions

condetions = [
    # score equal to 1 is Super negative review
    (df_review_score['score'] == 1),
    # score equal to 2, and a compound_score lower or equal to 0 is a very negative review
    (df_review_score['score'] == 2) & (df_review_score['compound_score'] <= 0),
    # score equal to 2, and a compound_score bigger than 0 is a Negative review
    (df_review_score['score'] == 2) & (df_review_score['compound_score'] > 0),
    # score equal to 3, and a compound_score lower or equal to 0 is a Neutral Negative review
    (df_review_score['score'] == 3) & (df_review_score['compound_score'] <= 0),
    # score equal to 3, and a compound_score bigger than 0 is a Neutral positive review
    (df_review_score['score'] == 3) & (df_review_score['compound_score'] > 0),
    # score equal to 4, and a compound_score lower than 0 is a positive review
    (df_review_score['score'] == 4) & (df_review_score['compound_score'] <= 0),
    # score equal to 4, and a compound_score lower than 0 is a very positive review
    (df_review_score['score'] == 4) & (df_review_score['compound_score'] > 0),
    # score equal to 5 is Super positive review
    (df_review_score['score'] == 5)
]

# create a list of the values we want to assign for each condition respectively
categories = ['Super negative', 'Very negative', 'Negative', 'Neutral Negative', 'Neutral positive', 'Positive',
         'Very positive', 'Super positive']

categories_encoded = ['1', '2', '3', '4', '5', '6',
         '7', '8']

df_review_score['review_type'] = np.select(condetions, categories)
df_review_score['review_type_encoded'] = np.select(condetions, categories_encoded)
df_review_score



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_review_score['review_type'] = np.select(condetions, categories)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_review_score['review_type_encoded'] = np.select(condetions, categories_encoded)


Unnamed: 0,score,compound_score,review_type,review_type_encoded
0,2,0.2023,Negative,3
1,5,0.3612,Super positive,8
2,5,0.8442,Super positive,8
3,5,0.4404,Super positive,8
4,5,0.9806,Super positive,8
...,...,...,...,...
1298454,5,-0.4215,Super positive,8
1298455,5,0.4939,Super positive,8
1298456,1,-0.5499,Super negative,1
1298457,5,0.2958,Super positive,8


In [7]:
# Check if there is any missing values
df_review_score.isnull().sum()
# Perfect no missing values

score                  0
compound_score         0
review_type            0
review_type_encoded    0
dtype: int64

In [8]:
#Check the count of review types categories 
df_review_score['review_type'].value_counts()

Super positive      940057
Super negative      148294
Very positive        81952
Neutral positive     39292
Positive             37926
Negative             21163
Neutral Negative     19178
Very negative        10597
Name: review_type, dtype: int64

## Classification models

In [9]:
# Selecting the numeric columns as Machine Learning cannot handle categorical values without encoding
df_review_score_only_numeric = df_review_score[['score', 'compound_score', 'review_type_encoded']]

In [10]:
# Checking the information of the data
df_review_score_only_numeric.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1298459 entries, 0 to 1298458
Data columns (total 3 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   score                1298459 non-null  int64  
 1   compound_score       1298459 non-null  float64
 2   review_type_encoded  1298459 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 29.7+ MB


#### We will need to change the data type of the review type encoded to an integer so that it can be recognised as a number

In [11]:
# Changing the data type of review_type_encoded to an integer
df_review_score_only_numeric['review_type_encoded'] = df_review_score_only_numeric['review_type_encoded'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_review_score_only_numeric['review_type_encoded'] = df_review_score_only_numeric['review_type_encoded'].astype(int)


In [12]:
# Checking if the the review_type_encoded changed to an integer
df_review_score_only_numeric.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1298459 entries, 0 to 1298458
Data columns (total 3 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   score                1298459 non-null  int64  
 1   compound_score       1298459 non-null  float64
 2   review_type_encoded  1298459 non-null  int32  
dtypes: float64(1), int32(1), int64(1)
memory usage: 24.8 MB


#### Now we can start using the data to run a classification model and predict the review type based on the score and content compound score

## Feature Scaling

In [13]:
# importing the StandardScaler to scale the data because we are using Logistics regression
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df_scaled_independent = sc.fit_transform(df_review_score_only_numeric[['score', 'compound_score']])

In [14]:
# Checking the scaled dataset
df_scaled_independent

array([[-1.69922293, -0.34913103],
       [ 0.52935767,  0.04662976],
       [ 0.52935767,  1.24960309],
       ...,
       [-2.44208313, -2.22258143],
       [ 0.52935767, -0.11625731],
       [-2.44208313, -0.85298508]])

#### Splitting the data first into a training set and test set

In [15]:
# First we import train_test_split so that we divide the data into a training and test data
from sklearn.model_selection import train_test_split

# Setting the X values as the independent variables and y values as the dependent variable
X = df_scaled_independent[:, :2]
y = df_review_score_only_numeric['review_type_encoded']

# Splitting the data into training and test set
# I will set a side 20% of the data as a test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Creating a validation data set from the traning data
X_train_2, X_val, y_train_2, y_val = train_test_split(X_train, y_train, test_size = 0.2)


In [16]:
# Checking the shape of the the x and y data

# The training data that will be used for model traning
display(X_train_2.shape)
display(y_train_2.shape)

# The validation data that will be used for model evaluation
display(X_val.shape)
display(y_val.shape)


# The test data that will be used to test the model
display(X_test.shape)
display(y_test.shape)

(831013, 2)

(831013,)

(207754, 2)

(207754,)

(259692, 2)

(259692,)

## Samples

In [17]:
# These samples were used to speed up the process of modelling
# It helped in picking the right model, which is in this case KNeighborsClassifier
X_sample = X_train_2[:3000, :]
y_sample = y_train_2[:3000]
X_sample_v = X_val[:3000, :]
y_sample_v = y_val[:3000]


## Training and evaluating the model

In [18]:
from sklearn.neighbors import KNeighborsClassifier

KNeighbors = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 8)

# Fit the classifier to the training data
KNeighbors.fit(X_train_2, y_train_2)

# Predict the labels of the validation set: y_pred
y_pred = KNeighbors.predict(X_val)

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_val, y_pred)
print(cm)
accuracy_score(y_val, y_pred)

[[ 23640      0      0      0      0      0      0      0]
 [     0   1641      0      0      0      0      0      0]
 [     0      1   3410      0      0      0      0      0]
 [     0      0      0   3029      0      0      0      0]
 [     0      0      0      0   6371      0      0      0]
 [     0      0      0      0      0   5987      0      0]
 [     0      0      0      0      0      0  13123      0]
 [     0      0      0      0      0      0      0 150552]]


0.9999951866149388

## Testing the model on the test dataset for more accurate evaluations

In [20]:
y_pred_all = KNeighbors.predict(X_test)

In [21]:
cm = confusion_matrix(y_test, y_pred_all)
print(cm)
accuracy_score(y_test, y_pred_all)

[[ 29675      0      0      0      0      0      0      0]
 [     0   2138      0      0      0      0      0      0]
 [     0      2   4183      0      0      0      0      0]
 [     0      0      0   3686      0      0      0      0]
 [     0      0      0      0   7898      0      0      0]
 [     0      0      0      0      0   7734      0      0]
 [     0      0      0      0      0      0  16328      0]
 [     0      0      0      0      0      0      0 188048]]


0.9999922985690741

## The model is very accurate for the test and validation dataset so there is no need to do hyperparameter tuning or cross validation. However I will use Neural network to complete the requirements of this project

In [None]:
# Checking the shape of the dataset 
df_review_score_only_numeric.shape

In [None]:
# First we import Keras as it will be the library to use for Neural network
import keras

#### First we create an early stop for the model

In [None]:
# Importing EarlyStopping to stop the model from training of the model is not improving
from keras.callbacks import EarlyStopping

# setting up the stop after 3 tries of the model is not improving
# It is important because this will save a lot of time and prevent the model from over fitting
stop = EarlyStopping(monitor = 'val_loss', patience = 3)

In [None]:
# Importing the Sequential and Dense libraries
from keras.models import Sequential
from keras.layers import Dense

# Initiating the model Sequentialy 
model = Sequential()

# Creating the layers, using the activation='relu' and 1, activation='sigmoid' for the last layer
model.add(Dense(12, input_shape= (len(df_review_score_only_numeric.columns)-1,), activation='relu'))
model.add(Dense(30,activation='softmax'))
model.add(Dense(10,activation='softmax'))  
model.add(Dense(5,activation='softmax'))

# Using softmax because we are dealing with a calssification problem
model.add(Dense(1, activation='softmax'))

# Compiling the model and using binary_crossentropy because binary_crossentropy is used for classifications problems
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics=['accuracy'] )

In [None]:
model.fit(X_train_2, y_train_2, epochs=10, batch_size=32,
          validation_data= (X_val, y_val), callbacks=stop)