# Mobile Legends : Bang Bang Sentiment Analysis

## Import libraries

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

## Read the Scrapped Data

In [31]:
mlbb_reviews = pd.read_csv('mlbb_reviews.csv')
mlbb_reviews.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,4a2755ef-8989-4908-97fe-ef8bc8a7bdc1,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Bad matching system for ranked games. Both sid...,1,2170,1.8.93.9702,2024-08-31 11:13:31,,,1.8.93.9702
1,a9d7d176-323e-4231-a74a-e572986b9019,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,I'm really disappointed with Mobile Legends' p...,1,242,1.9.47.10372,2025-02-26 21:52:54,,,1.9.47.10372
2,afc81f0f-ba6f-4cde-b458-185ad8c1d8ac,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,If you want a MOBA game that is device friendl...,5,6064,1.9.48.10373,2025-03-03 21:01:22,,,1.9.48.10373
3,5dbeda4b-93d1-47e2-943a-f2401d78debd,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,I really love playing Brawl mode! I think it h...,4,489,1.9.47.10372,2025-02-14 17:09:57,,,1.9.47.10372
4,eec29d50-e45c-4815-ba6f-8e5b5bb6ddde,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Honestly the best moba game for mobile! the op...,5,700,1.9.48.10373,2025-03-17 12:21:10,,,1.9.48.10373


## Describe The Data

In [32]:
mlbb_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72000 entries, 0 to 71999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              72000 non-null  object
 1   userName              72000 non-null  object
 2   userImage             72000 non-null  object
 3   content               72000 non-null  object
 4   score                 72000 non-null  int64 
 5   thumbsUpCount         72000 non-null  int64 
 6   reviewCreatedVersion  57603 non-null  object
 7   at                    72000 non-null  object
 8   replyContent          1951 non-null   object
 9   repliedAt             1951 non-null   object
 10  appVersion            57603 non-null  object
dtypes: int64(2), object(9)
memory usage: 6.0+ MB


## Remove Empty Values

In [33]:
mlbb_reviews = mlbb_reviews.drop(columns=['replyContent', 'repliedAt'])
mlbb_reviews = mlbb_reviews.dropna()
mlbb_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 57603 entries, 0 to 71999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              57603 non-null  object
 1   userName              57603 non-null  object
 2   userImage             57603 non-null  object
 3   content               57603 non-null  object
 4   score                 57603 non-null  int64 
 5   thumbsUpCount         57603 non-null  int64 
 6   reviewCreatedVersion  57603 non-null  object
 7   at                    57603 non-null  object
 8   appVersion            57603 non-null  object
dtypes: int64(2), object(7)
memory usage: 4.4+ MB


## Text Preproccessing 

In [34]:
import re 
import string 
import nltk  # Import pustaka NLTK (Natural Language Toolkit).

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt', force=True) 
nltk.download('punkt_tab')
nltk.download('stopwords') 

def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text) # menghapus RT
    text = re.sub(r"http\S+", '', text) # menghapus link
    text = re.sub(r'[0-9]+', '', text) # menghapus angka
    text = re.sub(r'[^\w\s]', '', text) # menghapus karakter selain huruf dan angka
 
    text = text.replace('\n', ' ') # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', string.punctuation)) # menghapus semua tanda baca
    text = text.strip(' ') # menghapus karakter spasi dari kiri dan kanan teks
    return text
 
def casefoldingText(text): # Mengubah semua karakter dalam teks menjadi huruf kecil
    text = text.lower()
    return text
 
def tokenizingText(text): # Memecah atau membagi string, teks menjadi daftar token
    text = word_tokenize(text)
    return text
 
def filteringText(text): # Menghapus stopwords dalam teks
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords1 = set(stopwords.words('english'))
    listStopwords.update(listStopwords1)
    listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def stemmingText(text):
    ps = PorterStemmer()
    words = text.split()
    stemmed_words = [ps.stem(word) for word in words]
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text
    
    

 
def toSentence(list_words): # Mengubah daftar kata menjadi kalimat
    sentence = ' '.join(word for word in list_words)
    return sentence

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felixwilimz/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/felixwilimz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixwilimz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
mlbb_reviews['text_clean'] = mlbb_reviews['content'].apply(cleaningText)

mlbb_reviews['text_casefoldingText'] = mlbb_reviews['text_clean'].apply(casefoldingText)

mlbb_reviews['text_tokenizingText'] = mlbb_reviews['text_casefoldingText'].apply(tokenizingText)

mlbb_reviews['text_stopword'] = mlbb_reviews['text_tokenizingText'].apply(filteringText)

mlbb_reviews['final_text'] = mlbb_reviews['text_stopword'].apply(toSentence)

## Get Sentiments

In [36]:
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    sentiment = 1 if scores['pos'] > 0 else 0 
    return sentiment 


mlbb_reviews['sentiment'] = mlbb_reviews['final_text'].apply(get_sentiment)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/felixwilimz/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Train and Evaluate Data 

In [39]:
X = mlbb_reviews['final_text']
y = mlbb_reviews['sentiment']

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=200, min_df=17, max_df=0.8, stop_words=None)
X_tfidf = tfidf.fit_transform(X)

features_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

features_df

Unnamed: 0,account,afk,almost,already,also,always,annoying,another,anymore,app,...,wifi,win,without,wont,work,worse,worst,would,years,youre
0,0.0,0.000000,0.000000,0.0,0.000000,0.501784,0.0,0.26674,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.000000,0.229215,0.0,0.157526,0.000000,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.000000,0.000000,0.0,0.154682,0.000000,0.0,0.00000,0.0,0.0,...,0.0,0.191829,0.000000,0.0,0.219878,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.147898,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.153381,0.0,0.0
4,0.0,0.000000,0.000000,0.0,0.221435,0.000000,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57598,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.295263,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
57599,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.196263,0.0,0.0
57600,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
57601,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0


Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score, classification_report, confusion_matrix

naive_bayes = BernoulliNB()

naive_bayes.fit(X_train.toarray(), y_train)

y_pred_train_nb = naive_bayes.predict(X_train.toarray())
y_pred_test_nb = naive_bayes.predict(X_test.toarray())

accuracy_train_nb = naive_bayes.score(X_train.toarray(), y_train)
accuracy_test_nb = naive_bayes.score(X_test.toarray(), y_test)

print(f"Train Accuracy: {accuracy_train_nb}")
print(f"Test Accuracy: {accuracy_test_nb}")

print(f"F1 Score: {f1_score(y_test, y_pred_test_nb)}")
print(classification_report(y_test, y_pred_test_nb))


Train Accuracy: 0.9381971268608134
Test Accuracy: 0.940543355611492
F1 Score: 0.968220830433774
              precision    recall  f1-score   support

           0       0.48      0.62      0.54       648
           1       0.98      0.96      0.97     10873

    accuracy                           0.94     11521
   macro avg       0.73      0.79      0.75     11521
weighted avg       0.95      0.94      0.94     11521



Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()

log_reg.fit(X_train, y_train)

y_pred_train_lr = log_reg.predict(X_train)
y_pred_test_lr = log_reg.predict(X_test)

accuracy_train_lr = log_reg.score(X_train, y_train)
accuracy_test_lr = log_reg.score(X_test, y_test)

print(f"Train Accuracy: {accuracy_train_lr}")
print(f"Test Accuracy: {accuracy_test_lr}")

print(f"F1 Score: {f1_score(y_test, y_pred_test_lr)}")
print(classification_report(y_test, y_pred_test_lr))

Train Accuracy: 0.9491124517165054
Test Accuracy: 0.9477475913549172
F1 Score: 0.9729486833827626
              precision    recall  f1-score   support

           0       0.66      0.14      0.24       648
           1       0.95      1.00      0.97     10873

    accuracy                           0.95     11521
   macro avg       0.81      0.57      0.60     11521
weighted avg       0.94      0.95      0.93     11521



Deep Learning 

In [48]:
%pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (4.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.2.1-py3-none-any.whl.metadata (2.4 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloa

In [54]:
import tensorflow as tf 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

X_train_reshaped = X_train.toarray().reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_reshaped = X_test.toarray().reshape((X_test.shape[0], X_test.shape[1], 1))

model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(X_train_reshaped.shape[1], 1)),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train_reshaped, y_train, epochs=5, batch_size=32, validation_split=0.2)

loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Epoch 1/5
[1m1153/1153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 122ms/step - accuracy: 0.9385 - loss: 0.2513 - val_accuracy: 0.9446 - val_loss: 0.2143
Epoch 2/5
[1m1153/1153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 130ms/step - accuracy: 0.9449 - loss: 0.2285 - val_accuracy: 0.9446 - val_loss: 0.2152
Epoch 3/5
[1m1153/1153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 135ms/step - accuracy: 0.9433 - loss: 0.2296 - val_accuracy: 0.9446 - val_loss: 0.2147
Epoch 4/5
[1m1153/1153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 137ms/step - accuracy: 0.9424 - loss: 0.2283 - val_accuracy: 0.9446 - val_loss: 0.2142
Epoch 5/5
[1m1153/1153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 138ms/step - accuracy: 0.9449 - loss: 0.2196 - val_accuracy: 0.9446 - val_loss: 0.2151
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 40ms/step - accuracy: 0.9450 - loss: 0.2139
Test Loss: 0.21752749383449554
Test Accuracy: 0.943754