In [None]:
import nltk
# Fetch the NLTK 'omw-1.4' data package (nothing is re-downloaded when it is already up to date).
# NOTE(review): no cell visible in this notebook reads this package — confirm it is still needed.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset is later read from MyDrive under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install pyspark



In [None]:
pip install mlflow



# Import Library

In [None]:
# Import Libraries
import re
import pandas as pd
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
import spacy

import pyspark
from pyspark.sql import SparkSession
import mlflow
import sys
import time
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import mlflow.spark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

In [None]:
import os
import sys

# Set both PySpark interpreter variables to the interpreter running this kernel.
os.environ.update(
    dict.fromkeys(('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'), sys.executable)
)

# Spark Session

In [None]:
# Create (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("SentimentAnalysis")
    .getOrCreate()
)

In [None]:
spark

# Import Data

In [None]:
# Load the review dataset from Google Drive.
# Review text is free-form and can contain quoted line breaks. Without
# multiLine=True every physical line is parsed as its own record, which is how
# review text ends up in the `score` and `at` columns.
# NOTE(review): escape='"' assumes embedded quotes are written doubled ("") as
# in RFC 4180 style exports — confirm against the file.
df = spark.read.csv(
    '/content/drive/MyDrive/DataABD/Review Gojek.csv',
    header=True,
    inferSchema=True,
    multiLine=True,
    quote='"',
    escape='"',
)
df.show(5)

+------------------+-----+---------------+--------------------+
|          userName|score|             at|             content|
+------------------+-----+---------------+--------------------+
|     Chalid Ismail|    5|10/30/2022 8:20|    Really Satisfied|
|     irwan saputra|    5|10/30/2022 8:12|               Keren|
|kurniawati tanamal|    1|10/30/2022 8:10|tambahin feature ...|
|           ui hoon|    5|10/30/2022 8:05|                Good|
|  christian darren|    1|10/30/2022 7:53|Promo tidak bisa ...|
+------------------+-----+---------------+--------------------+
only showing top 5 rows



In [None]:
# Report the size of the loaded DataFrame: rows first, then columns
n_rows = df.count()
n_cols = len(df.columns)
print(f"Jumlah Baris: {n_rows}")
print(f"Jumlah Kolom: {n_cols}")

Jumlah Baris: 1041818
Jumlah Kolom: 4


In [None]:
df.describe().show()

+-------+--------------------+--------------------+--------------------+--------------------+
|summary|            userName|               score|                  at|             content|
+-------+--------------------+--------------------+--------------------+--------------------+
|  count|             1041815|             1038023|             1036831|             1036514|
|   mean|                 NaN|  3.8403839530102335|                 2.0|8.293103450151043...|
| stddev|                 NaN|  1.6597488974061831|                NULL|6.315839420527601...|
|    min|!!! LU TAU GAK IT...|                    |                    |                    |
|    max|               🤣🤣"|yg satu ramah dan...|uda malam2 pergi ...|                🫶🫶|
+-------+--------------------+--------------------+--------------------+--------------------+



In [None]:
df.printSchema()

root
 |-- userName: string (nullable = true)
 |-- score: string (nullable = true)
 |-- at: string (nullable = true)
 |-- content: string (nullable = true)



In [None]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

# Cast the score column to integer; the result is stored under the name "Score"
df = df.withColumn("Score", df["Score"].cast("int"))

In [None]:
df.printSchema()

root
 |-- userName: string (nullable = true)
 |-- Score: integer (nullable = true)
 |-- at: string (nullable = true)
 |-- content: string (nullable = true)



In [None]:
df.tail(5)

[Row(userName='Chenel Dakwah', Score=5, at='5/16/2024 5:14', content='Ramah dan tepat waktu'),
 Row(userName='Arafah HerCom', Score=5, at='5/16/2024 5:09', content='mantaf'),
 Row(userName='Iwan Kurniawan', Score=5, at='5/16/2024 5:06', content="Yang dekat aja, cs'nya cancelin mulu, saking gg sabarnya nunggu driver"),
 Row(userName='Siti Marifah', Score=5, at='5/16/2024 4:50', content='keren'),
 Row(userName='Alvino Ifandi', Score=5, at='5/16/2024 4:49', content='aplikasi yang bagus ini')]

# Data Preprocessing

In [None]:
# Discard every row that has a null in any column
df = df.na.drop()

# Preview the first 5 remaining rows
df.show(5)

+------------------+-----+---------------+--------------------+
|          userName|Score|             at|             content|
+------------------+-----+---------------+--------------------+
|     Chalid Ismail|    5|10/30/2022 8:20|    Really Satisfied|
|     irwan saputra|    5|10/30/2022 8:12|               Keren|
|kurniawati tanamal|    1|10/30/2022 8:10|tambahin feature ...|
|           ui hoon|    5|10/30/2022 8:05|                Good|
|  christian darren|    1|10/30/2022 7:53|Promo tidak bisa ...|
+------------------+-----+---------------+--------------------+
only showing top 5 rows



In [None]:
df.count()

1036445

In [None]:
# Number of duplicated rows = total rows minus fully distinct rows
total_rows = df.count()
unique_rows = df.dropDuplicates().count()
print("Jumlah duplikasi data:", total_rows - unique_rows)

Jumlah duplikasi data: 339604


In [None]:
# Keep a single copy of each fully identical row
df1 = df.dropDuplicates()

# Report how many rows remain after de-duplication and preview them
clean_row_count = df1.count()
print(f"Jumlah data bersih setelah menghilangkan duplikasi: {clean_row_count}")
df1.show(5)

Jumlah data bersih setelah menghilangkan duplikasi: 696841
+--------------+-----+----------------+--------------------+
|      userName|Score|              at|             content|
+--------------+-----+----------------+--------------------+
| Rizky Gunawan|    4|10/29/2022 12:49|Can you make keyp...|
|      Bill McG|    5| 10/24/2022 5:41|Always reliable b...|
|Rendzy Widjaya|    3| 10/17/2022 9:12|Aplikasi lemot, a...|
|   Agung Setya|    5| 10/17/2022 5:18|            membantu|
|           T E|    3|10/12/2022 23:56|Please remove aut...|
+--------------+-----+----------------+--------------------+
only showing top 5 rows



# Labelling

In [None]:
from pyspark.sql.functions import when

# Derive a "sentiment" label from the score:
#   above 3 -> "positif", exactly 3 -> "netral", below 3 -> "negatif".
# The branches are mutually exclusive; there is no otherwise() fallback.
df1 = df1.withColumn(
    "sentiment",
    when(df1["score"] > 3, "positif")
    .when(df1["score"] == 3, "netral")
    .when(df1["score"] < 3, "negatif"),
)

In [None]:
df1.show(5)

+--------------+-----+----------------+--------------------+---------+
|      userName|Score|              at|             content|sentiment|
+--------------+-----+----------------+--------------------+---------+
| Rizky Gunawan|    4|10/29/2022 12:49|Can you make keyp...|  positif|
|      Bill McG|    5| 10/24/2022 5:41|Always reliable b...|  positif|
|Rendzy Widjaya|    3| 10/17/2022 9:12|Aplikasi lemot, a...|   netral|
|   Agung Setya|    5| 10/17/2022 5:18|            membantu|  positif|
|           T E|    3|10/12/2022 23:56|Please remove aut...|   netral|
+--------------+-----+----------------+--------------------+---------+
only showing top 5 rows



In [None]:
# Keep only the rows whose sentiment label is set
df1_filtered = df1.where(df1["sentiment"].isNotNull())

In [None]:
# Show how many reviews carry each sentiment label (positif, netral, negatif)
df1_filtered.groupBy("sentiment").count().show()

+---------+------+
|sentiment| count|
+---------+------+
|  negatif|191011|
|  positif|468990|
|   netral| 36840|
+---------+------+



# Text Preprocessing

In [None]:
from pyspark.sql.functions import regexp_replace, lower
from pyspark.ml.feature import Tokenizer

from pyspark.sql.functions import trim  # local import: only this cell needs it

# Normalise the review text before tokenising. Patterns are raw strings so the
# regex backslashes reach Spark unchanged and Python does not warn about
# invalid escape sequences.
# Lower-casing runs first so the URL pattern also matches upper-case schemes.
cleaned_content = lower(df1['content'])

# Replace with a space, in order: URLs (http and https), @mentions, #hashtags,
# apostrophe-led fragments such as 's, and any character that is neither a
# word character nor whitespace.
for noise_pattern in (r'https?\S+', r'@\S+', r'#\S+', r"'\w+", r'[^\w\s]'):
    cleaned_content = regexp_replace(cleaned_content, noise_pattern, ' ')

# Collapse every whitespace run (including single tabs/newlines) into one space
# and strip the ends, so the spaces introduced by the replacements above do not
# reach the tokenizer as leading or trailing whitespace.
cleaned_content = trim(regexp_replace(cleaned_content, r'\s+', ' '))
df1 = df1.withColumn('content', cleaned_content)

# Split the cleaned text into tokens
tokenizer = Tokenizer(inputCol='content', outputCol='content_token')
df1 = tokenizer.transform(df1)

df1.show(5)

+--------------+-----+----------------+--------------------+---------+--------------------+
|      userName|Score|              at|             content|sentiment|       content_token|
+--------------+-----+----------------+--------------------+---------+--------------------+
| Rizky Gunawan|    4|10/29/2022 12:49|can you make keyp...|  positif|[can, you, make, ...|
|      Bill McG|    5| 10/24/2022 5:41|always reliable b...|  positif|[always, reliable...|
|Rendzy Widjaya|    3| 10/17/2022 9:12|aplikasi lemot ap...|   netral|[aplikasi, lemot,...|
|   Agung Setya|    5| 10/17/2022 5:18|            membantu|  positif|          [membantu]|
|           T E|    3|10/12/2022 23:56|please remove aut...|   netral|[please, remove, ...|
+--------------+-----+----------------+--------------------+---------+--------------------+
only showing top 5 rows



Tokenizer

In [None]:
from pyspark.ml.feature import Tokenizer

# Tokenise 'content' into a separate 'words' column on a new DataFrame
# (tokenized_df); df1 itself is not reassigned here.
tokenizer = Tokenizer().setInputCol("content").setOutputCol("words")
tokenized_df = tokenizer.transform(df1)

# Print the text next to its tokens without truncating long values
tokenized_df.select("content", "words").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|content                                                                                                                                                                                                                                                              |words                                                                                                                                                              

Filtering (StopWord Removal)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword data
nltk.download('stopwords')

# Custom words to drop in addition to the Indonesian stopword list
my_stopwords = ['gojek']

# Full stopword list: Indonesian stopwords followed by the custom ones
stopwords_list = [*stopwords.words("indonesian"), *my_stopwords]

# Freeze the stopwords into a set once, so each token lookup is a hash lookup
# instead of a scan over the whole list.
stopwords_set = frozenset(stopwords_list)


# UDF body: drop stopwords from one row's tokens
def remove_stopwords(tokens):
    """Return the tokens that are not stopwords, in their original order.

    Args:
        tokens: list of token strings for one row, or None when the row's
            token array is null.

    Returns:
        A new list without the tokens found in ``stopwords_set``, or None when
        ``tokens`` is None so a null row stays null instead of failing the job.
    """
    if tokens is None:
        return None
    return [token for token in tokens if token not in stopwords_set]


remove_stopwords_udf = udf(remove_stopwords, ArrayType(StringType()))

# Replace 'content_token' in df1 with its stopword-free version
df1 = df1.withColumn("content_token", remove_stopwords_udf("content_token"))

# Preview the first 5 rows
df1.show(5)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


+--------------+-----+----------------+--------------------+---------+--------------------+
|      userName|Score|              at|             content|sentiment|       content_token|
+--------------+-----+----------------+--------------------+---------+--------------------+
| Rizky Gunawan|    4|10/29/2022 12:49|can you make keyp...|  positif|[can, you, make, ...|
|      Bill McG|    5| 10/24/2022 5:41|always reliable b...|  positif|[always, reliable...|
|Rendzy Widjaya|    3| 10/17/2022 9:12|aplikasi lemot ap...|   netral|[aplikasi, lemot,...|
|   Agung Setya|    5| 10/17/2022 5:18|            membantu|  positif|          [membantu]|
|           T E|    3|10/12/2022 23:56|please remove aut...|   netral|[please, remove, ...|
+--------------+-----+----------------+--------------------+---------+--------------------+
only showing top 5 rows



Stemming

In [None]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m143.4/209.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the Sastrawi stemmer once; stem_tokens() reuses this single instance
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# UDF body: stem every token of one row
def stem_tokens(tokens):
    """Return the stem of each token, keeping order and length.

    Args:
        tokens: list of token strings for one row, or None when the row's
            token array is null.

    Returns:
        A new list holding ``stemmer.stem(token)`` for every token, or None
        when ``tokens`` is None so a null row stays null instead of failing
        the job.
    """
    if tokens is None:
        return None
    return [stemmer.stem(token) for token in tokens]

# Wrap stem_tokens as a Spark UDF that returns an array of strings
stem_tokens_udf = udf(f=stem_tokens, returnType=ArrayType(StringType()))

# Store the stemmed tokens in a new 'stemmed' column
df1 = df1.withColumn("stemmed", stem_tokens_udf("content_token"))

# Preview the first 5 rows
df1.show(5)

+--------------+-----+----------------+--------------------+---------+--------------------+--------------------+
|      userName|Score|              at|             content|sentiment|       content_token|             stemmed|
+--------------+-----+----------------+--------------------+---------+--------------------+--------------------+
| Rizky Gunawan|    4|10/29/2022 12:49|can you make keyp...|  positif|[can, you, make, ...|[can, you, make, ...|
|      Bill McG|    5| 10/24/2022 5:41|always reliable b...|  positif|[always, reliable...|[always, reliable...|
|Rendzy Widjaya|    3| 10/17/2022 9:12|aplikasi lemot ap...|   netral|[aplikasi, lemot,...|[aplikasi, lot, k...|
|   Agung Setya|    5| 10/17/2022 5:18|            membantu|  positif|          [membantu]|             [bantu]|
|           T E|    3|10/12/2022 23:56|please remove aut...|   netral|[please, remove, ...|[please, remove, ...|
+--------------+-----+----------------+--------------------+---------+--------------------+-----

In [None]:
# UDF body: join the tokens longer than 3 characters into one string
def join_tokens(tokens):
    """Join the tokens whose length exceeds 3 into a space-separated string.

    Args:
        tokens: list of token strings for one row, or None when the row's
            token array is null.

    Returns:
        The kept tokens joined by single spaces ('' when none qualify), or
        None when ``tokens`` is None so a null row stays null instead of
        failing the job.
    """
    if tokens is None:
        return None
    return ' '.join(token for token in tokens if len(token) > 3)

# Register join_tokens as a Spark UDF that returns a string
join_tokens_udf = udf(f=join_tokens, returnType=StringType())

# Build the new 'text_string' column from the stemmed tokens
df1 = df1.withColumn('text_string', join_tokens_udf('stemmed'))

# Display the result without truncating long values
df1.show(truncate=False)

+------------------+-----+----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Cast the 'text_string' column to the string type
df1 = df1.withColumn("text_string", df1["text_string"].cast(StringType()))

# Show the column types after the cast
df1.dtypes

[('userName', 'string'),
 ('Score', 'int'),
 ('at', 'string'),
 ('content', 'string'),
 ('sentiment', 'string'),
 ('content_token', 'array<string>'),
 ('stemmed', 'array<string>'),
 ('text_string', 'string')]

In [None]:
from pyspark.sql.types import IntegerType

# Cast the score column to integer and store it under the lower-case name 'score'
df1 = df1.withColumn("score", col("score").cast("int"))

# Show the column types after the cast
df1.dtypes

[('userName', 'string'),
 ('score', 'int'),
 ('at', 'string'),
 ('content', 'string'),
 ('sentiment', 'string'),
 ('content_token', 'array<string>'),
 ('stemmed', 'array<string>'),
 ('text_string', 'string')]

Word Count

In [None]:
from pyspark.sql.functions import udf, col, split, size, explode, count
from pyspark.sql.types import StringType

In [None]:
from pyspark.sql.functions import when  # local import so this cell does not depend on an earlier cell's import

# Count the words in 'text_string' for every row. Splitting an empty string
# yields one empty element, so an empty text is mapped to 0 explicitly instead
# of being reported as a single word.
text_col = col('text_string')
df1 = df1.withColumn(
    'word_count',
    when(text_col == '', 0).otherwise(size(split(text_col, ' '))),
)

# Display the result, including the per-row word count, without truncation
df1.show(truncate=False)

+------------------+-----+----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
from pyspark.sql.functions import explode, col

# Count how often each token occurs in the 'words' column of tokenized_df.
# Empty-string tokens are dropped so they are not ranked as a "word".
# NOTE(review): tokenized_df was built before stopword removal and stemming,
# so these counts describe the unfiltered tokens — confirm that is intended.
word_freq_df = (
    tokenized_df
    .select(explode("words").alias("word"))
    .where(col("word") != "")
    .groupBy("word")
    .count()
)

# Show the most frequent words first
word_freq_df.orderBy(col("count").desc()).show()

# Wordcloud

In [None]:
pip install wordcloud



Sentimen Positif

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Select the positive reviews (score above 3)
# NOTE(review): `df` is the frame from before de-duplication and text cleaning —
# confirm the raw reviews are what should be plotted.
df_p = df.filter(df['Score'] > 3)

# Pull the review texts to the driver and concatenate them into one big string
all_words_lem = " ".join(row['content'] for row in df_p.select('content').collect())

# Only draw the cloud when the text holds something other than whitespace
if not all_words_lem.strip():
    print("No text available to generate word cloud.")
else:
    # Build the word cloud from the positive review text
    wordcloud = WordCloud(
        background_color='white',
        width=800,
        height=500,
        random_state=21,
        max_font_size=130,
    ).generate(all_words_lem)

    # Render the cloud with matplotlib, hiding the axes
    plt.figure(figsize=(20, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

Sentimen Negatif

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Select the negative reviews (score below 3)
# NOTE(review): `df` is the frame from before de-duplication and text cleaning —
# confirm the raw reviews are what should be plotted.
df_p = df.filter(df['Score'] < 3)

# Pull the review texts to the driver and concatenate them into one big string
all_words_lem = " ".join(row['content'] for row in df_p.select('content').collect())

# Only draw the cloud when the text holds something other than whitespace
if not all_words_lem.strip():
    print("No text available to generate word cloud.")
else:
    # Build the word cloud from the negative review text
    wordcloud = WordCloud(
        background_color='white',
        width=800,
        height=500,
        random_state=21,
        max_font_size=130,
    ).generate(all_words_lem)

    # Render the cloud with matplotlib, hiding the axes
    plt.figure(figsize=(20, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


Sentimen Netral

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Select the neutral reviews (score exactly 3)
# NOTE(review): `df` is the frame from before de-duplication and text cleaning —
# confirm the raw reviews are what should be plotted.
df_p = df.filter(df['Score'] == 3)

# Pull the review texts to the driver and concatenate them into one big string
all_words_lem = " ".join(row['content'] for row in df_p.select('content').collect())

# Only draw the cloud when the text holds something other than whitespace
if not all_words_lem.strip():
    print("No text available to generate word cloud.")
else:
    # Build the word cloud from the neutral review text
    wordcloud = WordCloud(
        background_color='white',
        width=800,
        height=500,
        random_state=21,
        max_font_size=130,
    ).generate(all_words_lem)

    # Render the cloud with matplotlib, hiding the axes
    plt.figure(figsize=(20, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
