# Install libraries

In [None]:
!pip install scikit-learn transformers torch pandas numpy wordcloud matplotlib Sastrawi

# Download training dataset

In [None]:
# Download IndoNLP dataset (IndoNLU SmSA sentiment data, training split, TSV)
!wget "https://raw.githubusercontent.com/IndoNLP/indonlu/refs/heads/master/dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv"

# Download cahkanor dataset (Indonesian tourism review sentiment data, TSV)
!wget "https://raw.githubusercontent.com/cahkanor/id-tourism-sentimentanalysis/refs/heads/master/id-tourism-sentimentanalysis.tsv"

# Download Sastrawi root-word list (optional, only needed if you want normalization)
# NOTE(review): no cell visible in this notebook reads kata-dasar.txt — confirm it is still needed.
!wget "https://raw.githubusercontent.com/sastrawi/sastrawi/refs/heads/master/data/kata-dasar.txt"

# Download All_Reviews.csv (personal dataset: combined reviews of 10 waterfalls in Greater Bandung)
!wget "https://raw.githubusercontent.com/dani-fadli/google-maps-reviews-scraper/refs/heads/main/datasets/All_Reviews.csv"

# (Optional) Load model
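By loading the saved model (`svm_model.joblib`) you can skip the SVM training and go to the Predict sentiment step. The cells that import the libraries, select the device, define `preprocess_pipeline`, load the tokenizer/BERT model, and define `get_bert_embeddings` must still be run first, because the prediction step uses all of them.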

In [None]:
# Download saved model from GitHub
# !wget "https://github.com/dani-fadli/google-maps-reviews-scraper/raw/main/svm_model.joblib"

# Load model
# import joblib

# Load the model from file
# svm_model = joblib.load('svm_model.joblib')

# Load libraries

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover

# Prefer to use GPU instead of CPU

In [None]:
# Pick the CUDA device when PyTorch can see one, otherwise stay on the CPU.
# (In Colab, switch the runtime type to GPU for better performance.)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


# Data preprocessing

In [None]:
# Informal / abbreviated spellings mapped to their standard Indonesian form.
# Keys are looked up one whitespace-separated token at a time.
SLANG_DICT = {
    # praise / intensifiers
    "bgs": "bagus", "bgt": "banget",
    # every spelling of "no / not" collapses to "tidak"
    "ngga": "tidak", "ga": "tidak", "gk": "tidak",
    "tdk": "tidak", "nggak": "tidak", "gak": "tidak",
    # elongated or blended words
    "pokoknyaa": "pokoknya", "mantul": "mantap betul",
    # common abbreviations
    "aja": "saja", "udh": "sudah", "lg": "lagi", "sm": "sama",
    "dr": "dari", "org": "orang",
    "kesini": "ke sini", "jauh2": "jauh-jauh",
    "tp": "tapi", "jg": "juga", "dlm": "dalam", "utk": "untuk",
    "bnyk": "banyak", "yg": "yang",
    # ...add more slang dictionary...
}

# Stopwords = Sastrawi's list plus a few project-specific extras
def load_all_stopwords(for_wordclouds=False):
    """Build the set of stopwords to filter out of a tokenised review.

    Args:
        for_wordclouds: When truthy, the full Sastrawi list is used. When falsy
            (the default), every word that appears as a value in SLANG_DICT is
            taken out of the Sastrawi list first, so those words are not
            treated as stopwords.

    Returns:
        set: The Sastrawi stopwords (possibly reduced as described above)
        together with the custom stopwords "yg" and "nya".
    """
    extra_words = {"yg", "nya"}
    base_words = set(StopWordRemoverFactory().get_stop_words())

    if not for_wordclouds:
        # Only whole values are removed: a multi-word value such as "ke sini"
        # does not un-stop its individual words.
        base_words = base_words - set(SLANG_DICT.values())

    return base_words | extra_words

# Text preprocessing pipeline (the stopword set is loaded inside the function)

def preprocess_pipeline(text: str, for_wordclouds=False) -> str:
    """Clean one review: lowercase, strip noise, normalise slang, drop stopwords.

    Steps, in order: case folding; removal of URLs, @mentions and '#' signs;
    removal of punctuation; whitespace tokenisation; slang normalisation via
    SLANG_DICT; digit removal; stopword filtering.

    Slang lookup happens on the token *before* its digits are stripped, so
    dictionary keys that contain digits (e.g. "jauh2") can match. Tokens that
    are not slang keys have their digits removed and are then looked up once
    more, so e.g. "g4k" still normalises like "gk".

    Args:
        text: Raw review text. Any non-string value (None, or the float NaN
            pandas uses for empty cells) is treated as an empty review.
        for_wordclouds: Forwarded to load_all_stopwords() to select which
            stopword set is applied.

    Returns:
        The cleaned text, tokens joined by single spaces ("" if nothing is left).
    """
    # Missing values have no text to clean; avoid crashing on .lower().
    if not isinstance(text, str):
        return ""

    # Step 1: case folding
    text = text.lower()

    # Step 2: remove noise
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # URLs
    text = re.sub(r'\@\w+|\#', '', text)  # mentions and the '#' of hashtags

    # Step 3: remove punctuation.
    # Keeps word characters (letters, digits, underscore) and whitespace;
    # digits are removed per token below.
    text = re.sub(r'[^\w\s]', '', text)

    # Steps 4-5: tokenise, then normalise slang and strip digits
    normalized_tokens = []
    for token in text.split():
        if token not in SLANG_DICT:
            # Not a known slang key as written: drop digits and retry the lookup.
            token = re.sub(r'\d+', '', token)
            if not token:
                continue  # the token consisted of digits only
        # .get() falls back to the token itself when it is not in the dictionary
        normalized_tokens.append(SLANG_DICT.get(token, token))

    # Step 6: remove stopwords.
    # A multi-word replacement (e.g. "ke sini") is compared as one token.
    all_stopwords = load_all_stopwords(for_wordclouds)
    final_tokens = [token for token in normalized_tokens if token not in all_stopwords]

    # Step 7: join back into a single string
    return " ".join(final_tokens).strip()

# Quick smoke test: run the pipeline on a handful of hand-written reviews
# and show the raw text next to the cleaned text.
reviews_data = {
    'raw_data': [
        "Tempatnya bgs bgt! Gk nyesel kesini",
        "Pelayanannya buruk, ga bakal kesini lg 😠",
        "Makanannya enak sm harganya murah bgt!",
        "saya Makan nasi di warung dan minum nya es teh yg manis",
        "tempatnya bgs bgt, ngga nyesel deh pokoknyaa mantul",
        "Viewnya kerennn, airnya seger. Cuman sayang bnyk sampah. Tolong dong pengelolanya."
    ]
}

df = pd.DataFrame(reviews_data)
df['preprocessed_data'] = [preprocess_pipeline(raw_review) for raw_review in df['raw_data']]
print(df)

# Load dataset and load BERT tokenizer

In [None]:
# First dataset: TSV without a header row, so pandas labels its columns 0, 1, ...
# (adjust the path to wherever the dataset was saved)
training_data = pd.read_csv("train_preprocess.tsv", sep="\t", header=None)

# Second dataset: TSV with a header row. Keep only the text and label columns
# and relabel them 0 / 1 so they line up with training_data when concatenated.
# NOTE(review): assumes 'sentiment' uses the same label values as the first dataset — confirm.
tourism_data = pd.read_csv("id-tourism-sentimentanalysis.tsv", sep="\t")[['review', 'sentiment']]
tourism_data.columns = [0, 1]

# Stack both corpora into a single frame with a fresh 0..n-1 index
combined_data = pd.concat([training_data, tourism_data], ignore_index=True)

# Column 0 holds the text, column 1 the label
texts = combined_data[0].tolist()
labels = combined_data[1].tolist()

# Tokenizer and encoder come from the same checkpoint; the encoder is moved to `device`
checkpoint = "indobenchmark/indobert-large-p1"  # lite, base, large
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint).to(device)

# Model citation
# @inproceedings{wilie2020indonlu,
#   title={IndoNLU: Benchmark and Resources for Evaluating Indonesian Natural Language Understanding},
#   author={Bryan Wilie and Karissa Vincentio and Genta Indra Winata and Samuel Cahyawijaya and X. Li and Zhi Yuan Lim and S. Soleman and R. Mahendra and Pascale Fung and Syafri Bahar and A. Purwarianti},
#   booktitle={Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing},
#   year={2020}
# }

# Run BERT embeddings

In [None]:
# Function to get embeddings in batch
def get_bert_embeddings(text_list, tokenizer, model, batch_size=16):
    """Encode texts into one vector each, using the model's first-token hidden state.

    Texts are tokenised in batches (padded, truncated to 128 tokens) and run
    through `model` without gradient tracking. The hidden state at position 0
    of the last layer (the [CLS] position for BERT-style tokenizers) is used as
    the embedding.

    Args:
        text_list: Sequence of strings (list, tuple, pandas Series, ...).
        tokenizer: Callable tokenizer returning a mapping of tensors when
            called with `return_tensors="pt"`.
        model: torch module whose output exposes `last_hidden_state`.
        batch_size: Number of texts per forward pass.

    Returns:
        np.ndarray of shape (len(text_list), hidden_size). For empty input the
        result has shape (0, hidden_size), with hidden_size read from
        `model.config.hidden_size` when available (0 otherwise), so the result
        is always 2-D.
    """
    # Materialise as a list so slicing always yields what the tokenizer expects.
    texts = list(text_list)
    model.eval()

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-level global; fall back to CPU for parameter-less models.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    if not texts:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    batch_arrays = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]
        tokens = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
        tokens = {key: val.to(target_device) for key, val in tokens.items()}
        with torch.no_grad():
            outputs = model(**tokens)
        # Position 0 of every sequence in the batch -> (batch, hidden_size)
        batch_arrays.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(batch_arrays, axis=0)

# Feature matrix: one embedding row per training text; targets: the matching labels
X = get_bert_embeddings(text_list=texts, tokenizer=tokenizer, model=model)
y = np.asarray(labels)

# Hold out 20% of the samples for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train SVM

In [None]:
# Fit a support vector classifier on the training embeddings
# (kernel options: linear, poly, rbf)
svm_model = SVC(kernel='rbf').fit(X_train, y_train)

# Score the held-out split and print per-class precision / recall / F1
y_pred = svm_model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

# Predict sentiment

In [None]:
# Load scraped reviews
your_data = pd.read_csv("All_Reviews.csv")

# Keep only rows that carry real review text.
# Empty CSV cells are read as NaN, which isin([... None]) is not guaranteed to
# match, so missing values are dropped explicitly with notna().
review_column = your_data["review"]
has_review_text = review_column.notna() & ~review_column.isin(["Review not found", ""])

# .copy() makes the filtered frame independent of the original, so adding the
# prediction column below does not trigger SettingWithCopyWarning.
your_data = your_data[has_review_text].copy()
review_texts = your_data["review"].apply(preprocess_pipeline).tolist()

# Convert scraped reviews to embeddings
X_scraped = get_bert_embeddings(review_texts, tokenizer, model)

# Predict sentiment for scraped reviews
y_scraped_pred = svm_model.predict(X_scraped)
your_data["predicted_sentiment"] = y_scraped_pred

# Save results
your_data.to_csv("classified_reviews.csv", index=False)


# Make wordclouds

In [None]:
# Wordcloud per label
# Titles are shown in Indonesian. The sentiment-distribution cell of this
# notebook indexes the predictions by 'positive' / 'neutral' / 'negative', so
# those label names are mapped here in addition to the numeric codes 0 / 1 / 2.
label_map = {
    0: "Negatif", 1: "Netral", 2: "Positif",
    "negative": "Negatif", "neutral": "Netral", "positive": "Positif",
}
for label in sorted(your_data["predicted_sentiment"].unique()):
    subset = your_data[your_data["predicted_sentiment"] == label]
    text = " ".join(subset["review"].apply(preprocess_pipeline, for_wordclouds=True).tolist())

    # Nothing left after preprocessing: skip this label rather than passing an
    # empty string to WordCloud (which is expected to fail on it).
    if not text.strip():
        print(f"No words left for label {label_map.get(label, label)}; skipping word cloud.")
        continue

    wordcloud = WordCloud(width=800, height=400, background_color="white", collocations=False).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Word Cloud - Sentimen {label_map.get(label, label)}")
    plt.show()

# Visualize sentiment distributions

In [None]:
# Number of reviews per predicted label
sentiment_counts = your_data['predicted_sentiment'].value_counts()

print("Sentiment Distribution Counts:")
print(sentiment_counts)

# Reorder sentiment counts.
# fill_value=0 gives a class with no predictions a count of 0 instead of NaN;
# NaN cannot be formatted by the '%d' bar labels below.
ordered_sentiment_counts = sentiment_counts.reindex(
    ['positive', 'neutral', 'negative'], fill_value=0
)

plt.figure(figsize=(8, 6))
ax = ordered_sentiment_counts.plot(kind='bar')
plt.title('Distribution of Predicted Sentiments')
plt.xlabel('Sentiment')
plt.ylabel('Number of Reviews')
plt.xticks(ticks=[0, 1, 2], labels=['Positive', 'Neutral', 'Negative'], rotation=0)

# Add data labels to the bars
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

plt.show()

# Save model

In [None]:
import joblib

# Persist the trained classifier to 'svm_model.joblib' in the working directory
joblib.dump(svm_model, filename='svm_model.joblib')
print("Model has been saved!")