## Import Library

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import re
from google.colab import drive

## Loading Dataset

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the dataset
# path used when loading the CSV resolves.
drive.mount('/content/drive')

In [None]:
# Load the app-review CSV from the mounted Drive into the working DataFrame.
main_df = pd.read_csv('/content/drive/MyDrive/ulasan_aplikasi.csv')

In [None]:
# Print the column summary, then the number of rows that exactly duplicate
# an earlier row.
main_df.info()
print('Jumlah nilai terdupliasi: ', main_df.duplicated().sum())

In [None]:
# Preview the first rows of the raw data.
main_df.head()

In [None]:
# (rows, columns) of the loaded frame.
print(main_df.shape)

## Preprocessing Text

### **Case Folding**
Proses mengubah semua huruf dalam teks menjadi huruf kecil atau huruf besar agar konsisten. Misalnya, mengubah "TeKS" menjadi "teks" atau "TEKS".

In [None]:
# Lower-case every review so the same word in different casing is treated
# as one token in the later steps.
lowercased_reviews = main_df['Review'].str.lower()
main_df['Review'] = lowercased_reviews
main_df['Review']

### **Special Character Removal**
Menghapus karakter khusus atau simbol yang tidak relevan atau tidak diinginkan dari teks.
* Menghapus Angka
* Menghapus Tanda Baca
* Menghapus Garis Baru
* Menghapus Spasi Tambahan Di Awal dan Akhir Teks

In [None]:
# Remove digits
# The column is rebuilt positionally instead of looping over range(len(df))
# as .loc labels: label lookups only work while the index is exactly
# 0..n-1 (e.g. they break after rows have been dropped) and per-cell
# assignment is slow. str() keeps non-string cells (such as missing values)
# from raising, turning them into their text form.
_digits = re.compile(r'\d+')
main_df['Review'] = [_digits.sub('', str(review)) for review in main_df['Review']]
main_df['Review']

In [None]:
# Remove punctuation
# Raw string: the backslash before the comma is a literal character to strip,
# and a non-raw "\," is an invalid escape sequence that Python warns about.
punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
# Whole "@name" / "#tag" tokens have to be dropped before '@' and '#' are
# stripped below; once the marker character is gone only the bare word
# remains and it can no longer be recognised as a mention or hashtag.
_mention_or_hashtag = re.compile(r'[@#][A-Za-z0-9]+')
# One translation table deletes every punctuation character in a single pass
# per review instead of one str.replace call per character.
_punc_table = str.maketrans('', '', punc)
# Positional rebuild: does not depend on the index being 0..n-1.
main_df['Review'] = [
    _mention_or_hashtag.sub('', review).translate(_punc_table)
    for review in main_df['Review']
]
main_df['Review']

In [None]:
# Remove mentions and hashtags
# NOTE: this only has an effect while '@' and '#' are still present in the
# text; the punctuation list used in the previous cell contains both
# characters.
_mention = re.compile(r'@[A-Za-z0-9]+')
_hashtag = re.compile(r'#[A-Za-z0-9]+')
# Positional rebuild: does not depend on the index being 0..n-1, unlike a
# range(len(df)) loop over .loc labels.
main_df['Review'] = [
    _hashtag.sub('', _mention.sub('', review))
    for review in main_df['Review']
]
main_df['Review']

In [None]:
# Trim leading and trailing whitespace from every review
stripped_reviews = main_df['Review'].str.strip()
main_df['Review'] = stripped_reviews
main_df['Review']

### **Stopword Removal (Filtering)**
Menghapus kata-kata yang umumnya tidak memberikan nilai tambah dalam analisis teks, seperti "dan", "atau", "yang", dll.

#### Indonesia

#### English

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the NLTK stopword corpus and print the English list that
# filteringText() filters against.
nltk.download('stopwords')
print(stopwords.words('english'))

In [None]:
def filteringText(text):
    """Return the items of ``text`` that are not English stopwords.

    Args:
        text: Iterable of word tokens. A plain string would be walked
            character by character, so tokenize before calling.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    return [token for token in text if token not in english_stopwords]

def stemmingText(text):
    """Stem each whitespace-separated word of ``text`` with PorterStemmer.

    Args:
        text: Input string.

    Returns:
        The stemmed words joined back together with single spaces.
    """
    porter = PorterStemmer()
    return ' '.join(porter.stem(token) for token in text.split())

def toSentence(list_words):
    """Join an iterable of words into one space-separated string."""
    return ' '.join(list_words)

### **Stemming**
Proses menghapus imbuhan dari kata untuk mengembalikannya ke bentuk dasarnya. Misalnya, mengubah "berlari", "berlarian" menjadi "lari".


### **Tokenizing**
Proses membagi teks menjadi bagian-bagian lebih kecil yang disebut token.


### **Lemmatization**
Proses mengubah kata-kata ke bentuk dasarnya (lema) dengan mempertimbangkan konteks dan struktur bahasa. Misalnya, mengubah "menyanyikan" menjadi "nyanyi".

## Labeling

### English

Source: https://thecleverprogrammer.com/2021/11/24/add-labels-to-a-dataset-for-sentiment-analysis/

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the VADER lexicon resource used by SentimentIntensityAnalyzer.
nltk.download("vader_lexicon")

In [None]:
# Score every review with VADER and store the four components as columns.
sentiments = SentimentIntensityAnalyzer()
# polarity_scores() returns all four values in one dict, so each review is
# analysed once and the columns are sliced out of the cached results rather
# than re-running the analyser for every column.
_review_scores = [sentiments.polarity_scores(review) for review in main_df["Review"]]
main_df["Positive"] = [scores["pos"] for scores in _review_scores]
main_df["Negative"] = [scores["neg"] for scores in _review_scores]
main_df["Neutral"] = [scores["neu"] for scores in _review_scores]
main_df['Compound'] = [scores["compound"] for scores in _review_scores]
main_df.head()

In [None]:
# Turn the compound score into a class label: at or above 0.05 is Positive,
# at or below -0.05 is Negative, anything in between is Neutral.
score = main_df["Compound"].values
sentiment = [
    'Positive' if compound >= 0.05
    else 'Negative' if compound <= -0.05
    else 'Neutral'
    for compound in score
]
main_df["Sentiment"] = sentiment
main_df.head()

In [None]:
# Class balance of the labels derived from the compound score.
print(main_df['Sentiment'].value_counts())

### Indonesia

In [None]:
import csv
import requests
from io import StringIO

# Load the positive and negative word-score lexicons from GitHub

def _load_lexicon(url, label):
    """Download a ``word,score`` CSV and return it as a ``{word: int}`` dict.

    Args:
        url: Location of the comma-separated lexicon file.
        label: Name used in the failure message ("positive" / "negative").

    Returns:
        The parsed lexicon. It is empty when the download fails; the
        failure is reported on stdout and no exception is raised.

    Raises:
        ValueError: If a score field is not an integer.
    """
    lexicon = dict()
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        print(f"Failed to fetch {label} lexicon data")
        return lexicon

    if response.status_code != 200:
        print(f"Failed to fetch {label} lexicon data")
        return lexicon

    for row in csv.reader(StringIO(response.text), delimiter=','):
        # Blank lines parse to an empty row; skip anything without both fields.
        if len(row) < 2:
            continue
        lexicon[row[0]] = int(row[1])
    return lexicon


lexicon_positive = _load_lexicon(
    'https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv',
    'positive',
)
lexicon_negative = _load_lexicon(
    'https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv',
    'negative',
)

In [None]:
# Lexicon-based sentiment polarity for Indonesian text

def sentiment_analysis_lexicon_indonesia(text, positive_lexicon=None, negative_lexicon=None):
    """Sum the lexicon scores of the words in ``text`` and label the result.

    Args:
        text: Iterable of word tokens, or a single string, which is split on
            whitespace first (iterating a string directly would look up
            individual characters instead of words).
        positive_lexicon: Mapping of word to score. Defaults to the
            module-level ``lexicon_positive``.
        negative_lexicon: Mapping of word to score. Defaults to the
            module-level ``lexicon_negative``.

    Returns:
        ``(score, polarity)``. ``score`` is the sum of every matching entry
        from both lexicons; ``polarity`` is ``'positive'`` when the score is
        greater than or equal to 0 and ``'negative'`` otherwise. No
        ``'neutral'`` label is produced: a zero score counts as positive.
    """
    if positive_lexicon is None:
        positive_lexicon = lexicon_positive
    if negative_lexicon is None:
        negative_lexicon = lexicon_negative

    words = text.split() if isinstance(text, str) else text

    score = 0
    # Single pass so one-shot iterators (e.g. generators) are scored against
    # both lexicons. Entries from the negative lexicon are added as-is —
    # presumably they already carry negative values; verify against the data.
    for word in words:
        score += positive_lexicon.get(word, 0) + negative_lexicon.get(word, 0)

    polarity = 'positive' if score >= 0 else 'negative'

    return score, polarity

In [None]:
# Score every row with the lexicon function, then split the resulting
# (score, polarity) pairs into two parallel sequences, one per new column.
# NOTE(review): this reads a 'content' column while the rest of the notebook
# works on 'Review' — confirm the column exists in the CSV.
results = list(zip(*main_df['content'].apply(sentiment_analysis_lexicon_indonesia)))
polarity_scores, polarity_labels = results[0], results[1]
main_df['polarity_score'] = polarity_scores
main_df['polarity'] = polarity_labels
print(main_df['polarity'].value_counts())

# Ekstraksi Fitur

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Features are the cleaned review text; targets are the sentiment labels
# derived from the compound score.
X = main_df['Review']
y = main_df['Sentiment']

In [None]:
# Keep at most 200 terms. Per scikit-learn, an integer min_df is a document
# count (terms appearing in fewer than 17 reviews are ignored) and a float
# max_df is a proportion (terms appearing in more than 80% of reviews are
# ignored).
tfidf = TfidfVectorizer(max_features=200, min_df=17, max_df=0.8 )
# Learns the vocabulary/IDF weights from every review and returns the
# document-term matrix in one step.
X_tfidf = tfidf.fit_transform(X)

In [None]:
# Convert the TF-IDF matrix to a dense array and wrap it in a DataFrame with
# one column per vocabulary term, for inspection.
features_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
features_df

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): the vectorizer was fitted on all rows before this split, so
# vocabulary/IDF statistics from the test rows leak into training — consider
# splitting first and fitting the vectorizer on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

### Modeling (Machine Learning)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# Candidate classifiers, all constructed with default hyperparameters and
# keyed by the display name that is printed during evaluation.
models={'LogisticRegression()':LogisticRegression(),
        'K-Nearest Neighbors': KNeighborsClassifier(),
        'Decision Tree':DecisionTreeClassifier(),
        'Support Vector Machine(Linear Kernel)':LinearSVC(),
        'Support Vector Machine(Non-Linear Kernal)':SVC(),
        'Neural Network':MLPClassifier(),
        'Random Forest':RandomForestClassifier(),
        'Gradient Boosting':GradientBoostingClassifier()}

In [None]:
# Fit each candidate on the training split and print its name followed by
# the value returned by score() on the held-out split.
for name, model in models.items():
    print(name)
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print(test_score)