<a href="https://colab.research.google.com/github/geraldalivia/LikeJennie-Decoding-Emotions-in-the-Spotlight/blob/main/Data_Scrapping_LikeJennie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SENTIMENT ANALYSIS
JENNIE - Like Jennie

## Install Dependencies and Import Libraries

In [None]:
# Install dependencies
!pip install pandas numpy scikit-learn



In [None]:
# Library
import html
import re
import string

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

## Load Dataset

In [None]:
# Load the dataset
# Google Drive file IDs of the source CSVs, keyed by a short source label.
# Only the values (the IDs) are used when downloading.
files = {
    "mv": "1eeyZjQrqP0DJPccsMyU3U8UY57MKB5NM",
    "coachella": "1fVxuDUfpoJc7fJAX99GxjrgO4qjCNxo3",
    "npop": "1D60CO39z2BpHlIakdPaW6uScBkz-xsSS"
}

def read_drive_csv(file_id):
    """Read a CSV hosted on Google Drive.

    Args:
        file_id: Google Drive file ID; it is interpolated into the
            ``https://drive.google.com/uc?id=...`` URL.

    Returns:
        The result of ``pd.read_csv`` for that URL.
    """
    return pd.read_csv(f"https://drive.google.com/uc?id={file_id}")

# Download one DataFrame per Drive file ID listed in `files`
df = list(map(read_drive_csv, files.values()))

# Combine the per-source frames into a single table with a fresh index
data = pd.concat(objs=df, ignore_index=True)

# Save the combined dataset to the current working directory
data.to_csv(path_or_buf="dataset.csv", index=False)

In [None]:
# Show the first 5 rows of the data
data.head()

Unnamed: 0,comment_id,author,text,like_count,published_at,updated_at
0,UgxSA8Gw__bIpDyLzKR4AaABAg,@YouTube,☝ we wanna rock with jennie,235844,2025-03-07T18:29:47Z,2025-03-07T18:29:47Z
1,UgzVH3NG9V1pNF8iV5t4AaABAg,@KylieRubyjane,that capybara being jennie at the end!!! ICONI...,133624,2025-03-07T05:09:24Z,2025-03-07T05:09:24Z
2,UgxxOVNbkoYyD0eZ1HJ4AaABAg,@Iuvvria,this is the first time i&#39;ve been THIS exci...,73395,2025-03-06T17:02:46Z,2025-03-06T17:02:46Z
3,UgwnYjf5GDWdmpOPgZZ4AaABAg,@YocelynBegazo-j6w,Crazy how all this songs in this album can be ...,7,2025-05-25T23:49:01Z,2025-05-25T23:49:01Z
4,UgwSmmxWYPX-u8-V75F4AaABAg,@Yellenascovers,BROOO THE CAPYBARA ENDED MEEEEE.<br>But the so...,325,2025-03-07T10:29:04Z,2025-03-07T10:29:04Z


In [None]:
# Review the row count, columns, and data types of the dataset with info()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3793 entries, 0 to 3792
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   comment_id    3793 non-null   object
 1   author        3791 non-null   object
 2   text          3793 non-null   object
 3   like_count    3793 non-null   int64 
 4   published_at  3793 non-null   object
 5   updated_at    3793 non-null   object
dtypes: int64(1), object(5)
memory usage: 177.9+ KB


## Preprocessing Data

In [None]:
# Drop every column except author and text
columns_to_keep = ['text', 'author']
columns_to_drop = [name for name in data.columns if name not in columns_to_keep]
data = data.drop(columns=columns_to_drop)
data.head()

Unnamed: 0,author,text
0,@YouTube,☝ we wanna rock with jennie
1,@KylieRubyjane,that capybara being jennie at the end!!! ICONI...
2,@Iuvvria,this is the first time i&#39;ve been THIS exci...
3,@YocelynBegazo-j6w,Crazy how all this songs in this album can be ...
4,@Yellenascovers,BROOO THE CAPYBARA ENDED MEEEEE.<br>But the so...


In [None]:
# Preprocess text
def preprocess_text(text):
    """Normalise a raw comment into lowercase, letters-only, space-separated text.

    Steps, in order: replace HTML tags with a space, decode HTML entities,
    lowercase, remove URLs, remove @mentions and #hashtags, remove every
    character that is not an ASCII letter or whitespace, collapse whitespace.

    Args:
        text: Raw comment. Missing values (``None`` / ``NaN``) are accepted.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    if pd.isna(text):
        return ""

    text = str(text)
    # Tags such as <br> separate words, so swap them for a space instead of
    # deleting only the angle brackets (which would glue "br" into the text).
    # A letter is required after "<" so plain text like "<3" is left alone.
    text = re.sub(r'</?[a-zA-Z][^>]*>', ' ', text)
    # Decode numeric and named entities (&#39;, &amp;, &quot;, ...) so that no
    # "amp"/"quot" residue survives the punctuation filter below.
    text = html.unescape(text)
    text = text.lower()  # to lowercase
    # Anchor the patterns so ordinary words containing "www" (e.g. "awwww")
    # are not mistaken for URLs.
    text = re.sub(r'https?://\S+|\bwww\.\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+|#\w+', '', text)  # remove mentions and hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove punctuation and numbers
    text = ' '.join(text.split())  # remove extra whitespace

    return text

In [None]:
# Add a clean_text column by applying preprocess_text to every raw comment
data['clean_text'] = data['text'].apply(preprocess_text)

In [None]:
# Assign labels
# Keyword lexicons used to label comments by rule.
positive_words = [
    "love", "amazing", "awesome", "beautiful", "great", "good", "nice", "incredible",
    "cool", "sweet", "like", "wonderful", "perfect", "adorable", "favorite", "talented",
    "legend", "slay", "iconic", "queen", "happy", "enjoy", "best", "vibe", "fire", "insane"
]

negative_words = [
    "hate", "bad", "worst", "boring", "ugly", "terrible", "awful", "annoying", "lame",
    "dislike", "cringe", "trash", "garbage", "weak", "overrated", "disappointed", "sucks",
    "poor", "mediocre", "meh", "waste", "stupid", "noisy", "fail"
]


def _count_lexicon_hits(tokens, lexicon):
    """Count the lexicon entries that begin at least one of the given tokens."""
    return sum(any(token.startswith(word) for token in tokens) for word in lexicon)


def assign_label(text):
    """Label a cleaned comment as "positif", "negatif" or "netral" by keyword rule.

    A lexicon entry counts once when some whitespace-separated token starts
    with it. Matching at the start of a token still catches inflected or
    elongated forms ("loved", "iconiccc") but no longer fires inside unrelated
    words: plain substring matching counted "like" inside "dislike"/"unlike",
    "weak" inside "tweak", "lame" inside "flame", and so on.

    Args:
        text: Comment text (a ``str``); it is lowercased before matching.

    Returns:
        "positif" when more positive than negative entries match, "negatif"
        for the opposite, and "netral" on a tie (including no matches).
    """
    tokens = text.lower().split()
    pos_count = _count_lexicon_hits(tokens, positive_words)
    neg_count = _count_lexicon_hits(tokens, negative_words)

    if pos_count > neg_count:
        return "positif"
    elif neg_count > pos_count:
        return "negatif"
    else:
        return "netral"

# Label every cleaned comment using assign_label
data['label'] = data['clean_text'].apply(assign_label)

In [None]:
# Preview the first rows, now including the clean_text and label columns
data.head()

Unnamed: 0,author,text,clean_text,label
0,@YouTube,☝ we wanna rock with jennie,we wanna rock with jennie,netral
1,@KylieRubyjane,that capybara being jennie at the end!!! ICONI...,that capybara being jennie at the end iconiccc,positif
2,@Iuvvria,this is the first time i&#39;ve been THIS exci...,this is the first time ive been this excited f...,netral
3,@YocelynBegazo-j6w,Crazy how all this songs in this album can be ...,crazy how all this songs in this album can be ...,positif
4,@Yellenascovers,BROOO THE CAPYBARA ENDED MEEEEE.<br>But the so...,brooo the capybara ended meeeeebrbut the song ...,netral


In [None]:
# Separate features and target
X = data['clean_text']  # cleaned comment text (model input)
y = data['label']  # rule-based sentiment label (model target)

## First Schema

In [None]:
# SVM + TF-IDF with an 80/20 split
# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned
# from the training rows only. Fitting the vectorizer on all of X before the
# split leaks test-set statistics into the features.
X_train1_text, X_test1_text, y_train1, y_test1 = train_test_split(
    X, y, test_size=0.2, random_state=42
)

first_model = TfidfVectorizer()
X_train1 = first_model.fit_transform(X_train1_text)
X_test1 = first_model.transform(X_test1_text)
# Full-corpus matrix, kept only so the name X1 stays defined; not used for training.
X1 = first_model.transform(X)

clf1 = SVC(kernel='linear')
clf1.fit(X_train1, y_train1)
pred1 = clf1.predict(X_test1)
print("\n[Skema 1] SVM + TF-IDF")
print("Akurasi:", accuracy_score(y_test1, pred1))
print(classification_report(y_test1, pred1))


[Skema 1] SVM + TF-IDF
Akurasi: 0.9367588932806324
              precision    recall  f1-score   support

     negatif       0.00      0.00      0.00         9
      netral       0.92      0.99      0.95       495
     positif       0.97      0.87      0.92       255

    accuracy                           0.94       759
   macro avg       0.63      0.62      0.62       759
weighted avg       0.93      0.94      0.93       759



## Second Schema

In [None]:
# Random Forest + TF-IDF (unigrams and bigrams) with a 70/30 split
# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned
# from the training rows only. Fitting the vectorizer on all of X before the
# split leaks test-set statistics into the features.
X_train2_text, X_test2_text, y_train2, y_test2 = train_test_split(
    X, y, test_size=0.3, random_state=42
)

second_model = TfidfVectorizer(ngram_range=(1,2))
X_train2 = second_model.fit_transform(X_train2_text)
X_test2 = second_model.transform(X_test2_text)
# Full-corpus matrix, kept only so the name X2 stays defined; not used for training.
X2 = second_model.transform(X)

# Fixed seed so the forest, and therefore the reported scores, are reproducible.
clf2 = RandomForestClassifier(random_state=42)
clf2.fit(X_train2, y_train2)
pred2 = clf2.predict(X_test2)
print("\n[Skema 2] Random Forest + TF-IDF")
print("Akurasi:", accuracy_score(y_test2, pred2))
print(classification_report(y_test2, pred2))


[Skema 2] Random Forest + TF-IDF
Akurasi: 0.9235500878734623
              precision    recall  f1-score   support

     negatif       0.00      0.00      0.00        13
      netral       0.91      0.99      0.95       750
     positif       0.96      0.83      0.89       375

    accuracy                           0.92      1138
   macro avg       0.62      0.60      0.61      1138
weighted avg       0.92      0.92      0.92      1138



## Third Schema

In [None]:
# Logistic Regression + TF-IDF with an 80/20 split
# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned
# from the training rows only. Fitting the vectorizer on all of X before the
# split leaks test-set statistics into the features.
X_train3_text, X_test3_text, y_train3, y_test3 = train_test_split(
    X, y, test_size=0.2, random_state=42
)

third_model = TfidfVectorizer()
X_train3 = third_model.fit_transform(X_train3_text)
X_test3 = third_model.transform(X_test3_text)
# Full-corpus matrix, kept only so the name X3 stays defined; not used for training.
X3 = third_model.transform(X)

clf3 = LogisticRegression(max_iter=1000)
clf3.fit(X_train3, y_train3)
pred3 = clf3.predict(X_test3)
print("\n[Skema 3] Logistic Regression + TF-IDF")
print("Akurasi:", accuracy_score(y_test3, pred3))
print(classification_report(y_test3, pred3))


[Skema 3] Logistic Regression + TF-IDF
Akurasi: 0.9209486166007905
              precision    recall  f1-score   support

     negatif       0.00      0.00      0.00         9
      netral       0.90      0.99      0.94       495
     positif       0.97      0.82      0.89       255

    accuracy                           0.92       759
   macro avg       0.62      0.60      0.61       759
weighted avg       0.91      0.92      0.91       759



## Inference Testing

In [None]:
def predict_sentiment(text, vectorizer=None, classifier=None):
    """Predict the sentiment label of a single raw comment.

    Args:
        text: Raw comment; it is cleaned with ``preprocess_text`` first.
        vectorizer: Fitted vectorizer exposing ``transform``. Defaults to
            ``second_model`` (schema 2).
        classifier: Fitted classifier exposing ``predict``. Defaults to
            ``clf2`` (schema 2). Pass the classifier together with the
            vectorizer it was trained on.

    Returns:
        The first (only) element of the classifier's prediction.
    """
    # Defaults are resolved at call time so a re-fitted model is picked up.
    if vectorizer is None:
        vectorizer = second_model
    if classifier is None:
        classifier = clf2

    clean = preprocess_text(text)
    vec = vectorizer.transform([clean])
    pred = classifier.predict(vec)
    return pred[0]

# Inference test
komentar = "Wanna rock with Jennie"
print(f"Komentar: {komentar} → Sentimen: {predict_sentiment(komentar)}")

Komentar: Wanna rock with Jennie → Sentimen: netral


In [None]:
def predict_sentiment(text, vectorizer=None, classifier=None):
    """Predict the sentiment label of a single raw comment.

    Args:
        text: Raw comment; it is cleaned with ``preprocess_text`` first.
        vectorizer: Fitted vectorizer exposing ``transform``. Defaults to
            ``third_model`` (schema 3).
        classifier: Fitted classifier exposing ``predict``. Defaults to
            ``clf3`` (schema 3). Pass the classifier together with the
            vectorizer it was trained on.

    Returns:
        The first (only) element of the classifier's prediction.
    """
    # Defaults are resolved at call time so a re-fitted model is picked up.
    if vectorizer is None:
        vectorizer = third_model  # schema 3
    if classifier is None:
        classifier = clf3

    clean = preprocess_text(text)
    vec = vectorizer.transform([clean])
    pred = classifier.predict(vec)
    return pred[0]

# Inference test
komentar = "Thats, Special edition and your AI could'nt copy"
print(f"Komentar: {komentar} → Sentimen: {predict_sentiment(komentar)}")

Komentar: Thats, Special edition and your AI could'nt copy → Sentimen: netral


In [None]:
def predict_sentiment(text, vectorizer=None, classifier=None):
    """Predict the sentiment label of a single raw comment.

    Args:
        text: Raw comment; it is cleaned with ``preprocess_text`` first.
        vectorizer: Fitted vectorizer exposing ``transform``. Defaults to
            ``first_model`` (schema 1).
        classifier: Fitted classifier exposing ``predict``. Defaults to
            ``clf1`` (schema 1). Pass the classifier together with the
            vectorizer it was trained on.

    Returns:
        The first (only) element of the classifier's prediction.
    """
    # Defaults are resolved at call time so a re-fitted model is picked up.
    if vectorizer is None:
        vectorizer = first_model  # schema 1
    if classifier is None:
        classifier = clf1

    clean = preprocess_text(text)
    vec = vectorizer.transform([clean])
    pred = classifier.predict(vec)
    return pred[0]

# Inference test
komentar = "Jennie being Jennie"
print(f"Komentar: {komentar} → Sentimen: {predict_sentiment(komentar)}")

Komentar: Jennie being Jennie → Sentimen: netral
