### Model Naiye Bayes Percobaan

In [164]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn import metrics

df=pd.read_csv('pre_title.csv')
df.head(10)

Unnamed: 0,title,url_berita,sentimen
0,strategi sleman dorong ekonomi pakai batik int...,https://20.detik.com/blak-blakan/20231204-2312...,1
1,greysia polii ajak anak muda terus melangkah r...,https://20.detik.com/demi-indonesia/20231027-2...,1
2,pan x jakcloth pesta anak nongkrong sambangi b...,https://20.detik.com/advertorial/20231109-2311...,1
3,arahan megawati rakernas iv pdip turun rakyatm...,https://20.detik.com/detikupdate/20231001-2310...,1
4,erick thohir ajak muda tak mager indonesia leb...,https://20.detik.com/demi-indonesia/20231027-2...,1
5,pdip sebut ganjar muliakan petani nelayan jadi...,https://20.detik.com/detikupdate/20231001-2310...,1
6,bertemu habib rizieq cak imin bantah bahas duk...,https://20.detik.com/detikupdate/20231001-2310...,1
7,kades bandung mundur dukung amin anies salut,https://20.detik.com/detikupdate/20231001-2310...,1
8,gestur prabowo ditanya soal kans berduet ganjar,https://20.detik.com/detikupdate/20231001-2310...,1
9,anies bertemu alumni itb bahas metode ilmiah m...,https://20.detik.com/detikupdate/20231001-2310...,1


In [165]:
def map_sentiment(sentiment):
    if sentiment == 1:
        return 'positif'
    elif sentiment == 0:
        return 'netral'
    else:
        return 'negatif'

In [166]:
df['sentimen'] = df['sentimen'].map(map_sentiment)

In [167]:
X = df['title']
y = df['sentimen']

In [168]:
def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
    test_size=0.33,random_state=42)
    return train_X, test_X, train_Y, test_Y

norm_train_corpus, norm_test_corpus, train_labels, test_labels = prepare_datasets(X,y,test_data_proportion=0.3)

In [169]:
#Fungsi untuk mengekstraksi feature menggunakan TF-IDF Model
def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2',
                                    smooth_idf=True,
                                    use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix
def tfidf_extractor(corpus, ngram_range=(1,2)):
    vectorizer = TfidfVectorizer(min_df=1,
                                norm='l2',
                                smooth_idf=True,
                                use_idf=True,
                                ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [170]:
#mengunakan model TF-IDF untuk mengekstraksi feature
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

In [171]:
# Inisialisasi dan latih model Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(tfidf_train_features, train_labels)


In [172]:
predicted_labels = nb_classifier.predict(tfidf_test_features)

In [173]:
accuracy = accuracy_score(test_labels, predicted_labels)
print("Akurasi Model:", accuracy)

Akurasi Model: 0.6628282828282829


In [174]:
from sklearn.metrics import classification_report
print(classification_report(test_labels, predicted_labels))

              precision    recall  f1-score   support

     negatif       0.64      0.66      0.65      1637
      netral       0.69      0.60      0.64      1645
     positif       0.66      0.73      0.69      1668

    accuracy                           0.66      4950
   macro avg       0.66      0.66      0.66      4950
weighted avg       0.66      0.66      0.66      4950



In [175]:
test_titles = [
    'Jeka Saragih On Fire Menatap Debut di UFC',
    'Usai Golkar Dukung Prabowo-Gibran, Airlangga Bertemu Jokowi di Istana',
    'Usai Diperiksa, Firli Bicara Soal Karier di Polri Hingga Agus Rahardjo'
]

# Transformasi judul-judul menjadi representasi fitur menggunakan TfidfVectorizer
test_features = tfidf_vectorizer.transform(test_titles)

# Prediksi sentimen untuk judul-judul tersebut menggunakan model Naive Bayes
test_results = nb_classifier.predict(test_features)

# Konversi label numerik menjadi label teks
test_sentiments = [map_sentiment(sentiment) for sentiment in test_results]

# Menampilkan hasil prediksi untuk setiap judul
for title, sentiment in zip(test_titles, test_sentiments):
    print("Judul:", title)
    print("Sentimen:", sentiment)
    print()

Judul: Jeka Saragih On Fire Menatap Debut di UFC
Sentimen: negatif

Judul: Usai Golkar Dukung Prabowo-Gibran, Airlangga Bertemu Jokowi di Istana
Sentimen: negatif

Judul: Usai Diperiksa, Firli Bicara Soal Karier di Polri Hingga Agus Rahardjo
Sentimen: negatif



### Tambahan jika GridSearch

In [176]:
from sklearn.model_selection import GridSearchCV

In [177]:
nb_classifier = MultinomialNB()
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
    }
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

In [178]:
grid_search.fit(tfidf_train_features, train_labels)

In [179]:
# Cetak hyperparameter terbaik yang ditemukan
print("Hyperparameter Terbaik:", grid_search.best_params_)

# Cetak akurasi model terbaik
print("Akurasi Model Terbaik:", grid_search.best_score_)

Hyperparameter Terbaik: {'alpha': 0.5}
Akurasi Model Terbaik: 0.6462686567164179


In [180]:
# Prediksi label pada data pengujian menggunakan model terbaik
best_nb_classifier = grid_search.best_estimator_
predicted_labels = best_nb_classifier.predict(tfidf_test_features)

# Evaluasi akurasi pada data pengujian
accuracy = accuracy_score(test_labels, predicted_labels)
print("Akurasi pada Data Pengujian:", accuracy)

Akurasi pada Data Pengujian: 0.6620202020202021


### Random Search CV

In [181]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

In [182]:
# Inisialisasi model Naive Bayes
nb_classifier = MultinomialNB()

# Daftar hyperparameter yang ingin disetel beserta distribusi nilai acaknya
param_dist = {'alpha': uniform(0, 10)}

In [183]:
# Inisialisasi RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=nb_classifier, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', random_state=42)

# Lakukan pencarian random
random_search.fit(tfidf_train_features, train_labels)

In [184]:
# Cetak hyperparameter terbaik yang ditemukan
print("Hyperparameter Terbaik:", random_search.best_params_)

# Cetak akurasi model terbaik
print("Akurasi Model Terbaik:", random_search.best_score_)


Hyperparameter Terbaik: {'alpha': 0.6505159298527952}
Akurasi Model Terbaik: 0.6469651741293533


In [185]:
# Prediksi label pada data pengujian menggunakan model terbaik
best_nb_classifier = random_search.best_estimator_
predicted_labels = best_nb_classifier.predict(tfidf_test_features)

# Evaluasi akurasi pada data pengujian
accuracy = accuracy_score(test_labels, predicted_labels)
print("Akurasi pada Data Pengujian:", accuracy)

Akurasi pada Data Pengujian: 0.6616161616161617
