In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack

In [31]:
def map_sentiment(sentiment):
    if sentiment == 1:
        return 'positif'
    elif sentiment == 0:
        return 'netral'
    else:
        return 'negatif'

In [32]:
def prepare_datasets(corpus, labels, test_data_proportion=0.3, random_state=42):
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
                                                        test_size=test_data_proportion,
                                                        random_state=random_state)
    return train_X, test_X, train_Y, test_Y

In [33]:
def tfidf_extractor(corpus, ngram_range=(1,2)):
    vectorizer = TfidfVectorizer(min_df=1,
                                norm='l2',
                                smooth_idf=True,
                                use_idf=True,
                                ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [34]:
df = pd.read_csv('sentimen_pemilu_manual_1500.csv')
df['sentimen'] = df['sentimen'].map(map_sentiment)
df.head(10)

Unnamed: 0,content,sentimen
0,kumaha barudak well yuk ramaikan pesta anak no...,positif
1,anies baswedan peringkat ketiga hasil survei a...,positif
2,anies baswedan muhaimin iskandar menyambangi k...,positif
3,cawapres koalisi perubahan muhaimin iskandar c...,positif
4,capres koalisi pdip ganjar pranowo khawatir su...,positif
5,capres ganjar pranowo mengaku khawatir preside...,positif
6,capres koalisi perubahan anies baswedan presid...,positif
7,capres koalisi perubahan anies baswedan mengak...,positif
8,cawapres koalisi indonesia maju kim gibran rak...,positif
9,capres koalisi perubahan anies baswedan menarg...,positif


In [35]:
# Split data
x_train, x_test, y_train, y_test = prepare_datasets(df['content'], df['sentimen'], test_data_proportion=0.3)

In [36]:
# Convert text labels to numerical labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Ensure the lengths match after splitting
y_train_encoded = y_train_encoded[:len(x_train)]
y_test_encoded = y_test_encoded[:len(x_test)]

In [37]:
# TF-IDF Vectorization
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(x_train)
tfidf_test_features = tfidf_vectorizer.transform(x_test)

In [38]:
# Bag of Words (BoW) features
count_vectorizer, count_train_features = tfidf_extractor(x_train, ngram_range=(1, 1))  # Unigram features
count_test_features = count_vectorizer.transform(x_test)

In [39]:
# Combine TF-IDF and BoW features
combined_train_features = hstack([tfidf_train_features, count_train_features])
combined_test_features = hstack([tfidf_test_features, count_test_features])

### Metode TF-IDF

Naiye Bayes

In [40]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

In [41]:
# Define function to train and evaluate Naive Bayes classifier
def train_and_evaluate(classifier, train_features, train_labels, test_features, test_labels):
    # Train the classifier
    classifier.fit(train_features, train_labels)
    
    # Predictions
    predictions = classifier.predict(test_features)
    
    # Calculate accuracy
    accuracy = accuracy_score(test_labels, predictions)
    
    # Calculate F1 score
    f1 = f1_score(test_labels, predictions, average='weighted')
    
    return accuracy, f1

In [42]:
model_naiye_parameter_td_idf = MultinomialNB(alpha= 0.1,fit_prior=False)
# Train and evaluate on TF-IDF Vectorization
tfidf_accuracy, tfidf_f1 = train_and_evaluate(model_naiye_parameter_td_idf , tfidf_train_features, y_train_encoded, tfidf_test_features, y_test_encoded)
# Print results
print("TF-IDF Vectorization Accuracy: {:.2f}%".format(tfidf_accuracy * 100))
print("TF-IDF Vectorization F1 Score: {:.2f}%".format(tfidf_f1 * 100))

TF-IDF Vectorization Accuracy: 64.44%
TF-IDF Vectorization F1 Score: 64.03%


SVM

In [43]:
from sklearn.svm import SVC

In [44]:
# Initialize SVM
model_svm_parameter_tf_id =  SVC(C=10, gamma=0.1, kernel='sigmoid')

# Train and evaluate on TF-IDF Vectorization
tfidf_accuracy, tfidf_f1 = train_and_evaluate(model_svm_parameter_tf_id , tfidf_train_features, y_train_encoded, tfidf_test_features, y_test_encoded)
# Print results
print("TF-IDF Vectorization Accuracy: {:.2f}%".format(tfidf_accuracy * 100))
print("TF-IDF Vectorization F1 Score: {:.2f}%".format(tfidf_f1 * 100))

TF-IDF Vectorization Accuracy: 72.67%
TF-IDF Vectorization F1 Score: 72.76%


In [45]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score, f1_score

# # Define the parameter grid
# param_grid = {
#     'C': [0.1, 1, 10, 100],
#     'gamma': [1, 0.1, 0.01, 0.001],
#     'kernel': ['rbf', 'linear', 'sigmoid']
# }

# # Initialize the SVM model
# svm_model = SVC()

# # Set up the grid search with cross-validation
# grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy')

# # Fit the model to the training data
# grid_search.fit(tfidf_train_features, y_train_encoded)

# # Get the best parameters
# best_params = grid_search.best_params_
# print("Best parameters found: ", best_params)

# # Train the best model on the entire training data
# best_model = grid_search.best_estimator_

# # Evaluate the best model on the test data
# y_pred = best_model.predict(tfidf_test_features)
# tfidf_accuracy = accuracy_score(y_test_encoded, y_pred)
# tfidf_f1 = f1_score(y_test_encoded, y_pred, average='weighted')

# # Print results
# print("TF-IDF Vectorization Accuracy: {:.2f}%".format(tfidf_accuracy * 100))
# print("TF-IDF Vectorization F1 Score: {:.2f}%".format(tfidf_f1 * 100))


Linear Regission

In [46]:
from sklearn.linear_model import LogisticRegression


In [49]:
# Initialize Naive Bayes classifier
model_lr_tanpa_parameter_tf_idf = LogisticRegression(random_state=42,max_iter=1000)

# Train and evaluate on TF-IDF Vectorization
tfidf_accuracy, tfidf_f1 = train_and_evaluate(model_lr_tanpa_parameter_tf_idf , tfidf_train_features, y_train_encoded, tfidf_test_features, y_test_encoded)
# Print results
print("TF-IDF Vectorization Accuracy: {:.2f}%".format(tfidf_accuracy * 100))
print("TF-IDF Vectorization F1 Score: {:.2f}%".format(tfidf_f1 * 100))

TF-IDF Vectorization Accuracy: 70.89%
TF-IDF Vectorization F1 Score: 70.95%


Random Forest

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
# Initialize SVM
model_rfc_parameter =  RandomForestClassifier(min_samples_split=10,min_samples_leaf=2,n_estimators=150, random_state=42)

# Train and evaluate on TF-IDF Vectorization
tfidf_accuracy, tfidf_f1 = train_and_evaluate(model_rfc_parameter , tfidf_train_features, y_train_encoded, tfidf_test_features, y_test_encoded)

# Print results
print("TF-IDF Vectorization Accuracy: {:.2f}%".format(tfidf_accuracy * 100))
print("TF-IDF Vectorization F1 Score: {:.2f}%".format(tfidf_f1 * 100))


TF-IDF Vectorization Accuracy: 67.11%
TF-IDF Vectorization F1 Score: 67.11%


### Metode BoW

Naiye Bayes

In [52]:
# Initialize Naive Bayes classifier
model_naiye_parameter_bow = MultinomialNB(alpha= 0.1,fit_prior=True)

# Train and evaluate on Bag of Words (BoW) features
bow_accuracy, bow_f1 = train_and_evaluate(model_naiye_parameter_bow , count_train_features, y_train_encoded, count_test_features, y_test_encoded)

print("Bag of Words (BoW) Accuracy: {:.2f}%".format(bow_accuracy * 100))
print("Bag of Words (BoW) F1 Score: {:.2f}%".format(bow_f1 * 100))

Bag of Words (BoW) Accuracy: 61.33%
Bag of Words (BoW) F1 Score: 60.81%


SVM

In [53]:
# Initialize SVM
model_svm_parameter_bow =  SVC(C=10, gamma=0.1, kernel='sigmoid')


# Train and evaluate on Bag of Words (BoW) features
bow_accuracy, bow_f1 = train_and_evaluate(model_svm_parameter_bow , count_train_features, y_train_encoded, count_test_features, y_test_encoded)



print("Bag of Words (BoW) Accuracy: {:.2f}%".format(bow_accuracy * 100))
print("Bag of Words (BoW) F1 Score: {:.2f}%".format(bow_f1 * 100))

Bag of Words (BoW) Accuracy: 71.78%
Bag of Words (BoW) F1 Score: 71.83%


Linear Regission

In [54]:
# Initialize Naive Bayes classifier
model_lr_parameter_bow = LogisticRegression(C=1,penalty="l2",random_state=42,max_iter=1000)



# Train and evaluate on Bag of Words (BoW) features
bow_accuracy, bow_f1 = train_and_evaluate(model_lr_parameter_bow , count_train_features, y_train_encoded, count_test_features, y_test_encoded)

print("Bag of Words (BoW) Accuracy: {:.2f}%".format(bow_accuracy * 100))
print("Bag of Words (BoW) F1 Score: {:.2f}%".format(bow_f1 * 100))

Bag of Words (BoW) Accuracy: 71.33%
Bag of Words (BoW) F1 Score: 71.36%


Random Forest

In [55]:
# Initialize SVM
model_rfc_parameter_bow =  RandomForestClassifier(min_samples_split=10,min_samples_leaf=2,n_estimators=150, random_state=42)

# Train and evaluate on Bag of Words (BoW) features
bow_accuracy, bow_f1 = train_and_evaluate(model_rfc_parameter_bow , count_train_features, y_train_encoded, count_test_features, y_test_encoded)

print("Bag of Words (BoW) Accuracy: {:.2f}%".format(bow_accuracy * 100))
print("Bag of Words (BoW) F1 Score: {:.2f}%".format(bow_f1 * 100))


Bag of Words (BoW) Accuracy: 69.11%
Bag of Words (BoW) F1 Score: 69.14%


### Metode Combined

Naiye Bayes

In [58]:
# Initialize Naive Bayes classifier
model_naiye_parameter_combined = MultinomialNB(alpha= 0.1,fit_prior=True)



# Train and evaluate on Combined TF-IDF and BoW features
combined_accuracy, combined_f1 = train_and_evaluate(model_naiye_parameter_combined , combined_train_features, y_train_encoded, combined_test_features, y_test_encoded)

print("Combined TF-IDF and BoW Accuracy: {:.2f}%".format(combined_accuracy * 100))
print("Combined TF-IDF and BoW F1 Score: {:.2f}%".format(combined_f1 * 100))

Combined TF-IDF and BoW Accuracy: 63.78%
Combined TF-IDF and BoW F1 Score: 63.11%


SVM

In [59]:
# Initialize SVM
model_svm_parameter_combined =  SVC(C=10, gamma=0.1, kernel='sigmoid')



# Train and evaluate on Combined TF-IDF and BoW features
combined_accuracy, combined_f1 = train_and_evaluate(model_svm_parameter_combined , combined_train_features, y_train_encoded, combined_test_features, y_test_encoded)


print("Combined TF-IDF and BoW Accuracy: {:.2f}%".format(combined_accuracy * 100))
print("Combined TF-IDF and BoW F1 Score: {:.2f}%".format(combined_f1 * 100))

Combined TF-IDF and BoW Accuracy: 73.11%
Combined TF-IDF and BoW F1 Score: 73.15%


Linear Regission

Random Forest