In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack

In [18]:
def tfidf_extractor(corpus, ngram_range=(1,2)):
    """Fit a TF-IDF vectorizer on `corpus` and return it with the fitted features.

    Args:
        corpus: iterable of raw text documents handed to ``fit_transform``.
        ngram_range: ``(min_n, max_n)`` n-gram sizes to extract; defaults to
            unigrams plus bigrams.

    Returns:
        A ``(vectorizer, features)`` pair: the fitted ``TfidfVectorizer`` and
        the matrix produced by ``fit_transform(corpus)``.
    """
    # Keep every term (min_df=1), L2-normalise each row, and use smoothed IDF weighting.
    options = dict(
        min_df=1,
        norm='l2',
        smooth_idf=True,
        use_idf=True,
        ngram_range=ngram_range,
    )
    tfidf = TfidfVectorizer(**options)
    matrix = tfidf.fit_transform(corpus)
    return tfidf, matrix

In [19]:
def prepare_datasets(corpus, labels, test_data_proportion=0.3, random_state=42, stratify=None):
    """Split a corpus and its labels into train and test partitions.

    Args:
        corpus: documents to split.
        labels: labels aligned one-to-one with ``corpus``.
        test_data_proportion: value forwarded as ``test_size`` to
            ``train_test_split``.
        random_state: seed forwarded to ``train_test_split`` so the split is
            reproducible.
        stratify: optional array-like forwarded to ``train_test_split``; pass
            the labels to keep class proportions equal in both partitions.
            The default ``None`` keeps the previous (non-stratified) behaviour.

    Returns:
        Tuple ``(train_X, test_X, train_Y, test_Y)``.
    """
    train_X, test_X, train_Y, test_Y = train_test_split(
        corpus,
        labels,
        test_size=test_data_proportion,
        random_state=random_state,
        stratify=stratify,
    )
    return train_X, test_X, train_Y, test_Y

In [20]:
# Read the sentiment CSV into a DataFrame and preview its first ten rows
DATA_PATH = 'sentiment_pemilu_otomatis_1500.csv'
df = pd.read_csv(DATA_PATH)
df.head(10)

Unnamed: 0,content,sentimen
0,presiden joko widodo jokowi korupsi kejahatan ...,negatif
1,wali kota depok mohammad idris menanggapi nyin...,negatif
2,komika lampung aulia rakhman viral media sosia...,negatif
3,unggahan berisi daftar afiliasi politik pegawa...,negatif
4,menko polhukam mahfud md merespons pernyataan ...,negatif
5,calon wakil presiden cawapres nomor urut 1 muh...,negatif
6,capres ganjar pranowo sikap presiden joko wido...,negatif
7,menteri investasi bahlil lahadalia bicara isu ...,negatif
8,massa buruh buka suara memblokade tol cipulara...,negatif
9,raut kecewa terpancar wajah ridwan nasution 43...,negatif


In [21]:
# Split data: hold out 30% of the rows as the test partition
texts = df['content']
sentiments = df['sentimen']
x_train, x_test, y_train, y_test = prepare_datasets(
    texts,
    sentiments,
    test_data_proportion=0.3,
)

In [22]:
# Convert text labels to numerical labels.
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share one label -> integer mapping.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Verify that features and labels are still aligned after splitting.
# (The previous slicing to len(x_*) could never change a correctly split array
# and would have silently hidden a genuine length mismatch.)
assert len(y_train_encoded) == len(x_train), "train texts and labels are misaligned"
assert len(y_test_encoded) == len(x_test), "test texts and labels are misaligned"

In [23]:
# TF-IDF vectorization over unigrams and bigrams.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(x_train, ngram_range=(1, 2))
tfidf_test_features = tfidf_vectorizer.transform(x_test)

In [24]:
# "Bag of Words" (BoW) features
# NOTE(review): despite the BoW/count naming, tfidf_extractor builds a
# TfidfVectorizer, so these are TF-IDF-weighted unigram features rather than raw
# term counts. Switching to real counts would change the reported metrics —
# confirm the intent before changing the code.
count_vectorizer, count_train_features = tfidf_extractor(x_train, ngram_range=(1, 1))  # unigrams only
count_test_features = count_vectorizer.transform(x_test)

In [25]:
# Combine both feature sets by stacking their matrices horizontally.
# Train and test use the same block order so the columns line up.
train_blocks = [tfidf_train_features, count_train_features]
test_blocks = [tfidf_test_features, count_test_features]
combined_train_features = hstack(train_blocks)
combined_test_features = hstack(test_blocks)

In [26]:
from sklearn.svm import SVC

In [27]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

In [28]:
# Train any classifier exposing fit/predict and score it on held-out data
def train_and_evaluate(classifier, train_features, train_labels, test_features, test_labels):
    """Fit `classifier` on the training data and score it on the test data.

    Args:
        classifier: object providing ``fit(features, labels)`` and
            ``predict(features)``; it is fitted in place.
        train_features: features used for fitting.
        train_labels: labels used for fitting.
        test_features: features to predict on.
        test_labels: ground-truth labels compared against the predictions.

    Returns:
        Tuple ``(accuracy, f1)`` where ``f1`` is the weighted-average F1 score.
    """
    classifier.fit(train_features, train_labels)
    predicted = classifier.predict(test_features)
    return (
        accuracy_score(test_labels, predicted),
        f1_score(test_labels, predicted, average='weighted'),
    )

In [30]:
# Initialize the SVM (sigmoid kernel, C=10, gamma=0.1)
model_svm_parameter = SVC(kernel='sigmoid', C=10, gamma=0.1)

# Fit on the combined training features and score on the combined test features
combined_accuracy, combined_f1 = train_and_evaluate(
    classifier=model_svm_parameter,
    train_features=combined_train_features,
    train_labels=y_train_encoded,
    test_features=combined_test_features,
    test_labels=y_test_encoded,
)

# Report both metrics as percentages
accuracy_pct = combined_accuracy * 100
f1_pct = combined_f1 * 100
print("Combined TF-IDF and BoW Accuracy: {:.2f}%".format(accuracy_pct))
print("Combined TF-IDF and BoW F1 Score: {:.2f}%".format(f1_pct))

Combined TF-IDF and BoW Accuracy: 90.89%
Combined TF-IDF and BoW F1 Score: 90.86%


In [31]:
import joblib

# Bundle the trained classifier with every fitted transformer needed at
# inference time, so the loading cell can rebuild the same feature pipeline.
# This cell must run before the loading cell: it creates the file that cell reads.
model_data = {
    'model': model_svm_parameter,  # the trained SVC (not a Logistic Regression model)
    'tfidf_vectorizer': tfidf_vectorizer,
    'count_vectorizer': count_vectorizer,
    'label_encoder': label_encoder
}

# Save the model data to a file; joblib.dump returns the list of filenames written
joblib.dump(model_data, 'model_sentimen_svm.pkl')

['model_sentimen_svm.pkl']

In [32]:
import joblib

# Load the saved model bundle.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load bundles from a trusted source.
loaded_model_data = joblib.load('model_sentimen_svm.pkl')

# Pull the individual components out of the bundle by key.
# NOTE(review): `lr_model` holds whatever was stored under 'model'; the name
# suggests logistic regression, but the notebook trains an SVC — confirm.
lr_model, tfidf_vectorizer, count_vectorizer, label_encoder = (
    loaded_model_data[key]
    for key in ('model', 'tfidf_vectorizer', 'count_vectorizer', 'label_encoder')
)


In [34]:
def preprocess_new_data(new_data):
    """Vectorize `new_data` with both loaded vectorizers and join the results.

    Args:
        new_data: iterable of text documents accepted by the vectorizers'
            ``transform`` method.

    Returns:
        The horizontally stacked matrix: ``tfidf_vectorizer`` columns first,
        then ``count_vectorizer`` columns.
    """
    # Apply the vectorizers in the same order that was used for the training features.
    feature_blocks = [
        vectorizer.transform(new_data)
        for vectorizer in (tfidf_vectorizer, count_vectorizer)
    ]
    return hstack(feature_blocks)

def predict_sentiment(new_data):
    """Predict sentiment labels for `new_data` with the loaded model.

    Args:
        new_data: iterable of text documents, forwarded to
            ``preprocess_new_data``.

    Returns:
        The result of ``label_encoder.inverse_transform`` applied to the
        model's predictions, i.e. the original text labels.
    """
    # Featurize, classify, then map the numeric classes back to their text labels.
    encoded_labels = lr_model.predict(preprocess_new_data(new_data))
    return label_encoder.inverse_transform(encoded_labels)

# Example new data: one short text to run through the loaded pipeline
new_texts = [
    "Sedang uji coba kritik",
]

# Predict one label per input text
predictions = predict_sentiment(new_texts)

# Show each text next to its predicted sentiment
for text, prediction in zip(new_texts, predictions):
    report_line = f"Text: {text} -> Sentiment: {prediction}"
    print(report_line)


Text: Sedang uji coba kritik -> Sentiment: negatif
