In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def prepare_datasets(corpus, labels, test_data_proportion=0.3, random_state=42, stratify=None):
    """Split a corpus and its labels into train and test partitions.

    Thin wrapper around ``train_test_split``.

    Parameters
    ----------
    corpus : array-like
        The documents to split.
    labels : array-like
        The label belonging to each document, in the same order as ``corpus``.
    test_data_proportion : float, default 0.3
        Forwarded as ``test_size``.
    random_state : int, default 42
        Forwarded as ``random_state`` so the split is reproducible.
    stratify : array-like, optional
        Forwarded as ``stratify``. Pass ``labels`` to keep the class
        proportions the same in both partitions. The default ``None`` keeps
        the plain random split this function has always performed.

    Returns
    -------
    tuple
        ``(train_X, test_X, train_Y, test_Y)``.
    """
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
                                                        test_size=test_data_proportion,
                                                        random_state=random_state,
                                                        stratify=stratify)
    return train_X, test_X, train_Y, test_Y
    
def tfidf_extractor(corpus, ngram_range=(1,2)):
    """Fit a TF-IDF vectorizer on ``corpus`` and vectorize it in one pass.

    Parameters
    ----------
    corpus : iterable of str
        Documents used both to learn the vocabulary/IDF weights and to
        produce the returned matrix.
    ngram_range : tuple of (int, int), default (1, 2)
        Forwarded to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(vectorizer, features)``: the fitted ``TfidfVectorizer`` and the
        matrix returned by its ``fit_transform`` call.
    """
    tfidf_options = {
        'min_df': 1,
        'norm': 'l2',
        'smooth_idf': True,
        'use_idf': True,
        'ngram_range': ngram_range,
    }
    fitted_vectorizer = TfidfVectorizer(**tfidf_options)
    document_term_matrix = fitted_vectorizer.fit_transform(corpus)
    return fitted_vectorizer, document_term_matrix

def bow_extractor(corpus, ngram_range=(1,1)):
    """Fit a bag-of-words (raw count) vectorizer on ``corpus`` and vectorize it.

    Parameters
    ----------
    corpus : iterable of str
        Documents used both to learn the vocabulary and to produce the
        returned matrix.
    ngram_range : tuple of (int, int), default (1, 1)
        Forwarded to ``CountVectorizer``.

    Returns
    -------
    tuple
        ``(vectorizer, features)``: the fitted ``CountVectorizer`` and the
        matrix returned by its ``fit_transform`` call.
    """
    count_options = {'min_df': 1, 'ngram_range': ngram_range}
    fitted_vectorizer = CountVectorizer(**count_options)
    term_count_matrix = fitted_vectorizer.fit_transform(corpus)
    return fitted_vectorizer, term_count_matrix

# Load the labelled dataset; the 'content' column is used as the text and the
# 'sentimen' column as the class label below.
df = pd.read_csv('sentiment_otomatis_3000.csv')

# Split data (30% held out for testing; the seed comes from prepare_datasets' default)
x_train, x_test, y_train, y_test = prepare_datasets(df['content'], df['sentimen'], test_data_proportion=0.3)

# Convert text labels to numerical labels.
# The encoder is fitted on the training labels only, so transform() raises if
# the test split contains a label that never occurs in training.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Features and labels come out of the same split, so their lengths must agree.
# Fail loudly if they do not: truncating the label arrays would silently pair
# documents with the wrong labels instead of fixing anything.
assert len(y_train_encoded) == len(x_train), "train features/labels are misaligned"
assert len(y_test_encoded) == len(x_test), "test features/labels are misaligned"

# TF-IDF Vectorization (vocabulary and IDF weights learned from the training split only)
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(x_train)
tfidf_test_features = tfidf_vectorizer.transform(x_test)

# Bag of Words (BoW) features (vocabulary learned from the training split only)
count_vectorizer, count_train_features = bow_extractor(x_train)
count_test_features = count_vectorizer.transform(x_test)

# Combine TF-IDF and BoW features column-wise: TF-IDF columns first, then BoW.
# Any later prediction must stack the two blocks in this same order.
combined_train_features = hstack([tfidf_train_features, count_train_features])
combined_test_features = hstack([tfidf_test_features, count_test_features])

# Train a classifier and evaluate it on a held-out set
def train_and_evaluate(classifier, train_features, train_labels, test_features, test_labels, target_names=None):
    """Fit ``classifier`` on the training data and score it on the test data.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    train_features, train_labels
        Training matrix and its labels, passed to ``classifier.fit``.
    test_features, test_labels
        Held-out matrix and its true labels.
    target_names : sequence of str, optional
        Display names used in the classification report, one per class in
        the order of ``classifier.classes_``. Defaults to the notebook-level
        ``label_encoder.classes_``.

    Returns
    -------
    tuple
        ``(accuracy_percent, weighted_f1_percent, report, matrix)`` where
        ``report`` is the text from ``classification_report`` and ``matrix``
        is the result of ``confusion_matrix``.
    """
    # Train the classifier
    classifier.fit(train_features, train_labels)

    # Predictions
    predictions = classifier.predict(test_features)

    # Calculate accuracy
    accuracy = accuracy_score(test_labels, predictions)

    # Calculate F1 score
    f1 = f1_score(test_labels, predictions, average='weighted')

    if target_names is None:
        # Backward-compatible default: names from the notebook-level encoder.
        target_names = label_encoder.classes_

    # Report on every class the classifier was trained on. Without an explicit
    # `labels`, classification_report infers the classes from the test labels
    # and predictions and raises ValueError when that count differs from
    # len(target_names), e.g. when a class is missing from the test split.
    # Falls back to None (the original behaviour) for estimators that do not
    # expose `classes_`.
    class_labels = getattr(classifier, 'classes_', None)

    # Classification report
    report = classification_report(test_labels, predictions,
                                   labels=class_labels,
                                   target_names=target_names)

    # Confusion matrix (same class order as the report)
    matrix = confusion_matrix(test_labels, predictions, labels=class_labels)

    return accuracy * 100, f1 * 100, report, matrix

# Fit a logistic regression on the stacked TF-IDF + BoW matrix and score it
# on the held-out split.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
(combined_accuracy,
 combined_f1,
 combined_report,
 combined_matrix) = train_and_evaluate(
    lr_model,
    combined_train_features,
    y_train_encoded,
    combined_test_features,
    y_test_encoded,
)

# Summary of the held-out metrics (accuracy and F1 are already percentages).
print("Combined TF-IDF and BoW Features:")
print("Accuracy: {:.2f}%".format(combined_accuracy))
print("F1 Score: {:.2f}%".format(combined_f1))
print("Classification Report:\n", combined_report)
print("Confusion Matrix:\n", combined_matrix)




Combined TF-IDF and BoW Features:
Accuracy: 92.11%
F1 Score: 92.09%
Classification Report:
               precision    recall  f1-score   support

     negatif       0.93      0.89      0.91       313
      netral       0.89      0.98      0.93       291
     positif       0.96      0.90      0.92       296

    accuracy                           0.92       900
   macro avg       0.92      0.92      0.92       900
weighted avg       0.92      0.92      0.92       900

Confusion Matrix:
 [[279  22  12]
 [  6 285   0]
 [ 16  15 265]]


In [2]:
# Function to predict sentiment probabilities for new text
def predict_sentiment(text):
    """Return the per-class sentiment probabilities for a single text.

    Uses the notebook-level ``tfidf_vectorizer``, ``count_vectorizer``,
    ``lr_model`` and ``label_encoder`` fitted in the training cell.

    Parameters
    ----------
    text : str
        The document to score.

    Returns
    -------
    dict
        Maps each sentiment label to its probability formatted as a
        percentage string with two decimals, e.g. ``'82.89%'``.
    """
    documents = [text]

    # Stack TF-IDF columns first, then BoW, matching the layout used for training.
    combined_features = hstack([
        tfidf_vectorizer.transform(documents),
        count_vectorizer.transform(documents),
    ])

    # Single document, so take the only row.
    probabilities = lr_model.predict_proba(combined_features)[0]

    # predict_proba columns follow lr_model.classes_ (the encoded labels the
    # model was trained on). Decode those instead of assuming they line up
    # position-by-position with label_encoder.classes_.
    class_names = label_encoder.inverse_transform(lr_model.classes_)

    # Convert probabilities to percentages and map to sentiment labels
    return {label: f"{prob * 100:.2f}%" for label, prob in zip(class_names, probabilities)}

# Example usage: score one sample sentence with predict_sentiment and print
# the returned label -> percentage mapping.
new_text = "Dalam Kejadian pembunuhan pada hari ini banyak sekali bukti dan korban yang membuat kesedihan dari banyak orang"
sentiment_probabilities = predict_sentiment(new_text)
print("Sentiment Probabilities (in percentages):", sentiment_probabilities)


Sentiment Probabilities (in percentages): {'negatif': '15.90%', 'netral': '82.89%', 'positif': '1.20%'}


In [None]:
import joblib

# Bundle the classifier with the fitted vectorizers and label encoder so that
# one file holds everything needed to score new text.
model_data = dict(
    model=lr_model,  # trained Logistic Regression model
    tfidf_vectorizer=tfidf_vectorizer,
    count_vectorizer=count_vectorizer,
    label_encoder=label_encoder,
)

# Persist the bundle to disk
joblib.dump(model_data, 'model_sentimen_lr.pkl')

# Function to predict sentiment probabilities for new text using the saved model
def predict_sentiment(text, model_path='model_sentimen_lr.pkl'):
    """Return the per-class sentiment probabilities for a single text,
    using the model bundle stored at ``model_path``.

    The bundle is re-read from disk on every call, so a file rewritten
    between calls is picked up.

    Parameters
    ----------
    text : str
        The document to score.
    model_path : str, default 'model_sentimen_lr.pkl'
        Path to a joblib file holding a dict with the keys ``'model'``,
        ``'tfidf_vectorizer'``, ``'count_vectorizer'`` and ``'label_encoder'``.

    Returns
    -------
    dict
        Maps each sentiment label to its probability formatted as a
        percentage string with two decimals.
    """
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load bundles that you created or otherwise trust.
    bundle = joblib.load(model_path)

    # Distinct local names so the notebook-level objects are not shadowed.
    saved_model = bundle['model']
    saved_tfidf = bundle['tfidf_vectorizer']
    saved_bow = bundle['count_vectorizer']
    saved_encoder = bundle['label_encoder']

    documents = [text]

    # Stack TF-IDF columns first, then BoW, matching the layout used for training.
    combined_features = hstack([
        saved_tfidf.transform(documents),
        saved_bow.transform(documents),
    ])

    # Single document, so take the only row.
    probabilities = saved_model.predict_proba(combined_features)[0]

    # predict_proba columns follow saved_model.classes_ (the encoded labels
    # the model was trained on). Decode those instead of assuming they line up
    # position-by-position with the encoder's classes_.
    class_names = saved_encoder.inverse_transform(saved_model.classes_)

    # Convert probabilities to percentages and map to sentiment labels
    return {label: f"{prob * 100:.2f}%" for label, prob in zip(class_names, probabilities)}

# Example usage: score one sample sentence through the saved-model variant of
# predict_sentiment (default model_path) and print the returned mapping.
new_text = "Dalam Kejadian pembunuhan pada hari ini banyak sekali bukti dan korban yang membuat kesedihan dari banyak orang"
sentiment_probabilities = predict_sentiment(new_text)
print("Sentiment Probabilities (in percentages):", sentiment_probabilities)


In [None]:
import joblib
from scipy.sparse import hstack

# Function to predict sentiment probabilities for new text data using the saved model
def predict_sentiment_probabilities(new_data, model_path='model_sentimen_lr.pkl'):
    """Print the per-class sentiment probabilities for each text in ``new_data``.

    Parameters
    ----------
    new_data : str or iterable of str
        The documents to score. A single string is treated as one document.
        Any iterable (including a one-shot generator) is accepted.
    model_path : str, default 'model_sentimen_lr.pkl'
        Path to a joblib file holding a dict with the keys ``'model'``,
        ``'tfidf_vectorizer'``, ``'count_vectorizer'`` and ``'label_encoder'``.

    Returns
    -------
    None
        Results are printed, one line per text. Nothing is printed (and the
        model file is not read) when ``new_data`` is empty.
    """
    # Materialize the input once: it is iterated three times below (two
    # vectorizers plus the print loop), which would exhaust a generator after
    # the first pass. A bare string is wrapped so it counts as one document
    # rather than being rejected by the vectorizers.
    if isinstance(new_data, str):
        texts = [new_data]
    else:
        texts = list(new_data)

    # Nothing to score; predict_proba would reject an empty matrix.
    if not texts:
        return

    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load bundles that you created or otherwise trust.
    loaded_model_data = joblib.load(model_path)
    loaded_model = loaded_model_data['model']
    loaded_tfidf_vectorizer = loaded_model_data['tfidf_vectorizer']
    loaded_count_vectorizer = loaded_model_data['count_vectorizer']
    loaded_label_encoder = loaded_model_data['label_encoder']

    # Transform the new data using both vectorizers
    tfidf_features = loaded_tfidf_vectorizer.transform(texts)
    bow_features = loaded_count_vectorizer.transform(texts)

    # Combine features: TF-IDF columns first, then BoW, as during training
    combined_features = hstack([tfidf_features, bow_features])

    # Predict probabilities for each sentiment class using the loaded model
    probabilities = loaded_model.predict_proba(combined_features)

    # predict_proba columns follow loaded_model.classes_ (the encoded labels
    # the model was trained on). Decode those once instead of assuming they
    # line up position-by-position with the encoder's classes_.
    class_names = loaded_label_encoder.inverse_transform(loaded_model.classes_)

    # Print the predicted probabilities for each text
    for text, prob in zip(texts, probabilities):
        sentiment_probs = {label: f"{p * 100:.2f}%" for label, p in zip(class_names, prob)}
        print(f"Text: '{text}' -> Sentiment Probabilities: {sentiment_probs}")

# Example usage: three sample sentences scored in one batch. The function
# prints one line per text and returns nothing.
new_data = [
    "Dalam Kejadian pembunuhan pada hari ini banyak sekali bukti dan korban yang membuat kesedihan dari banyak orang",
    "Hari ini sangat menyenangkan dan penuh kegembiraan",
    "Saya merasa netral tentang kejadian ini"
]

predict_sentiment_probabilities(new_data)
