<a href="https://colab.research.google.com/github/fionatjahjono/ml_project/blob/main/sentiment_analysis_ps133.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bagian 1: Import Libraries dan Load Dataset
!pip install emoji
!pip install Sastrawi

from flask import Flask, request, jsonify
import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Fetch the NLTK resources used by the notebook.
# 'punkt_tab' is the tokenizer data that newer NLTK releases look up in
# word_tokenize(); 'punkt' is kept for older releases.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'words', 'vader_lexicon'):
    nltk.download(nltk_resource)

In [None]:
# Load the review dataset and show a quick overview of its size and column types.
data = pd.read_csv('/content/sentiment_analysis.csv')
print("Shape of the dataset:")
print(data.shape)
print(data.dtypes)
# Preview only the first rows; printing thousands of rows floods the cell output.
data.head()

In [None]:
# Bar chart of how many reviews carry each rating value, ordered by rating.
ax = (
    data['rating']
    .value_counts()
    .sort_index()
    .plot(
        kind='bar',
        title='Count of Reviews by rating',
        figsize=(6, 3),
    )
)
ax.set_xlabel('Review rating')

In [None]:
# Preprocessing and feature extraction
_INDONESIAN_STOP_WORDS = None  # built on first use by _get_stop_words()
_SASTRAWI_STEMMER = None       # built on first use by _get_stemmer()


def _get_stop_words():
    """Return the NLTK Indonesian stopword set, loading it only once."""
    global _INDONESIAN_STOP_WORDS
    if _INDONESIAN_STOP_WORDS is None:
        _INDONESIAN_STOP_WORDS = set(stopwords.words('indonesian'))
    return _INDONESIAN_STOP_WORDS


def _get_stemmer():
    """Return a shared Sastrawi stemmer, creating it only once."""
    global _SASTRAWI_STEMMER
    if _SASTRAWI_STEMMER is None:
        _SASTRAWI_STEMMER = StemmerFactory().create_stemmer()
    return _SASTRAWI_STEMMER


def preprocess_text(text):
    """Normalize one review into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, strip the listed punctuation, remove digit
    runs, remove stand-alone single ASCII letters, collapse whitespace,
    tokenize with ``nltk.word_tokenize``, drop Indonesian stopwords and stem
    every remaining token with Sastrawi.

    Parameters
    ----------
    text : object
        The raw review. Anything that is not a ``str`` (for example a NaN
        from a missing CSV cell) is treated as empty.

    Returns
    -------
    str
        The processed text, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # NOTE(review): inside the character class, `(-+` is a range covering
    # '(', ')', '*' and '+', so a literal '-' is NOT removed here. Confirm
    # whether that is intended before changing the pattern.
    text = re.sub(r'[?|$|.|!_:")(-+,]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)

    tokens = nltk.word_tokenize(text)
    # The stopword set and the stemmer are loop-invariant; rebuilding them on
    # every call dominates the runtime when this runs over a whole column.
    stop_words = _get_stop_words()
    stemmer = _get_stemmer()
    stemmed = [stemmer.stem(word) for word in tokens if word.lower() not in stop_words]
    return ' '.join(stemmed)

In [None]:
# Run preprocess_text over every entry of the 'text' column and store the
# result in a new 'processed_text' column (non-string entries become '').
data['processed_text'] = data['text'].apply(preprocess_text)

In [None]:
sid = SentimentIntensityAnalyzer()


def _vader_label(text):
    """Label a text 'positive', 'negative' or 'neutral' from its VADER compound score.

    The compound score is computed once per text; scores of at least 0.05 are
    positive, scores of at most -0.05 are negative, everything else is neutral.
    """
    compound = sid.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'


# NOTE(review): the text was preprocessed with Indonesian stopwords and an
# Indonesian stemmer, while VADER presumably relies on an English lexicon --
# confirm these labels are meaningful for this dataset.
data['vader_sentiment'] = data['processed_text'].apply(_vader_label)
# Feature extraction with CountVectorizer
vocab = CountVectorizer().fit(data['processed_text'])
x = vocab.transform(data['processed_text'])

In [None]:
# Data split and model training
# Hold out 20% of the rows; features, ratings and VADER labels are partitioned
# together so the three train/test pairs stay row-aligned.
(
    x_train, x_test,
    y_train, y_test,
    s_train, s_test,
) = train_test_split(
    x,
    data['rating'],
    data['vader_sentiment'],
    test_size=0.2,
    random_state=100,
)


In [None]:
# Parameter tuning for Multinomial Naive Bayes:
# try each smoothing value with 5-fold cross-validation on the training split.
param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    cv=5,
).fit(x_train, y_train)
best_mnb = grid_search.best_estimator_

In [None]:
# Use TF-IDF vectorization
# NOTE(review): the vectorizer is fitted on the full corpus before splitting,
# so its statistics include the test rows.
tfidf_vectorizer = TfidfVectorizer().fit(data['processed_text'])
x_tfidf = tfidf_vectorizer.transform(data['processed_text'])
# The row partition depends only on the number of rows, test_size and
# random_state, so this reproduces the train/test rows of the count-vector
# split and y_train / y_test stay aligned with these matrices.
# Splitting only x_tfidf avoids re-splitting targets that were thrown away and
# avoids rebinding `_`, which IPython uses for the last cell output.
x_train_tfidf, x_test_tfidf = train_test_split(x_tfidf, test_size=0.2, random_state=100)


In [None]:
# Retrain the model with TF-IDF vectorization
# Build a NEW estimator from the tuned hyper-parameters.
# grid_search.best_estimator_ is the very object already bound to best_mnb, so
# refitting it here would silently replace the count-vector model with one
# trained on TF-IDF features.
best_mnb_tfidf = MultinomialNB(**grid_search.best_params_)
best_mnb_tfidf.fit(x_train_tfidf, y_train)
pred_mnb_tfidf = best_mnb_tfidf.predict(x_test_tfidf)

In [None]:
# Evaluate the model: accuracy as a percentage (two decimals), the confusion
# matrix and the per-class report for the TF-IDF predictions on the test split.
score_mnb_tfidf = round(accuracy_score(y_test, pred_mnb_tfidf) * 100, 2)
print(
    "Confusion Matrix for Multinomial Naive Bayes (TF-IDF):",
    confusion_matrix(y_test, pred_mnb_tfidf),
    sep="\n",
)
print("Score (TF-IDF):", score_mnb_tfidf)
print(
    "Classification Report (TF-IDF):",
    classification_report(y_test, pred_mnb_tfidf),
    sep="\n",
)

In [None]:
def predict_sentiment(review):
    """Predict a rating and a sentiment label for a single raw review.

    The review is cleaned with ``preprocess_text``; the rating comes from
    ``best_mnb`` applied to the ``vocab`` (CountVectorizer) features, and the
    sentiment label comes from thresholding the VADER compound score of the
    cleaned text at +/-0.05.

    Returns
    -------
    tuple
        ``(rating_prediction, predicted_sentiment)`` where the label is one of
        ``'positive'``, ``'negative'`` or ``'neutral'``.
    """
    cleaned = preprocess_text(review)
    features = vocab.transform([cleaned])
    compound = sid.polarity_scores(cleaned)['compound']
    rating_prediction = best_mnb.predict(features)[0]

    # The two thresholds are disjoint, so the order of the checks is irrelevant.
    if compound >= 0.05:
        label = 'positive'
    elif compound <= -0.05:
        label = 'negative'
    else:
        label = 'neutral'
    return rating_prediction, label

In [None]:
app = Flask(__name__)


@app.route('/predict', methods=['POST'])
def predict():
    """Handle POST /predict: score the review sent in the JSON body.

    Expects a JSON object with a ``review`` field and answers with
    ``{"rating": ..., "sentiment": ...}``. A body that is not a JSON object or
    lacks ``review`` gets a 400 response with an ``error`` message instead of
    an unhandled exception.
    """
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'review' not in payload:
        return jsonify({'error': "JSON body must be an object with a 'review' field"}), 400

    rating_prediction, sentiment_prediction = predict_sentiment(payload['review'])
    # The classifier returns a NumPy scalar, which jsonify cannot serialize;
    # .item() converts it to the matching built-in Python value.
    if hasattr(rating_prediction, 'item'):
        rating_prediction = rating_prediction.item()
    return jsonify({'rating': rating_prediction, 'sentiment': sentiment_prediction})


if __name__ == '__main__':
    # Debug mode turns on the auto-reloader (which tries to restart the
    # process and does not work from a notebook kernel) and the interactive
    # debugger (which lets clients execute code). Both are kept off.
    # NOTE: app.run() blocks this cell until the server is stopped.
    app.run(debug=False, use_reloader=False)

In [None]:
# Model Before Flask
def predict_sentiment(review):
    """Predict a rating (TF-IDF model) and a sentiment label for one raw review.

    The review is cleaned with ``preprocess_text`` and vectorized with
    ``tfidf_vectorizer`` -- the same representation ``best_mnb_tfidf`` was
    trained on -- to predict the rating. The sentiment label comes from
    thresholding the VADER compound score of the cleaned text at +/-0.05.

    Note: this redefines ``predict_sentiment``; once this cell has run, every
    caller of that name in the notebook uses this version.

    Returns
    -------
    tuple
        ``(rating_prediction, predicted_sentiment)`` where the label is one of
        ``'positive'``, ``'negative'`` or ``'neutral'``.
    """
    processed_review = preprocess_text(review)
    # Features must match the training representation of the model: TF-IDF
    # weights, not the raw counts produced by `vocab`.
    review_vectorized = tfidf_vectorizer.transform([processed_review])
    sentiment_score = sid.polarity_scores(processed_review)['compound']
    rating_prediction = best_mnb_tfidf.predict(review_vectorized)[0]
    if sentiment_score <= -0.05:
        predicted_sentiment = 'negative'
    elif sentiment_score >= 0.05:
        predicted_sentiment = 'positive'
    else:
        predicted_sentiment = 'neutral'
    return rating_prediction, predicted_sentiment

In [None]:
# Spot-check one review. The text and the "actual" rating must come from the
# same row; previously the rating was read from a different row than the text.
sample_idx = 120
pr = data.loc[sample_idx, 'text']
print(pr)
print("Actual Rating:", data.loc[sample_idx, 'rating'])
rating_pred, sentiment_pred = predict_sentiment(pr)
print("Predicted Rating:", rating_pred)
print("Predicted Sentiment:", sentiment_pred)
