In [69]:
#Task: Movie Review Sentiment Analysis (Beginners & Experienced Interns)
#Goal: Build a model that classifies movie reviews as positive or negative based on sentiment analysis.
#Requirements:
#● Preprocess text data (lowercasing, removing stopwords, tokenization).
#● Train a machine learning model (Logistic Regression, Naïve Bayes, or SVM).
#● Evaluate the model using accuracy and F1-score.
#● (Optional) Create a simple interface where users can enter a review, and the model predicts the sentiment.
#Dataset:
#● IMDb Movie Reviews Dataset
#Expected Outcome:
#A basic sentiment analysis system that classifies movie reviews as positive or negative.

In [70]:

# Import necessary libraries
import nltk
import random
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score


In [71]:
# Download movie reviews dataset
nltk.download('movie_reviews')

# Load movie reviews dataset
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [72]:

# Shuffle dataset to avoid order bias
random.shuffle(documents)


In [73]:
# Preprocessing: Extract features and labels
reviews = [" ".join(words) for words, label in documents]  # Join words into sentences
labels = [label for words, label in documents]  # Extract sentiment labels (pos/neg)


In [74]:

# Convert labels to numeric format (pos -> 1, neg -> 0)
labels = [1 if label == 'pos' else 0 for label in labels]

In [75]:

# Text Vectorization using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, max_features=5000)
X = vectorizer.fit_transform(reviews)


In [76]:
# Split dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train different models and compare performance
models = {
    "Naïve Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(kernel='linear')
}

In [77]:
# Evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model
    y_pred = model.predict(X_test)  # Predict on test set
    acc = accuracy_score(y_test, y_pred)  # Calculate accuracy
    f1 = f1_score(y_test, y_pred)  # Calculate F1-score
    
    print(f"{model_name} -> Accuracy: {acc:.2f}, F1-score: {f1:.2f}")


Naïve Bayes -> Accuracy: 0.82, F1-score: 0.81
Logistic Regression -> Accuracy: 0.85, F1-score: 0.85
SVM -> Accuracy: 0.84, F1-score: 0.84


In [78]:
# Optional: Simple User Interface for Review Prediction
def predict_review_sentiment(review):
    """
    Function to predict the sentiment of a given movie review.
    """
    review_tfidf = vectorizer.transform([review])  # Convert text to numerical format
    prediction = models["Logistic Regression"].predict(review_tfidf)  # Predict using the best model
    return "Positive" if prediction[0] == 1 else "Negative"

In [79]:
# Example Usage:
user_review = "The movie was fantastic, full of suspense and great acting!"
print(f"User Review Sentiment: {predict_review_sentiment(user_review)}")

User Review Sentiment: Positive
