In [None]:
# Importing libraries
import nltk
from nltk.corpus import movie_reviews
import random

# Downloading the dataset
nltk.download('movie_reviews')

# Seed for Python's `random` module so the shuffle below gives the same
# document order on every Restart & Run All.
SEED = 42

# Loading the data: one (token_list, category) pair per review file, where the
# category is the corpus category the file id is listed under.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffling so reviews are no longer grouped by category. Seeding first keeps
# the resulting order repeatable between runs.
random.seed(SEED)
random.shuffle(documents)

# Display an example
print("Example review:", " ".join(documents[0][0]))
print("Label:", documents[0][1])


In [None]:

import nltk
# Extra tokenizer data, downloaded in its own cell.
# NOTE(review): presumably needed because newer NLTK releases load 'punkt_tab'
# for word_tokenize -- confirm against the installed NLTK version.
nltk.download('punkt_tab')

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages used by the preprocessing step
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Module-level helpers shared by preprocess_text: the English stopword set
# and a WordNet lemmatizer instance
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    """Clean one review and return its lemmatized tokens.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic or that appear in the module-level
    ``stop_words`` set are dropped, and the rest are passed through the
    module-level ``lemmatizer``.

    Args:
        text: Either a raw review string or an iterable of token strings
            (the form the corpus loader produces).

    Returns:
        A list of lemmatized token strings, in their original order.
    """
    # A plain string must not go through " ".join(): joining a str inserts a
    # space between every character and destroys the words.
    joined = text if isinstance(text, str) else " ".join(text)
    tokens = word_tokenize(joined.lower())
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

# Swap each review's token list for its cleaned, lemmatized tokens; the label
# in each pair is carried over unchanged
documents = [
    (preprocess_text(tokens), sentiment)
    for tokens, sentiment in documents
]

# Print the first cleaned review as a quick sanity check
print("Preprocessed Example:", documents[0][0])


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One whitespace-joined string per review, as the vectorizer expects text
corpus = [" ".join(tokens) for tokens, _ in documents]
# Binary targets: 1 for the 'pos' label, 0 for anything else
labels = [int(tag == 'pos') for _, tag in documents]

# TF-IDF features capped at 5000 terms, materialized as a dense array
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(corpus)
X = tfidf_matrix.toarray()
y = labels


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

# Splitting data
# Split the review text (not the TF-IDF matrix fitted on the full corpus) so
# the vectorizer can be fitted on the training reviews only. Fitting TF-IDF
# before the split lets test-set vocabulary and document frequencies leak into
# the features and can bias the scores reported below.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y, test_size=0.2, random_state=42
)

# Re-fit the shared vectorizer on the training split; the test split is only
# transformed. The vectorizer left in scope therefore matches the features
# the models below are trained on.
X_train = vectorizer.fit_transform(corpus_train).toarray()
X_test = vectorizer.transform(corpus_test).toarray()

# Training Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Evaluating Logistic Regression
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_pred))
print("Logistic Regression F1 Score:", f1_score(y_test, lr_pred))

# Training Support Vector Machine
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Evaluating SVM
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("SVM F1 Score:", f1_score(y_test, svm_pred))


In [None]:
import joblib

# Keep whichever classifier has the higher test-set F1 score; on a tie the
# SVM is kept
lr_f1 = f1_score(y_test, lr_pred)
svm_f1 = f1_score(y_test, svm_pred)
if lr_f1 > svm_f1:
    best_model = lr_model
else:
    best_model = svm_model

# Persist the chosen model together with the vectorizer
joblib.dump(best_model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("Model and vectorizer saved for deployment.")


In [None]:
import joblib

# Restore the persisted classifier and TF-IDF vectorizer from disk.
# joblib files are pickles: only load artifacts written by this notebook.
model = joblib.load(filename='sentiment_model.pkl')
vectorizer = joblib.load(filename='tfidf_vectorizer.pkl')

# Defining a function for prediction
def predict_sentiment(review):
    """Classify a raw review string as 'positive' or 'negative'.

    The review is cleaned with ``preprocess_text`` (the same function applied
    to the training documents), vectorized with the module-level
    ``vectorizer`` and scored by the module-level ``model``.

    Args:
        review: The review text as a single string.

    Returns:
        'positive' when the model predicts 1, otherwise 'negative'.
    """
    # Apply the training-time cleaning first; without it the input keeps
    # stopwords and unlemmatized word forms the model was never trained on.
    # preprocess_text joins its argument with spaces, so pass a token list.
    cleaned_tokens = preprocess_text(review.split())
    # Dense array, matching the dense features the models were fitted on.
    review_vector = vectorizer.transform([" ".join(cleaned_tokens)]).toarray()
    prediction = model.predict(review_vector)[0]
    return 'positive' if prediction == 1 else 'negative'

# Interactive demo: read one review from the user and report its sentiment
if __name__ == "__main__":
    sentiment = predict_sentiment(input("Enter a movie review: "))
    print(f"The sentiment is: {sentiment}")
