In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load your IMDb dataset
df = pd.read_csv('imdb_reviews.csv')

# Assuming your CSV file has 'review' and 'sentiment' columns
X = df['Text']
y = df['Label']

In [3]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Function to map POS tag to wordnet format for lemmatization
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # POS tagging and lemmatization
    lemmatizer = WordNetLemmatizer()
    pos_tagged = pos_tag(words)
    words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag) or wordnet.NOUN) for word, tag in pos_tagged]

    # Join words back into a single string
    return ' '.join(words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [4]:
# Apply text preprocessing to the entire dataset before splitting
X = X.apply(preprocess_text)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Create a pipeline with TfidfVectorizer and Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(max_iter=1000))
])

# Train the model
pipeline.fit(X_train, y_train)

In [6]:
# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)

Accuracy: 0.8876
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      5034
           1       0.87      0.90      0.89      4966

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [9]:
import joblib
model_filename = 'Logistic-regression.joblib'
joblib.dump(pipeline, model_filename)

['Logistic-regression.joblib']

In [10]:
# Function to make a single prediction
def predict_sentiment(review):
    # Apply text preprocessing to the new review
    processed_review = preprocess_text(review)
    # Predict sentiment using the trained pipeline
    return pipeline.predict([processed_review])[0]

In [15]:
# Example usage of the single prediction function
single_review = "This movie was absolutely good!"
prediction = predict_sentiment(single_review)
print(f'Predicted Sentiment: {prediction}')

Predicted Sentiment: 1
