In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [22]:
df = pd.read_csv("data.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [23]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
from nltk.tokenize import word_tokenize
df['review'] = [word_tokenize(review) for review in df['review']]

In [25]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(token_list):
    """Removes stop words from a list of tokens."""
    # List comprehension for fast filtering
    return [token for token in token_list if token.lower() not in stop_words]

df['review'] =  df['review'].apply(remove_stopwords)

In [27]:
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet') 
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer() 

def lemmatize_tokens(token_list):
    return [lemmatizer.lemmatize(token) for token in token_list]

df['review'] =  df['review'].apply(lemmatize_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\danad\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer 

# STEP 1: Join the lists of tokens back into single strings
# Example: ['movie', 'good'] -> "movie good"
df['review_string'] = df['review'].apply(lambda x: " ".join(x))

# STEP 2: Initialize the Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # Added max_features for performance

# Now, fit and transform the *entire* column (X) in one go
# This creates the single M x N feature matrix needed by LogisticRegression.
X = tfidf_vectorizer.fit_transform(df['review_string']) 

# Y remains the sentiment column
y = df['sentiment'] 

In [29]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [30]:
model = LogisticRegression(max_iter=1000)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8888

Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.87      0.89      4961
    positive       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [38]:
import pickle
filename = 'logistic_regression_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(model, file)

In [36]:
text = "I hated this movie because of how good it is. I enjoyed the scenery so much!"


def process_input(text):
    processed_text = word_tokenize(text)
    processed_text = [lemmatizer.lemmatize(token) for token in processed_text]
    single_document = " ".join(processed_text)
    document_list = [single_document]
    X_features = tfidf_vectorizer.transform(document_list)
    print(model.predict(X_features))
    
process_input(text)
    

['positive']
