In [1]:
import numpy as np
import pandas as pd
import os
import re
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erinmcisaac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Assumes dataset has been extracted and available in this path
base_path = '/Users/erinmcisaac/Desktop/STEM/COSC_A406/McIsaac_SentimentClassification/aclImdb'
labels = {'pos': 1, 'neg': 0}
data = []

for label in labels:
    folder = os.path.join(base_path, 'train', label)
    for file in os.listdir(folder):
        with open(os.path.join(folder, file), 'r', encoding='utf-8') as f:
            data.append([f.read(), labels[label]])

df = pd.DataFrame(data, columns=['review', 'sentiment'])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.to_csv('movie_data.csv', index=False)


In [3]:
def preprocess_text(text):
    text = re.sub('<[^>]*>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuation and lowercase
    return text


In [4]:
porter = PorterStemmer()
stop = stopwords.words('english')

def tokenizer(text):
    return [porter.stem(word) for word in text.split() if word not in stop]


In [5]:
df = pd.read_csv('movie_data.csv')
X = df['review'].apply(preprocess_text)
y = df['sentiment'].values

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,
                        tokenizer=tokenizer,
                        ngram_range=(1,1),
                        stop_words=None)

X_tfidf = tfidf.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=1, stratify=y)

clf = LogisticRegression(C=10.0, random_state=1, max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')




Accuracy: 0.8950


In [6]:
os.makedirs('pkl_objects', exist_ok=True)

pickle.dump(clf, open(os.path.join('pkl_objects', 'classifier.pkl'), 'wb'))
pickle.dump(tfidf, open(os.path.join('pkl_objects', 'tfidf.pkl'), 'wb'))
pickle.dump(stop, open(os.path.join('pkl_objects', 'stopwords.pkl'), 'wb'))
pickle.dump(porter, open(os.path.join('pkl_objects', 'porter.pkl'), 'wb'))


In [7]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stop = stopwords.words('english')
porter = PorterStemmer()

def preprocess_text(text):
    text = re.sub('<[^>]*>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuation, lowercase
    return text

def tokenizer(text):
    return [porter.stem(word) for word in text.split() if word not in stop]


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load data
df = pd.read_csv('movie_data.csv')
X = df['review'].apply(preprocess_text)
y = df['sentiment'].values

# Vectorize
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,
                        tokenizer=tokenizer,
                        ngram_range=(1, 1),
                        stop_words=None)

X_tfidf = tfidf.fit_transform(X)

# Split & Train
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=1, stratify=y)

clf = LogisticRegression(C=10.0, random_state=1, max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')




Accuracy: 0.8950


In [9]:
import pickle
import os

os.makedirs('pkl_objects', exist_ok=True)

pickle.dump(clf, open(os.path.join('pkl_objects', 'classifier.pkl'), 'wb'))
pickle.dump(tfidf, open(os.path.join('pkl_objects', 'tfidf.pkl'), 'wb'))
pickle.dump(stop, open(os.path.join('pkl_objects', 'stopwords.pkl'), 'wb'))
pickle.dump(porter, open(os.path.join('pkl_objects', 'porter.pkl'), 'wb'))
