In [3]:
import os
import requests
import pandas as pd
import numpy as np
import tarfile
from io import BytesIO
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Professional\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
DATASET_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

In [5]:
def download_data():
    response = requests.get(DATASET_URL)
    with tarfile.open(fileobj=BytesIO(response.content), mode="r:gz") as tar:
        tar.extractall("aclImdb")

In [6]:
def load_data():
    reviews = []
    sentiments = []

    for sentiment in ['pos', 'neg']:
        path = 'aclImdb/train/' + sentiment
        for filename in os.listdir(path):
            with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
                reviews.append(file.read())
                sentiments.append(sentiment)

    df = pd.DataFrame({'review': reviews, 'sentiment': sentiments})
    return df

In [7]:
def preprocess_data(df):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    df['review'] = df['review'].str.lower().str.replace('<[^<]+?>', '').str.replace('[^\w\s]', '')
    df['review'] = df['review'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split() if word not in stop_words]))
    return df

In [8]:
def main():
    download_data()
    df = load_data()
    df = preprocess_data(df)

    # Vectorize the text data
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['review'])
    y = df['sentiment']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the Naive Bayes classifier
    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    # Evaluate the model
    y_pred = clf.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [9]:
if __name__ == '__main__':
    main()

  df['review'] = df['review'].str.lower().str.replace('<[^<]+?>', '').str.replace('[^\w\s]', '')


Accuracy: 0.8482
Classification Report:
              precision    recall  f1-score   support

         neg       0.83      0.87      0.85      2485
         pos       0.87      0.83      0.85      2515

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000

