# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string

# Download the NLTK resources used below: the stop-word corpus read by
# preprocess_text and the Punkt models that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt models from the 'punkt_tab'
# package instead of the pickled 'punkt' one; without it word_tokenize raises
# LookupError. Fetching both keeps the script working on old and new versions.
nltk.download('punkt_tab')

# Locations of the tab-separated training and test corpora
train_file_path = "/Users/dawrynrosario/desktop/IST664/final_project/nlp_project/kagglemoviereviews/corpus/train.tsv"
test_file_path = "/Users/dawrynrosario/desktop/IST664/final_project/nlp_project/kagglemoviereviews/corpus/test.tsv"

# Step 1: Load both TSV files into DataFrames (training file first, then test)
train_data, test_data = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (train_file_path, test_file_path)
)

# Step 2: Preprocessing - Remove stopwords, tokenize, and clean text
# Lazily-filled cache so the NLTK stop-word corpus is read once rather than
# once per phrase (preprocess_text is applied to every row of both datasets).
_STOP_WORD_CACHE = {}

# Translation table that deletes every character in string.punctuation;
# it never changes, so it is built a single time.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """Normalize one phrase for vectorization.

    The phrase is lowercased, stripped of ASCII punctuation, tokenized with
    NLTK's ``word_tokenize``, filtered against the English stop-word list and
    re-joined with single spaces.

    Args:
        text: The raw phrase. ``None`` and NaN (what pandas yields for a
            missing cell) are treated as an empty phrase; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned phrase as a single space-separated string; ``''`` when
        the input is missing or nothing survives the filtering.
    """
    # Missing cells reach us as None or float NaN; NaN is the only float that
    # is not equal to itself. Calling .lower() on either would raise.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()  # Convert to lowercase
    # NOTE(review): punctuation is removed before stop-word filtering, so a
    # token such as "don't" becomes "dont" and presumably no longer matches
    # apostrophe-bearing entries in the stop-word list - confirm this is intended.
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    stop_words = _english_stop_words()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

# Store a cleaned copy of every phrase next to the raw text in both frames
train_data['ProcessedPhrase'] = train_data['Phrase'].map(preprocess_text)
test_data['ProcessedPhrase'] = test_data['Phrase'].map(preprocess_text)

# Step 3: Select model inputs (preprocessed text) and the training labels
X_train, y_train = train_data['ProcessedPhrase'], train_data['Sentiment']

# Only the preprocessed text is taken from the test frame
X_test = test_data['ProcessedPhrase']

# Hold out 20% of the training rows as a validation set; the fixed seed
# makes the split reproducible between runs
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.2,
)

# Step 4: Build unigram + bigram features under two weighting schemes.
# Each vectorizer is fitted on the training split only and then reused,
# unchanged, to transform the validation and test text.

# 1. Bag-of-Words: raw n-gram counts
bow_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_bow = bow_vectorizer.fit_transform(X_train_split)
X_val_bow, X_test_bow = (
    bow_vectorizer.transform(docs) for docs in (X_val, X_test)
)

# 2. TF-IDF: n-gram counts reweighted by inverse document frequency
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_split)
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(docs) for docs in (X_val, X_test)
)

# Step 5: Train and Evaluate Models
def _fit_and_report(model, title, train_features, train_labels,
                    val_features, val_labels, test_features):
    """Fit one classifier, print its validation report and predict the test set.

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.
        title: Name printed ahead of the classification report.
        train_features: Feature matrix the model is fitted on.
        train_labels: Labels matching ``train_features``.
        val_features: Feature matrix of the held-out validation rows.
        val_labels: True labels of the validation rows.
        test_features: Feature matrix of the unlabeled test rows.

    Returns:
        A ``(validation_predictions, test_predictions)`` tuple.
    """
    model.fit(train_features, train_labels)
    val_predictions = model.predict(val_features)
    print(f"\n{title} Classification Report:")
    print(classification_report(val_labels, val_predictions))
    return val_predictions, model.predict(test_features)


# Logistic Regression with Bag-of-Words
lr_bow_model = LogisticRegression(max_iter=1000)
val_pred_lr_bow, test_pred_lr_bow = _fit_and_report(
    lr_bow_model, "Logistic Regression with BOW",
    X_train_bow, y_train_split, X_val_bow, y_val, X_test_bow,
)

# Logistic Regression with TF-IDF
lr_tfidf_model = LogisticRegression(max_iter=1000)
val_pred_lr_tfidf, test_pred_lr_tfidf = _fit_and_report(
    lr_tfidf_model, "Logistic Regression with TF-IDF",
    X_train_tfidf, y_train_split, X_val_tfidf, y_val, X_test_tfidf,
)

# Naive Bayes with Bag-of-Words
nb_bow_model = MultinomialNB()
val_pred_nb_bow, test_pred_nb_bow = _fit_and_report(
    nb_bow_model, "Naive Bayes with BOW",
    X_train_bow, y_train_split, X_val_bow, y_val, X_test_bow,
)

# Naive Bayes with TF-IDF
nb_tfidf_model = MultinomialNB()
val_pred_nb_tfidf, test_pred_nb_tfidf = _fit_and_report(
    nb_tfidf_model, "Naive Bayes with TF-IDF",
    X_train_tfidf, y_train_split, X_val_tfidf, y_val, X_test_tfidf,
)

# Step 6: Attach each model's test-set predictions as its own column
test_data = test_data.assign(**{
    'LR_BOW_Sentiment': test_pred_lr_bow,
    'LR_TFIDF_Sentiment': test_pred_lr_tfidf,
    'NB_BOW_Sentiment': test_pred_nb_bow,
    'NB_TFIDF_Sentiment': test_pred_nb_tfidf,
})

# Write the test frame, predictions included, to CSV without the row index
output_file_path = "/Users/dawrynrosario/desktop/IST664/final_project/nlp_project/kagglemoviereviews/corpus/test_predictions_with_bigrams.csv"
test_data.to_csv(output_file_path, index=False)
print(f"Predictions saved to {output_file_path}")