# In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC

# Load the Amazon Customer Reviews (US) dataset
# You can download the dataset from https://registry.opendata.aws/amazon-reviews/
# Ensure to replace 'path/to/amazon_reviews_us_Electronics_v1_00.tsv.gz' with the actual path to the dataset
# NOTE: `error_bad_lines=False` was deprecated in pandas 1.3 and removed in
# pandas 2.0; `on_bad_lines='skip'` (pandas >= 1.3) is the supported way to
# skip malformed rows instead of aborting the load.
df = pd.read_csv(
    'path/to/amazon_reviews_us_Electronics_v1_00.tsv.gz',
    sep='\t',
    on_bad_lines='skip',
)

# Explore the dataset
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Subsample the data for faster execution (you can use the full dataset for more accuracy)
# The fixed seed keeps the 10% sample reproducible between runs.
df = df.sample(frac=0.1, random_state=42)

# Select relevant columns for sentiment analysis.
# Rows missing either value are dropped here: a NaN rating would otherwise be
# silently labelled negative (NaN > 3 is False), and TfidfVectorizer raises on
# NaN documents. The explicit copy makes the result an independent frame so
# that adding columns to it later does not trigger SettingWithCopyWarning.
df = df[['star_rating', 'review_body']].dropna().copy()

# Derive a binary sentiment label from the star rating:
# more than 3 stars -> 1 (positive), 3 stars or fewer -> 0 (negative)
df['sentiment'] = df['star_rating'].gt(3).astype('int64')

# Features are the raw review texts; targets are the binary sentiment labels
X, y = df['review_body'].values, df['sentiment'].values

# Hold out 20% of the samples for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF Vectorization for text reviews
# The vocabulary (capped at the 5000 most frequent terms, English stop words
# removed) and the IDF weights are fitted on the training split only, then
# reused unchanged to transform the test split.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train the Support Vector Machine (SVM) model
# LinearSVC is used instead of SVC(kernel='linear'): SVC's fit time grows at
# least quadratically with the number of samples, which is impractical for a
# review corpus of this size, while LinearSVC scales roughly linearly.
# It optimises a squared hinge loss, so scores can differ slightly from SVC.
# dual=False is the recommended formulation when n_samples > n_features;
# random_state is kept for parity but only affects the dual solver.
svm_model = LinearSVC(dual=False, random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = svm_model.predict(X_test_tfidf)

# Evaluate the model on the held-out test split, reporting each metric as it
# is computed

# Fraction of test reviews whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Counts of true-vs-predicted label combinations
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

# Formatted per-class text report
classification_report_str = classification_report(y_test, y_pred)
print(f"Classification Report:\n{classification_report_str}")

# Visualize the results: draw the confusion matrix as a heatmap with a colour
# scale, using matplotlib's object-oriented interface
fig, ax = plt.subplots(figsize=(6, 6))
heatmap = ax.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
ax.set_title('Confusion Matrix')
fig.colorbar(heatmap, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()
