In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [2]:
# Set working directory
os.chdir('C:/Users/asus/Documents/GitHub/CMSC-197-Miniproject')

# Read JSON file
df = pd.read_json('data/amazon_data.json', orient='records')

# Assign X and y
X = df['Review_Text']
y = df['Label']

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

# Training and test set
train_size = 0.8
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=None)

# Convert labels to binary
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Create logistic regression model
class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        self.m, self.n = X.shape
        self.weights = np.zeros(self.n)
        self.bias = 0
        self.X = X
        self.y = y

        for _ in range(self.num_iterations):
            self.update_weights()

    def update_weights(self):
        linear_model = self.X.dot(self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)

        dw = (1 / self.m) * np.dot(self.X.T, (y_predicted - self.y))
        db = (1 / self.m) * np.sum(y_predicted - self.y)

        self.weights -= self.learning_rate * dw
        self.bias -= self.learning_rate * db

    def predict(self, X):
        linear_model = X.dot(self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        return np.array(y_predicted_cls)

# Test the model
lr = LogisticRegression(learning_rate=0.8, num_iterations=10000)
lr.fit(np.array(X_train), y_train)
predictions = lr.predict(X_eval.toarray())

# Confusion matrix
cm = confusion_matrix(y_test, predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Fake', 'Real'], yticklabels=['Fake', 'Real'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

print("\nClassification Report:")
print(classification_report(y_test, predictions))

cv_scores = cross_val_score(lr, X, y, cv=10)

print(f"10-fold Cross Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

ValueError: not enough values to unpack (expected 2, got 1)