In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
import numpy as np
import pandas as pd
import re

In [2]:
# Load dataset
data = pd.read_csv('dataset/mobile_jkn.csv')

# Preprocessing function to clean text
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\d', ' ', text)  # Remove digits
    text = text.lower()              # Convert to lowercase
    text = re.sub(r'\s+', ' ', text) # Remove extra spaces
    return text

# Apply preprocessing to the 'Review' column
data['content'] = data['content'].apply(preprocess_text)

# Split data into features (X) and target (y)
X = data['content']
y = data['score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
# Limit features in TF-IDF and use sparse matrix to reduce memory usage
tfidf = TfidfVectorizer(max_features=2000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Convert sparse matrix to dense for easier batch handling
X_train_tfidf = X_train_tfidf.toarray()
X_test_tfidf = X_test_tfidf.toarray()

# Define batch size
batch_size = 10000

# Initialize SVM model with linear kernel
svm_model = SVC(kernel='linear')

# Shuffle the training data before batch processing
X_train_tfidf, y_train = shuffle(X_train_tfidf, y_train, random_state=42)

# Mini-batch training
for i in range(0, X_train_tfidf.shape[0], batch_size):
    X_batch = X_train_tfidf[i:i+batch_size]
    y_batch = y_train[i:i+batch_size]
    
    # Train SVM on the current batch
    svm_model.fit(X_batch, y_batch)
    print(f"Batch {i // batch_size + 1} trained.")

# Predict on the test set
y_pred = svm_model.predict(X_test_tfidf)

# Calculate and print accuracy
svm_test_accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Test Accuracy with mini-batch training: {svm_test_accuracy:.4f}")

Batch 1 trained.
Batch 2 trained.
Batch 3 trained.
Batch 4 trained.
Batch 5 trained.
Batch 6 trained.
Batch 7 trained.
Batch 8 trained.
SVM Test Accuracy with mini-batch training: 0.8383
