In [4]:
import fireducks.pandas as pd
import os

# Path to the extracted chunk files (from your Kaggle dataset structure)
extracted_chunks_path = "/kaggle/input/processed-chunks-1"  # Adjust if the path differs

# Combine all chunk files
all_chunks = []
for file_name in sorted(os.listdir(extracted_chunks_path)):  # Ensure files are combined in order
    if file_name.startswith("processed_chunk_") and file_name.endswith(".csv"):
        file_path = os.path.join(extracted_chunks_path, file_name)
        print(f"Loading {file_name}...")
        chunk = pd.read_csv(file_path)
        all_chunks.append(chunk)

# Concatenate all chunks into a single DataFrame
combined_data = pd.concat(all_chunks, ignore_index=True)

# Display combined data info
print("Combined data shape:", combined_data.shape)


Loading processed_chunk_0_50000.csv...
Loading processed_chunk_1000000_1050000.csv...
Loading processed_chunk_100000_150000.csv...
Loading processed_chunk_1050000_1100000.csv...
Loading processed_chunk_1100000_1150000.csv...
Loading processed_chunk_1150000_1200000.csv...
Loading processed_chunk_1200000_1250000.csv...
Loading processed_chunk_1250000_1300000.csv...
Loading processed_chunk_1300000_1350000.csv...
Loading processed_chunk_1350000_1400000.csv...
Loading processed_chunk_1400000_1450000.csv...
Loading processed_chunk_1450000_1500000.csv...
Loading processed_chunk_1500000_1550000.csv...
Loading processed_chunk_150000_200000.csv...
Loading processed_chunk_1550000_1600000.csv...
Loading processed_chunk_1600000_1650000.csv...
Loading processed_chunk_1650000_1700000.csv...
Loading processed_chunk_1700000_1750000.csv...
Loading processed_chunk_1750000_1800000.csv...
Loading processed_chunk_1800000_1850000.csv...
Loading processed_chunk_1850000_1900000.csv...
Loading processed_chunk_1

In [5]:
# Save the combined dataset as a CSV for future use
combined_data_path = "/kaggle/working/combined_processed_data.csv"
combined_data.to_csv(combined_data_path, index=False)
print(f"Combined data saved at: {combined_data_path}")
# Load the saved combined dataset
combined_data = pd.read_csv("/kaggle/working/combined_processed_data.csv")

# Check the dataset structure
print(combined_data.info())
print(combined_data.head())
# Check label distribution
print(combined_data['label'].value_counts())

Combined data saved at: /kaggle/working/combined_processed_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   review          object
 1   label           int64 
 2   cleaned_review  object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB
None
                                              review  label  \
0  Stuning even for the non-gamer: This sound tra...      2   
1  The best soundtrack ever to anything.: I'm rea...      2   
2  Amazing!: This soundtrack is my favorite music...      2   
3  Excellent Soundtrack: I truly like this soundt...      2   
4  Remember, Pull Your Jaw Off The Floor After He...      2   

                                      cleaned_review  
0  stun non gamer sound track beautiful paint sen...  
1  good soundtrack read lot review say good game ...  
2  amazing soundtrack favorite music time hand in...  
3  excellent soundtrack t

In [6]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X = combined_data['cleaned_review']  # Features (cleaned reviews)
y = combined_data['label']           # Labels (1 for neutral, 2 for positive)

# Perform train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")


Training samples: 2880000
Testing samples: 720000


# Transformer model

In [10]:
from gensim.models import Word2Vec, FastText
from sklearn.model_selection import train_test_split
import numpy as np
import multiprocessing
import gc

# Ensure all reviews are strings before splitting and handle missing values
combined_data['cleaned_review'] = combined_data['cleaned_review'].fillna('').astype(str)
X = combined_data['cleaned_review']
y = combined_data['label']

# Reduce dataset size to prevent memory issues
sample_size = min(500000, len(X))  # Limit to 500k samples if dataset is larger
X_sampled, _, y_sampled, _ = train_test_split(X, y, train_size=sample_size, random_state=42)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42)

# Tokenize the text data
sentences_train = [review.split() for review in X_train]
sentences_test = [review.split() for review in X_test]

# Optimize training parameters for speed and memory
num_workers = multiprocessing.cpu_count() // 2  # Use half of available CPU cores
vector_size = 50  # Reduce size to optimize speed and memory
min_count = 5  # Ignore words that appear less frequently
window = 4  # Slightly smaller context window
epochs = 3  # Reduce epochs to save memory

# Train Word2Vec Model
w2v_model = Word2Vec(sentences_train, vector_size=vector_size, window=window, min_count=min_count, workers=num_workers)
w2v_model.train(sentences_train, total_examples=len(sentences_train), epochs=epochs)

# Train FastText Model
ft_model = FastText(sentences_train, vector_size=vector_size, window=window, min_count=min_count, workers=num_workers)
ft_model.train(sentences_train, total_examples=len(sentences_train), epochs=epochs)

# Function to convert reviews to vectors by averaging word embeddings
def get_avg_word_vector(words, model, vector_size):
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

# Convert train and test sets into vectors
X_train_w2v = np.array([get_avg_word_vector(review, w2v_model, vector_size) for review in sentences_train], dtype=np.float32)
X_test_w2v = np.array([get_avg_word_vector(review, w2v_model, vector_size) for review in sentences_test], dtype=np.float32)

X_train_ft = np.array([get_avg_word_vector(review, ft_model, vector_size) for review in sentences_train], dtype=np.float32)
X_test_ft = np.array([get_avg_word_vector(review, ft_model, vector_size) for review in sentences_test], dtype=np.float32)

# Free memory
gc.collect()

print("Feature engineering complete.")


Feature engineering complete.


# XG Boost

In [11]:
from xgboost import XGBClassifier, DMatrix, cv
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Convert feature arrays into NumPy arrays for XGBoost
X_train_final = np.array(X_train_w2v)  # Use X_train_ft if using FastText
X_test_final = np.array(X_test_w2v)

# Ensure labels start from 0 for XGBoost
y_train_adjusted = y_train.values - 1
y_test_adjusted = y_test.values - 1

# Convert data into DMatrix (optimized for XGBoost)
dtrain = DMatrix(X_train_final, label=y_train_adjusted)
dtest = DMatrix(X_test_final, label=y_test_adjusted)

# Hyperparameter tuning (reduced search space)
param_grid = {
    'learning_rate': [0.05, 0.1], 
    'n_estimators': [200, 500],  
    'max_depth': [3, 5],  
    'min_child_weight': [1, 5],  
    'gamma': [0, 0.1],  
    'tree_method': 'hist',  
    'device': 'cuda'  # Use GPU
}

# Stratified K-Fold Cross Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_model = None
best_acc = 0

# Iterate over hyperparameters
for lr in param_grid['learning_rate']:
    for n_est in param_grid['n_estimators']:
        for max_depth in param_grid['max_depth']:
            for min_child_weight in param_grid['min_child_weight']:
                for gamma in param_grid['gamma']:
                    print(f"Training XGBoost with lr={lr}, n_est={n_est}, depth={max_depth}, min_child={min_child_weight}, gamma={gamma}...")

                    # Define model
                    xgb_clf = XGBClassifier(
                        learning_rate=lr,
                        n_estimators=n_est,
                        max_depth=max_depth,
                        min_child_weight=min_child_weight,
                        gamma=gamma,
                        use_label_encoder=False,
                        eval_metric='logloss',
                        tree_method='hist',
                        device='cuda'
                    )

                    # Train with early stopping
                    eval_set = [(X_train_final, y_train_adjusted), (X_test_final, y_test_adjusted)]
                    xgb_clf.fit(X_train_final, y_train_adjusted, early_stopping_rounds=10, eval_set=eval_set, verbose=False)

                    # Predictions
                    y_pred = xgb_clf.predict(X_test_final)
                    y_pred_original = y_pred + 1  # Convert back to original labels

                    # Evaluate performance
                    accuracy = accuracy_score(y_test, y_pred_original)
                    print(f"Accuracy: {accuracy:.4f}")

                    if accuracy > best_acc:
                        best_acc = accuracy
                        best_model = xgb_clf

# Final model evaluation
print("\nBest XGBoost Model:")
print(f"Accuracy: {best_acc:.4f}")
print(classification_report(y_test, best_model.predict(X_test_final) + 1))


Training XGBoost with lr=0.05, n_est=200, depth=3, min_child=1, gamma=0...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Accuracy: 0.8165
Training XGBoost with lr=0.05, n_est=200, depth=3, min_child=1, gamma=0.1...
Accuracy: 0.8165
Training XGBoost with lr=0.05, n_est=200, depth=3, min_child=5, gamma=0...




Accuracy: 0.8165
Training XGBoost with lr=0.05, n_est=200, depth=3, min_child=5, gamma=0.1...




Accuracy: 0.8165
Training XGBoost with lr=0.05, n_est=200, depth=5, min_child=1, gamma=0...




Accuracy: 0.8349
Training XGBoost with lr=0.05, n_est=200, depth=5, min_child=1, gamma=0.1...




Accuracy: 0.8349
Training XGBoost with lr=0.05, n_est=200, depth=5, min_child=5, gamma=0...




Accuracy: 0.8347
Training XGBoost with lr=0.05, n_est=200, depth=5, min_child=5, gamma=0.1...




Accuracy: 0.8347
Training XGBoost with lr=0.05, n_est=500, depth=3, min_child=1, gamma=0...




Accuracy: 0.8366
Training XGBoost with lr=0.05, n_est=500, depth=3, min_child=1, gamma=0.1...




Accuracy: 0.8366
Training XGBoost with lr=0.05, n_est=500, depth=3, min_child=5, gamma=0...




Accuracy: 0.8366
Training XGBoost with lr=0.05, n_est=500, depth=3, min_child=5, gamma=0.1...




Accuracy: 0.8366
Training XGBoost with lr=0.05, n_est=500, depth=5, min_child=1, gamma=0...




Accuracy: 0.8482
Training XGBoost with lr=0.05, n_est=500, depth=5, min_child=1, gamma=0.1...




Accuracy: 0.8480
Training XGBoost with lr=0.05, n_est=500, depth=5, min_child=5, gamma=0...




Accuracy: 0.8482
Training XGBoost with lr=0.05, n_est=500, depth=5, min_child=5, gamma=0.1...




Accuracy: 0.8482
Training XGBoost with lr=0.1, n_est=200, depth=3, min_child=1, gamma=0...




Accuracy: 0.8319
Training XGBoost with lr=0.1, n_est=200, depth=3, min_child=1, gamma=0.1...




Accuracy: 0.8319
Training XGBoost with lr=0.1, n_est=200, depth=3, min_child=5, gamma=0...




Accuracy: 0.8319
Training XGBoost with lr=0.1, n_est=200, depth=3, min_child=5, gamma=0.1...




Accuracy: 0.8319
Training XGBoost with lr=0.1, n_est=200, depth=5, min_child=1, gamma=0...




Accuracy: 0.8440
Training XGBoost with lr=0.1, n_est=200, depth=5, min_child=1, gamma=0.1...




Accuracy: 0.8440
Training XGBoost with lr=0.1, n_est=200, depth=5, min_child=5, gamma=0...




Accuracy: 0.8436
Training XGBoost with lr=0.1, n_est=200, depth=5, min_child=5, gamma=0.1...




Accuracy: 0.8436
Training XGBoost with lr=0.1, n_est=500, depth=3, min_child=1, gamma=0...




Accuracy: 0.8454
Training XGBoost with lr=0.1, n_est=500, depth=3, min_child=1, gamma=0.1...




Accuracy: 0.8454
Training XGBoost with lr=0.1, n_est=500, depth=3, min_child=5, gamma=0...




Accuracy: 0.8451
Training XGBoost with lr=0.1, n_est=500, depth=3, min_child=5, gamma=0.1...




Accuracy: 0.8451
Training XGBoost with lr=0.1, n_est=500, depth=5, min_child=1, gamma=0...




Accuracy: 0.8523
Training XGBoost with lr=0.1, n_est=500, depth=5, min_child=1, gamma=0.1...




Accuracy: 0.8522
Training XGBoost with lr=0.1, n_est=500, depth=5, min_child=5, gamma=0...




Accuracy: 0.8515
Training XGBoost with lr=0.1, n_est=500, depth=5, min_child=5, gamma=0.1...




Accuracy: 0.8515

Best XGBoost Model:
Accuracy: 0.8523
              precision    recall  f1-score   support

           1       0.86      0.85      0.85     49906
           2       0.85      0.86      0.85     50094

    accuracy                           0.85    100000
   macro avg       0.85      0.85      0.85    100000
weighted avg       0.85      0.85      0.85    100000

