In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/vectorization/vectorization.py
/kaggle/input/reviews/star_reviews (1).db


In [3]:
import sqlite3
import pandas as pd

In [4]:
# Load in preprocessed sql database:
filename = '/kaggle/input/reviews/star_reviews (1).db'
conn = sqlite3.connect(filename)
cursor = conn.cursor()

query = "SELECT * FROM data"
df = pd.read_sql(query, conn)
print(df)

cursor.close()
conn.close()

       stars                                     processed_text
0        3.0  decid eat here, awar go take two hour begin en...
1        5.0  i'v taken lot spin class years, noth compar cl...
2        3.0  famili dinner. buffets. eclect assortment: lar...
3        5.0  now! mummy, different, delicious. favorit lamb...
4        4.0  mute interior owner (?) gave us tour come rati...
...      ...                                                ...
26995    4.0  inn mari bar area wonderful. son love chees fi...
26996    5.0  first review yelp feel share experience. move ...
26997    3.0  place perfect want stay bourbon staff nice man...
26998    3.0  aimlessli stare menu, get foggi larger! mean, ...
26999    5.0  place delightful. came parent last saturday li...

[27000 rows x 2 columns]


In [83]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.decomposition import TruncatedSVD
import numpy as np

In [5]:
# import .py file
import sys
sys.path.insert(1, '/kaggle/input/vectorization') 
sys.path.insert(1, '/kaggle/input/imports') 
import vectorization

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




In [219]:
# COMPLETE DATA
# preprocessed database into vectorization module
from vectorization import *

xraw, yraw = vectorization.sql_query(filename)

trainX, testX, trainY, testY = vectorization.vectorize(filename)
trainY = trainY.astype(int)
testY = testY.astype(int)

# Check sizes
print(trainX.shape, trainY.shape, testX.shape, testY.shape)

(21600, 25366) (21600,) (5400, 25366) (5400,)


In [220]:
# Perform dimensionality reduction to minimize computational time for CNN and make sparse matrices dense

num_components = 500
svd = TruncatedSVD(n_components=num_components)
trainX_red = svd.fit_transform(trainX)
testX_red = svd.fit_transform(testX)

In [221]:
Xtrain_tensor = torch.tensor(trainX_red, dtype=torch.float32) 
Xtest_tensor = torch.tensor(testX_red, dtype=torch.float32)
ytrain_tensor = torch.tensor(trainY.values, dtype=torch.long)  
ytest_tensor = torch.tensor(testY.values, dtype=torch.long)  

In [222]:
Xtrain_tensor = Xtrain_tensor.unsqueeze(1) # includes channels size 
Xtest_tensor = Xtest_tensor.unsqueeze(1)
ytrain_tensor -= 1 # prepares for crossentropy classification expected input 
ytest_tensor -= 1


In [223]:
Xtrain_tensor.shape

torch.Size([21600, 1, 500])

In [224]:
# Create DataLoader for batching and shuffling
train_dataset = TensorDataset(Xtrain_tensor, ytrain_tensor)
test_dataset = TensorDataset(Xtest_tensor, ytest_tensor)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True) 
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [225]:
class CNN(nn.Module):
    '''
    Convolutional neural network class with max pooling and fully connected layer
    '''
    def __init__(self, input_dim, num_classes, num_filters=200, input_channels=1):
        super(CNN, self).__init__()

        # Define convolutional layers
        self.conv = nn.Conv2d(input_channels, num_filters, (1, input_dim))

        # Fully connected layer
        self.fc = nn.Linear(num_filters, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # channel dimension

        # Apply each convolutional layer and pooling
        
        conv_out = self.conv(x)  
        conv_out = F.relu(conv_out)  
        pooled = F.max_pool2d(conv_out, (conv_out.size(2), 1))  # Max pooling
        pooled = pooled.squeeze(3)  # remove the last dimension
        
        # Flatten for the fully connected layer
        x = pooled.view(pooled.size(0), -1)  # Flatten the tensor
        x = self.fc(x)  # Fully connected layer
        return x

In [226]:
def train_scores(model, train_loader, optimizer):
    ''' Trains model with given data
    Parameters: model (NN for our case), train_loader is the torch.DataLoader which provides data and labels,
    optimizer is the torch.optimizer to update parameters
    Returns: tuple of the avg train loss and train accuracy scores 
    '''
    model.train()  
    running_loss = 0.0
    correct = 0
    total = 0
    
    for batch_idx, (data, targets) in enumerate(train_loader):
        
        optimizer.zero_grad() # Zero the gradients
        # Forward pass
        outputs = model(data)
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
        loss = criterion(outputs, targets)
        # Backward pass
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        correct += predicted.eq(targets).sum().item()
        total += targets.size(0)

    train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * correct / total
    
    return train_loss, train_accuracy

In [227]:
def evaluate(model, test_loader):
    ''' Tests model on the testing data
    Parameters: model (NN), test_loader is the DataLoader with test data and labels
    Returns: tuple with test loss and test accuracy scores
    '''
    model.eval() 
    running_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():  # No need to track gradients during evaluation
        for batch_idx, (data, targets) in enumerate(test_loader):
            
            # Forward pass
            outputs = model(data)
            criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
            loss = criterion(outputs, targets)

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(targets).sum().item()
            total += targets.size(0)

    test_loss = running_loss / len(test_loader)
    test_accuracy = 100 * correct / total
    return test_loss, test_accuracy


In [228]:
# Initialize model
model = CNN(input_dim=num_components, num_classes=5)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # Tested different learning rates, .001 seems best


In [229]:
num_epochs = 20

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    
    # Train the model
    train_loss, train_accuracy = train_scores(model, train_loader, optimizer)
    print(f'Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%')

    # Evaluate the model
    test_loss, test_accuracy = evaluate(model, test_loader)
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%\n')

Epoch 1/20
Training Loss: 1.2606, Training Accuracy: 52.34%
Test Loss: 1.7333, Test Accuracy: 38.31%

Epoch 2/20
Training Loss: 1.0981, Training Accuracy: 61.25%
Test Loss: 1.8057, Test Accuracy: 37.78%

Epoch 3/20
Training Loss: 1.0591, Training Accuracy: 63.63%
Test Loss: 1.8391, Test Accuracy: 37.56%

Epoch 4/20
Training Loss: 1.0241, Training Accuracy: 66.10%
Test Loss: 1.8887, Test Accuracy: 36.00%

Epoch 5/20
Training Loss: 0.9862, Training Accuracy: 68.86%
Test Loss: 1.9217, Test Accuracy: 35.46%

Epoch 6/20
Training Loss: 0.9445, Training Accuracy: 72.19%
Test Loss: 1.9772, Test Accuracy: 35.65%

Epoch 7/20
Training Loss: 0.9003, Training Accuracy: 75.25%
Test Loss: 2.0070, Test Accuracy: 35.41%

Epoch 8/20
Training Loss: 0.8531, Training Accuracy: 78.80%
Test Loss: 2.0670, Test Accuracy: 34.39%

Epoch 9/20
Training Loss: 0.8082, Training Accuracy: 81.76%
Test Loss: 2.0914, Test Accuracy: 35.31%

Epoch 10/20
Training Loss: 0.7650, Training Accuracy: 84.51%
Test Loss: 2.1231, Te