## CNN Model Implementation

In [4]:
import sqlite3
import pandas as pd

In [5]:
# Load in preprocessed sql database:

conn = sqlite3.connect('star_reviews.db')
cursor = conn.cursor()

query = "SELECT * FROM data"
df = pd.read_sql(query, conn)
print(df)

cursor.close()
conn.close()

       stars                                     processed_text
0        3.0  decid eat here, awar go take two hour begin en...
1        5.0  i'v taken lot spin class years, noth compar cl...
2        3.0  famili dinner. buffets. eclect assortment: lar...
3        5.0  now! mummy, different, delicious. favorit lamb...
4        4.0  mute interior owner (?) gave us tour come rati...
...      ...                                                ...
26995    4.0  inn mari bar area wonderful. son love chees fi...
26996    5.0  first review yelp feel share experience. move ...
26997    3.0  place perfect want stay bourbon staff nice man...
26998    3.0  aimlessli stare menu, get foggi larger! mean, ...
26999    5.0  place delightful. came parent last saturday li...

[27000 rows x 2 columns]


In [6]:
# Vectorize the text
import vectorization

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nadia/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [7]:
# preprocessed database into vectorization module

xraw, yraw = vectorization.sql_query('star_reviews.db')

trainX, testX, trainY, testY = vectorization.vectorize('star_reviews.db')
trainY = trainY.astype(int)
testY = testY.astype(int)

# Check sizes
print(trainX.shape, trainY.shape, testX.shape, testY.shape)

(21600, 25366) (21600,) (5400, 25366) (5400,)


Turn vectorized data to tensors for pytorch usage 

In [8]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F

In [9]:
# Perform dimensionality reduction to minimize computational time for CNN, make sparse matrices dense

from sklearn.decomposition import TruncatedSVD
import numpy as np

In [7]:
num_components = 300
svd = TruncatedSVD(n_components=num_components)
trainX_red = svd.fit_transform(trainX)
testX_red = svd.fit_transform(testX)


In [8]:
print(svd.explained_variance_ratio_.sum())  # Check cumulative variance for choosing n_components above


0.6228150921655199


In [11]:
print(testX_red.shape, trainX_red.shape)

(5400, 400) (21600, 400)


In [9]:
# Convert to pytorch tensors 
Xtrain_tensor = torch.tensor(trainX_red, dtype=torch.float32) 


In [None]:
# i have issues with this cell
Xtest_tensor = torch.tensor(testX_red, dtype=torch.float32)

In [12]:
Xtest_sample = testX_red[:100]
Xtest_sample = Xtest_sample.astype(np.float32)

print("Proportion of zeros:", np.sum(Xtest_sample == 0) / Xtest_sample.size)


Proportion of zeros: 0.0


In [None]:
# testX_red = testX_red.astype('float32')
Xtest_sample = testX_red[:100]
Xtest_sample = Xtest_sample.astype(np.float32)

# Verify the data type
print(Xtest_sample.dtype)  

Xtestsample_tensor = torch.tensor(Xtest_sample, dtype=torch.float32)

float32


In [None]:
ytrain_tensor = torch.tensor(trainY, dtype=torch.long)  

In [None]:
ytest_tensor = torch.tensor(testY, dtype=torch.long)  

In [11]:
# Create DataLoader for batching and shuffling
train_dataset = TensorDataset(Xtrain_tensor, ytrain_tensor)
test_dataset = TensorDataset(Xtest_tensor, ytest_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) # batch size is the number of tokens considered
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

### Define Convolutional neural network with torch.nn

In [16]:
# 1D attempt
class CNN(nn.Module):
    '''
    Convolutional neural network class with max pooling and fully connected layer
    '''
    def __init__(self, input_dim, num_classes):
        super(CNN, self).__init__()

        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3)
        self.pool = nn.MaxPool1d(2)
        # Fully connected layer
        self.fc = nn.Linear(64 * ((input_dim - 2 * 3) // 2), num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))  # Conv1
        x = self.pool(x)  # Max pooling
        # Second convolution and ReLU activation
        x = F.relu(self.conv2(x))  # Conv2
        x = self.pool(x)  # Max pooling
        
        # Flatten the output of the convolutional layers for fc layer
        x = x.view(-1, 64 * ((x.size(2) - 2 * 3) // 2))  # Flatten
        x = self.fc(x) # Fully connected layer
        return x

In [16]:
# Attempt
class CNN(nn.Module):
    '''
    Convolutional neural network class with max pooling and fully connected layer
    '''
    def __init__(self, input_dim, num_classes, kernel_sizes=[3, 4, 5], num_filters=100, input_channels=1):
        super(CNN, self).__init__()

        # Define convolutional layers for different kernel sizes
        self.convs = nn.ModuleList(
            [nn.Conv2d(input_channels, num_filters, (k, input_dim)) for k in kernel_sizes]
        )

        # Fully connected layer
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def forward(self, x):
        # Reshape the input tensor to match Conv2d requirements
        # Input shape: (batch_size, sequence_length, feature_dim)
        # Reshaped to: (batch_size, input_channels, sequence_length, feature_dim)
        x = x.unsqueeze(1)  # Add a channel dimension (assuming input_channels=1)

        # Apply each convolutional layer and pooling
        conv_results = []
        for conv in self.convs:
            conv_out = conv(x)  # Apply convolution
            conv_out = F.relu(conv_out)  # Apply ReLU activation
            pooled = F.max_pool2d(conv_out, (conv_out.size(2), 1))  # Max pooling
            conv_results.append(pooled.squeeze(3))  # Remove the last dimension (size 1)

        # Concatenate all results from different kernels
        x = torch.cat(conv_results, 1)  # Shape: (batch_size, num_filters * len(kernel_sizes))

        # Flatten for the fully connected layer
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.fc(x)  # Fully connected layer
        return x

# more comments

class CNN(nn.Module):
    '''
    Convolutional neural network class with max pooling and fully connected layer
    '''
    def __init__(self, input_dim, num_classes, kernel_sizes=[3, 4, 5], num_filters=100):
        super(CNN, self).__init__()

        # Define convolutional layers for different kernel sizes
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filters, (k, input_dim)) for k in kernel_sizes])

        # Fully connected layer
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def forward(self, x):
        # Add an extra dimension for channel (needed for Conv2d)
        x = x.unsqueeze(1)  # Shape: (batch_size, 1, sequence_length, feature_dim)

        # Apply each convolutional layer and pooling
        conv_results = []
        for conv in self.convs:
            conv_out = conv(x)  # Shape: (batch_size, num_filters, sequence_length - kernel_size + 1, 1)
            conv_out = F.relu(conv_out)
            pooled = F.max_pool2d(conv_out, (conv_out.size(2), 1))  # Max pooling
            conv_results.append(pooled.squeeze(3))  # Remove the last dimension (size 1)

        # Concatenate all results from different kernels
        x = torch.cat(conv_results, 1)  # Shape: (batch_size, num_filters * len(kernel_sizes))

        # Fully connected layer
        x = x.view(x.size(0), -1)  # Flatten for FC layer
        x = self.fc(x)  # Shape: (batch_size, num_classes)

        return x

In [13]:
# Parameterize CNN as model
input_dim = num_components 
num_classes = 5  # Number of classes (stars)

model = CNN(input_dim=input_dim, num_classes=num_classes)

# define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)


In [12]:
# Training and Evaluation functions (dependent on tensors/trainloaders)
def train_scores(model, train_loader, optimizer):
    ''' Trains model with given data
    Parameters: model (NN for our case), train_loader is the torch.DataLoader which provides data and labels,
    optimizer is the torch.optimizer to update parameters
    Returns: tuple of the avg train loss and train accuracy scores 
    '''
    model.train()  
    running_loss = 0.0
    correct = 0
    total = 0
    
    for batch_idx, (data, targets) in enumerate(train_loader):
        print(f"Batch {batch_idx + 1}")  # Print batch number to track progress
        
        print('data floated')
        # Zero the gradients
        optimizer.zero_grad()
        print('zero gradients')

        # Forward pass
        outputs = model(data)
        print('outputs')
        loss = nn.CrossEntropyLoss(outputs, targets)
        print('loss calculated')
        # Backward pass
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        correct += predicted.eq(targets).sum().item()
        total += targets.size(0)

    train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * correct / total
    
    return train_loss, train_accuracy


def evaluate(model, test_loader):
    ''' Tests model on the testing data
    Parameters: model (NN), test_loader is the DataLoader with test data and labels
    Returns: tuple with test loss and test accuracy scores
    '''
    model.eval() 
    running_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():  # No need to track gradients during evaluation
        for batch_idx, (data, targets) in enumerate(test_loader):
            print(f"Test Batch {batch_idx + 1}")  # Print batch number for tracking
            # data = data.float()
            
            # Forward pass
            outputs = model(data)
            loss = nn.CrossEntropyLoss(outputs, targets)

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(targets).sum().item()
            total += targets.size(0)

    test_loss = running_loss / len(test_loader)
    test_accuracy = 100 * correct / total
    return test_loss, test_accuracy


In [13]:
from tqdm import tqdm

In [14]:
# Prepare tensor for CNN input
Xtrain_tensor = Xtrain_tensor.unsqueeze(1)  # Now shape is [21600, 1, 800]
Xtrain_tensor.shape

torch.Size([21600, 1, 300])

In [15]:
Xtrain_subset = Xtrain_tensor[:int(0.1 * Xtrain_tensor.shape[0])] # trying 10% sample of data to see if size is the bottleneck 


In [None]:
# Apply model with Xtrain 
model = CNN(input_dim=num_components, num_classes=5)
output = model(Xtrain_subset)
print(output.shape)  # Expected output: (batch_size, num_classes)



In [None]:
# 
num_epochs = 10

for epoch in tqdm(range(num_epochs)):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    
    # Train the model
    train_loss, train_accuracy = train_scores(model, train_loader, optimizer)
    print(f'Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%')

    # Evaluate the model
    test_loss, test_accuracy = evaluate(model, test_loader)
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%\n')


In [16]:
print(f"Number of training batches: {len(train_loader)}")
print(f"Number of test batches: {len(test_loader)}")

Number of training batches: 338
Number of test batches: 85


## Testing with random data to see if corrupt file is issue


In [11]:

import random
import string

# Function to generate random text data
def generate_random_text(num_samples=100, max_length=50):
    texts = []
    for _ in range(num_samples):
        length = random.randint(5, max_length)  # Random length for each text
        text = ''.join(random.choices(string.ascii_lowercase + ' ', k=length))  # Random text with lowercase letters and spaces
        texts.append(text.strip())
    return texts

# Generate random labels (between 1 and 5) for each sample
def generate_random_labels(num_samples=100):
    labels = [random.randint(1, 5) for _ in range(num_samples)]
    return labels

# Generate a small random dataset
num_samples = 100
texts = generate_random_text(num_samples)
labels = generate_random_labels(num_samples)

# Display the first few examples
for i in range(5):
    print(f"Text: {texts[i]}, Label: {labels[i]}")


Text: tcbm wfurtptzidzxeh, Label: 1
Text: tpbwdbpz odilivssocjk, Label: 4
Text: nnougt, Label: 3
Text: cailjujnycz blqvguhruayrdnljilyhegj gpzmzen, Label: 3
Text: tgob he easq mynsp y r jkzlflbqrsdcei, Label: 5


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the random text data
vectorizer = TfidfVectorizer(max_features=800)  # You can adjust the number of features (dimensions)
X = vectorizer.fit_transform(texts).toarray()

# Now you have your text data (X) and labels (y) to pass to the model
y = labels

# Print the shape of the data
print(f"X shape: {X.shape}, y shape: {len(y)}")


X shape: (100, 190), y shape: 100


In [13]:
X_tensor = torch.tensor(X, dtype=torch.float32)

X_tensor = X_tensor.unsqueeze(1)  # Adds a channel dimension, making the shape [100, 1, 184]

# Convert labels to tensor (assuming y is already a list or array)
y_tensor = torch.tensor(y, dtype=torch.long) 

In [17]:
model = CNN(input_dim=184, num_classes=5)


In [None]:
output = model(X_tensor)

# Print the output shape
print(output.shape)