In [None]:
import gzip
import json

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence

In [None]:
def parse(path):
    """Lazily yield one decoded JSON value per line of a gzip-compressed file.

    Args:
        path: Location of a gzip file that holds one JSON document per line.

    Yields:
        The object returned by ``json.loads`` for each non-blank line, in
        file order.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager guarantees the handle is released once the generator
    # is exhausted, closed, or garbage-collected (previously it was leaked).
    with gzip.open(path, 'rb') as g:
        for l in g:
            # Blank lines (e.g. a trailing newline) carry no record; skip them
            # instead of letting json.loads fail on empty input.
            if not l.strip():
                continue
            yield json.loads(l)

def getDF(path):
    """Read every record produced by ``parse(path)`` into a DataFrame.

    Each record becomes one row; rows are labelled 0, 1, 2, ... in the order
    the records were read.

    Args:
        path: File location forwarded unchanged to ``parse``.

    Returns:
        A ``pandas.DataFrame`` built from the records with ``orient='index'``.
    """
    # Key each record by its position so from_dict uses it as the row label.
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the review records and keep only the rows whose 'reviewText' is present.
df = getDF('Gift_Cards_5.json.gz').dropna(subset=['reviewText'])

#from spell_check import fixSentence
#df['reviewText'] = df['reviewText'].apply(lambda x: fixSentence(x))

#print('CHECKPOINT: Spell Check Complete')

In [None]:
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Bag-of-words features: a token is any run of ASCII letters/digits, only
# unigrams are counted, and scikit-learn's built-in English stop-word list is used.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(
    tokenizer=token.tokenize,
    stop_words='english',
    ngram_range=(1, 1),
)
text_counts = cv.fit_transform(df['reviewText'])

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    text_counts,
    df['overall'],
    test_size=0.25,
    random_state=5,
)

In [None]:
class BasicCNNModel(nn.Module):
    """Basic CNN model structure.

    Two (convolution -> ReLU -> 2x2 max-pool) stages followed by two linear
    layers. The first linear layer expects exactly 6 * 53 * 53 features per
    sample after the second pooling step, which matches e.g. 3-channel
    224x224 inputs (224 -> 221 -> 110 -> 107 -> 53).

    Args:
        num_classes: Number of output logits. When omitted, falls back to
            ``len(train_dataset.classes)``, i.e. a notebook-level
            ``train_dataset`` must already be defined in that case.
    """

    # Flattened feature count produced by the conv/pool stack for the
    # supported input size; shared by lin1 and the reshape in forward().
    _FLAT_FEATURES = 6 * 53 * 53

    def __init__(self, num_classes=None):
        # Zero-argument super() avoids naming the class (the previous call
        # referenced a non-existent ``CNNModel`` and raised NameError).
        super().__init__()

        if num_classes is None:
            # Backward-compatible default: size the head from the global dataset.
            num_classes = len(train_dataset.classes)

        self.conv1 = nn.Conv2d(3, 6, 4)
        self.conv2 = nn.Conv2d(6, 6, 4)

        self.lin1 = nn.Linear(self._FLAT_FEATURES, 6 * 53, bias=True)
        self.lin2 = nn.Linear(6 * 53, num_classes, bias=True)

        self.pool = nn.MaxPool2d(2, 2)
        self.ReLU = nn.ReLU()

    def forward(self, x):
        """Return class logits for a batch of images.

        Args:
            x: Input batch with 3 channels (required by ``conv1``).

        Returns:
            Tensor of logits with one row per flattened sample.
        """
        x = self.pool(self.ReLU(self.conv1(x)))
        x = self.pool(self.ReLU(self.conv2(x)))
        # Flatten each sample's feature maps into a single vector for the linear head.
        x = x.view(-1, self._FLAT_FEATURES)
        x = self.ReLU(self.lin1(x))
        return self.lin2(x)

# Build the model with default constructor arguments. NOTE: called this way the
# constructor reads `train_dataset.classes`, so a notebook-level `train_dataset`
# must be defined before this line runs.
basic_cnn_model = BasicCNNModel()

In [None]:
class FastCNNModel(nn.Module):
    """'Faster' CNN model (taken from GitHub).

    Note: the original authors used bigrams as inputs, not word2vec.

    The model embeds every token, averages the embeddings over the sentence
    dimension and maps the averaged vector to ``output_dim`` scores with a
    single linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        output_dim: Number of values produced per sample.
        pad_idx: Index passed to the embedding layer as ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super(FastCNNModel, self).__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=pad_idx,
        )
        self.fc = nn.Linear(in_features=embedding_dim, out_features=output_dim)

    def forward(self, text):
        """Score a batch of token-index sequences.

        Args:
            text: Index tensor laid out as [sent len, batch size].

        Returns:
            Tensor of shape [batch size, output_dim].
        """
        # [sent len, batch size, emb dim] -> [batch size, sent len, emb dim]
        batch_major = self.embedding(text).permute(1, 0, 2)

        # Average over the whole sentence axis with a (sent len x 1) window,
        # which leaves a singleton axis that is then dropped.
        sentence_length = batch_major.shape[1]
        averaged = F.avg_pool2d(batch_major, (sentence_length, 1))
        pooled = averaged.squeeze(1)  # [batch size, emb dim]

        return self.fc(pooled)

In [None]:
class DropOutCNN(nn.Module):
    """CNN text classifier that utilizes dropout (also taken from GitHub).

    Source: https://github.com/linzehui/pytorch-SentimentAnalysis/blob/master/data.py

    Notes carried over from the original cell:
      * TODO: it is unclear what input format the upstream author feeds the
        CNN; look into this further.
      * The upstream author has some interesting preprocessing / spell-check
        approaches worth trying, although the code looks ugly.

    Architecture: embedding -> one Conv2d per kernel size (each spanning the
    full embedding width) -> ReLU -> max-over-time pooling -> concatenation ->
    dropout -> linear layer.

    Args:
        nembedding: Width of each embedding vector.
        vocab_size: Number of rows in the embedding table.
        kernel_num: Output channels of every convolution.
        kernel_sizes: Iterable of window heights (in tokens), e.g. ``(3, 4, 5)``.
        label_size: Number of output logits.
        dropout: Dropout probability applied before the linear layer.
        use_pretrain: When truthy, initialise the embedding from ``embed_matrix``.
        embed_matrix: NumPy array of pretrained embeddings; required when
            ``use_pretrain`` is set.
        embed_freeze: When truthy, pretrained embeddings are not trained.

    Raises:
        ValueError: If ``use_pretrain`` is set but ``embed_matrix`` is ``None``.
    """

    def __init__(self, nembedding, vocab_size, kernel_num, kernel_sizes, label_size,
                 dropout=0.3, use_pretrain=False, embed_matrix=None, embed_freeze=False):
        # Zero-argument super() avoids naming the class (the previous call
        # referenced a non-existent ``CNNClassifier`` and raised NameError).
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, nembedding)
        if use_pretrain:
            if embed_matrix is None:
                raise ValueError("embed_matrix must be provided when use_pretrain is set")
            pretrained = torch.from_numpy(embed_matrix).type(torch.FloatTensor)
            self.embedding.weight = nn.Parameter(pretrained,
                                                 requires_grad=not embed_freeze)
        self.in_channel = 1
        self.out_channel = kernel_num
        self.kernel_sizes = kernel_sizes
        self.kernel_num = kernel_num
        # One convolution per window height K; each kernel covers K tokens and
        # the entire embedding width, so the last output axis has size 1.
        self.convs1 = nn.ModuleList([nn.Conv2d(self.in_channel, self.out_channel, (K, nembedding))
                                     for K in self.kernel_sizes])

        self.dropout = nn.Dropout(dropout)
        # in_features = len(kernel_sizes) * kernel_num because the pooled
        # outputs of all convolutions are concatenated.
        self.fc = nn.Linear(len(kernel_sizes) * kernel_num, label_size)

    def forward(self, sequences):
        """Return logits for a packed batch of token-index sequences.

        Args:
            sequences: A packed sequence accepted by ``pad_packed_sequence``.
                Every padded sentence must be at least as long as the largest
                kernel size, otherwise the convolution cannot be applied.

        Returns:
            Tensor of shape (batch_size, label_size).
        """
        # Unpack to a zero-padded, batch-first index tensor; lengths are unused.
        padded_sentences, _lengths = pad_packed_sequence(sequences, padding_value=int(0),
                                                         batch_first=True)
        x = self.embedding(padded_sentences)  # (batch_size, num_word, nembedding)

        x = x.unsqueeze(1)  # (batch_size, 1, num_word, nembedding); 1 is in_channel

        # Each entry: (batch_size, out_channel, W) after dropping the width-1 axis.
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]

        # Max over the remaining positions -> (batch_size, out_channel) per conv.
        x = [F.max_pool1d(e, e.size(2)).squeeze(2) for e in x]

        x = torch.cat(x, dim=1)  # (batch_size, len(kernel_sizes) * kernel_num)

        x = self.dropout(x)
        logits = self.fc(x)

        return logits


In [None]:
def train_cnn(model, optimizer, criterion, epochs=10, loader=None, device=None):
    """Basic train loop for a CNN classifier.

    Re-initialises the model, trains it for ``epochs`` passes over ``loader``
    and prints the accuracy and mean batch loss after every epoch.

    Args:
        model: Module to train. Every sub-module exposing ``reset_parameters``
            is re-initialised first, including layers nested inside containers.
            This also re-initialises any pretrained weights held by such layers.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        epochs: Number of passes over the data; must be at least 1.
        loader: Iterable of ``(inputs, labels)`` batches. Defaults to the
            notebook-level ``train_loader``.
        device: Device the model and batches are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        Tuple ``(final_training_loss, final_training_accuracy)`` for the last epoch.

    Raises:
        ValueError: If ``epochs`` is smaller than 1 or ``loader`` yields no samples.
        NameError: If a default is needed but the corresponding notebook-level
            variable is not defined.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    if loader is None:
        # Backward-compatible default: the notebook-level data loader.
        loader = train_loader
    if device is None:
        # The parameter shadows the notebook-level name, so look it up explicitly.
        try:
            device = globals()['device']
        except KeyError:
            raise NameError("name 'device' is not defined; pass device=... "
                            "or define it at notebook level") from None

    # modules() walks the whole tree, so layers nested in containers such as
    # nn.ModuleList / nn.Sequential are reset too (children() only saw the top level).
    for module in model.modules():
        if hasattr(module, 'reset_parameters'):
            module.reset_parameters()

    model.to(device)
    model.train()

    accuracy_list = []
    loss_list = []
    for epoch in range(epochs):
        losses = []
        correct = 0
        total = 0
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Predicted class = index of the largest logit; detach() keeps the
            # bookkeeping out of the autograd graph.
            predicted = outputs.detach().argmax(dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            losses.append(loss.item())

        if total == 0:
            raise ValueError("loader yielded no samples; cannot compute accuracy or loss")

        epoch_acc = correct / total
        epoch_loss = sum(losses) / len(losses)
        accuracy_list.append(epoch_acc)
        loss_list.append(epoch_loss)
        print('Epoch Num: %d, Accuracy: %.4f, Loss: %.4f' % (epoch + 1, epoch_acc, epoch_loss))

    final_training_accuracy = accuracy_list[-1]
    final_training_loss = loss_list[-1]
    return final_training_loss, final_training_accuracy