# Baseline CNN Model Notebook

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import collections
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from sklearn.utils import shuffle
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import pickle
import torch.utils.data
import torch.optim as optim
from model import ConvNet, Net
import torchvision
import torchvision.transforms as transforms

### Pre-Processing the Data

In [2]:
def parseTrainData(topicLabels, df):
    """Group training sentences and their integer labels by topic.

    Args:
        topicLabels: dict mapping each topic name to its integer class label.
        df: DataFrame with a `category` column (topic name) and a `text`
            column (the sentence), one sentence per row.

    Returns:
        (data, labels): two nested dicts indexed as ``[dataType][topic]``, where
        ``dataType`` is always ``'train'``. ``data`` holds each topic's sentences
        in row order and ``labels`` holds the matching integer labels. Rows whose
        category is not a key of ``topicLabels`` are ignored.
    """
    data = {}
    labels = {}

    for dataType in ['train']:
        data[dataType] = {topic: [] for topic in topicLabels}
        labels[dataType] = {topic: [] for topic in topicLabels}

        # One pass over the frame instead of a full iterrows() scan per topic.
        # An empty frame may not even have the columns, so skip the lookup then.
        rows = zip(df['category'], df['text']) if len(df) else ()
        for category, text in rows:
            if category in data[dataType]:
                data[dataType][category].append(text)
                # Label is the integer assigned to this category in topicLabels
                labels[dataType][category].append(topicLabels[category])

        for topic in topicLabels:
            assert len(data[dataType][topic]) == len(labels[dataType][topic]), \
                    "{}/{} data size does not match labels size".format(dataType, topic)
    return data, labels

At this stage, we have read in the raw training data and grouped it by topic. Next, we combine the per-topic records into one list and shuffle them.

In [3]:
def prepareTrainData(data, labels):
    """Build the shuffled training set from the per-topic L3 data.

    Args:
        data: nested dict ``data['train'][topic]`` of sentence lists.
        labels: nested dict ``labels['train'][topic]`` of label lists, aligned
            with ``data``.

    Returns:
        (dataTrain, labelsTrain): all six topics concatenated and then shuffled
        together, so sentence i still corresponds to label i.
    """
    topicOrder = (
        'pro-immigration', 'anti-immigration',
        'pro-guns', 'anti-guns',
        'pro-medicare', 'oppose-medicare',
    )

    def joinTopics(perTopic):
        # Concatenate the per-topic lists in the fixed order above
        joined = perTopic['train'][topicOrder[0]]
        for topic in topicOrder[1:]:
            joined = joined + perTopic['train'][topic]
        return joined

    allSentences = joinTopics(data)
    allLabels = joinTopics(labels)

    # shuffle() applies the same permutation to both sequences
    shuffledSentences, shuffledLabels = shuffle(allSentences, allLabels)

    return shuffledSentences, shuffledLabels

    

In [4]:
def prepareTestData(candidates):
    """Load and tokenize the cleaned sentences of every candidate.

    For each name in ``candidates`` this reads ``./data/<name>_cleaned.csv``,
    takes its ``text`` column and runs each entry through ``sentenceToWords``.

    Returns:
        dict mapping candidate name -> list of tokenized sentences.
    """
    def tokenizedSentences(candidate):
        frame = pd.read_csv('./data/' + candidate + '_cleaned.csv')
        rawSentences = frame.text.to_list()
        return [sentenceToWords(raw) for raw in rawSentences]

    return {candidate: tokenizedSentences(candidate) for candidate in candidates}

Now we want to tokenize our input.

In [5]:
def _englishStopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once.

    The corpus is downloaded (quietly) on first use and the resulting set is
    memoized on the function object, so later calls do not touch the corpus.
    """
    cached = getattr(_englishStopwords, "_cached", None)
    if cached is None:
        nltk.download("stopwords", quiet = True)
        cached = frozenset(stopwords.words("english"))
        _englishStopwords._cached = cached
    return cached


def sentenceToWords(sentence):
    """Tokenize one sentence into lower-case words with stopwords removed.

    Args:
        sentence: the raw sentence string.

    Returns:
        List of words in their original order. Every character outside
        ``[a-zA-Z0-9]`` is treated as a separator. Stemming is intentionally
        not applied.
    """
    import re  # imported here because the notebook's import cell has no explicit `import re`

    # Membership tests against a set built once, instead of re-reading the
    # stopword list for every single word.
    stopWords = _englishStopwords()

    sentence = re.sub(r"[^a-zA-Z0-9]", " ", sentence.lower()) # Lower-case, strip punctuation
    words = sentence.split() # Split string into words
    words = [w for w in words if w not in stopWords] # Remove stopwords

    return words

Now we will apply the above method to all of our data and cache the results as well.

In [6]:
# Directory that holds the pickled preprocessing results, so a later run can
# reuse them instead of tokenizing everything again.
cacheDir = os.path.join("../cache", "cnn_analysis")

# Create it up front; exist_ok makes re-running this cell harmless.
os.makedirs(name=cacheDir, exist_ok=True)

def _loadPreprocessedCache(cachePath, cacheFile, dataTrain):
    """Return the cached dict if it is readable and matches the input size, else None."""
    if not os.path.isfile(cachePath):
        return None

    try:
        # NOTE(security): pickle.load can execute arbitrary code. Only point this
        # at cache files written by this notebook, never at untrusted files.
        with open(cachePath, "rb") as f:
            cacheData = pickle.load(f)
        sizes = (len(cacheData['wordsTrain']), len(cacheData['labelsTrain']), len(dataTrain))
    except Exception as err:
        # The cache is best-effort: report the problem and fall back to recomputing.
        print("Ignoring unreadable cache file {}: {!r}".format(cacheFile, err))
        return None

    if len(set(sizes)) != 1:
        # A different number of sentences means the cache was built from other data.
        print("Ignoring stale cache file:", cacheFile)
        return None
    return cacheData


def preprocessData(dataTrain, labelsTrain, cacheDir = cacheDir, cacheFile = "preprocessedData.pkl"):
    """Convert each sentence to words; read from cache if available.

    Args:
        dataTrain: list of raw training sentences.
        labelsTrain: list of labels aligned with ``dataTrain``.
        cacheDir: directory containing the cache file.
        cacheFile: name of the cache file, or ``None`` to disable caching.

    Returns:
        (wordsTrain, labelsTrain): tokenized sentences and their labels. When the
        cache is used, BOTH values come from the cache, so they stay aligned with
        each other but follow the order of the run that wrote the cache.

    Note:
        The cache is only validated by size. After changing the training data or
        the tokenizer, delete the cache file to force re-tokenization.
    """
    cachePath = None if cacheFile is None else os.path.join(cacheDir, cacheFile)

    # If cacheFile is not None, try to read from it first
    cacheData = None
    if cachePath is not None:
        cacheData = _loadPreprocessedCache(cachePath, cacheFile, dataTrain)

    # If cache is missing, then do the heavy lifting
    if cacheData is None:
        wordsTrain = [sentenceToWords(sentence) for sentence in dataTrain]

        # Write to cache file for future runs
        if cachePath is not None:
            cacheData = dict(wordsTrain=wordsTrain, labelsTrain=labelsTrain)

            with open(cachePath, "wb") as f:
                pickle.dump(cacheData, f)
            print("Wrote preprocessed data to cache file:", cacheFile)

    else:
        # Unpack data loaded from cache file
        wordsTrain, labelsTrain = (cacheData['wordsTrain'], cacheData['labelsTrain'])
        print("Read preprocessed data from cache file:", cacheFile)

    return wordsTrain, labelsTrain


### Transform the Data

We will construct a feature representation that maps each word to an integer, keeping only the words that appear most frequently. All remaining infrequent words are combined into a single category of their own.

In [7]:
def buildDict(data, vocabSize = 50000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer.

    Args:
        data: list of sentences, where each sentence is a list of words.
        vocabSize: total vocabulary size including the two reserved codes, so
            at most ``vocabSize - 2`` words receive their own integer. Values
            below 2 leave no room for real words and give an empty dictionary.

    Returns:
        dict word -> integer, where the most frequent word maps to 2, the next
        to 3, and so on. Codes 0 and 1 are left free for the 'no word' and
        'infrequent word' categories. Words with equal counts keep the order in
        which they were first seen.
    """
    # Count how often each word appears across all sentences
    wordCount = collections.Counter()
    for sentence in data:
        wordCount.update(sentence)

    # Most frequent first; sorted() is stable, so ties keep first-seen order
    wordCountSorted = sorted(wordCount.items(), key=(lambda item: item[1]), reverse=True)
    sortedWords = [item[0] for item in wordCountSorted]

    # Clamp at 0: a negative slice bound would count from the END of the list
    # and silently keep almost every word instead of none.
    keep = max(vocabSize - 2, 0)

    # This is what we are building, a dictionary that translates words into integers
    wordDict = {}
    for idx, word in enumerate(sortedWords[:keep]):
        wordDict[word] = idx + 2 # offset by 2 to skip the reserved codes 0 and 1
    return wordDict

Now we have our word dictionary, so let's convert our sentences to integer sequence representation and pad our results to a fixed length.  

In [8]:
def convertAndPad(wordDict, sentence, pad = 500):
    """Encode one tokenized sentence as a fixed-length list of integers.

    Args:
        wordDict: dict mapping known words to their integer codes.
        sentence: list of words.
        pad: length of the returned list; longer sentences are truncated.

    Returns:
        (codes, length): ``codes`` has exactly ``pad`` entries, and ``length``
        is the number of words actually encoded, ``min(len(sentence), pad)``.
    """
    NOWORD = 0 # filler code for positions past the end of the sentence
    INFREQ = 1 # code for any word that is missing from wordDict

    # Start fully padded, then overwrite the leading slots with real codes
    codes = [NOWORD] * pad
    for position, token in enumerate(sentence[:pad]):
        codes[position] = wordDict[token] if token in wordDict else INFREQ

    usedLength = min(len(sentence), pad)
    return codes, usedLength

def convertAndPadData(wordDict, data, pad = 500):
    """Encode and pad every sentence in ``data`` with ``convertAndPad``.

    Args:
        wordDict: dict mapping known words to their integer codes.
        data: list of tokenized sentences.
        pad: fixed length each sentence is padded or truncated to.

    Returns:
        (encoded, lengths): two numpy arrays, one of padded integer sequences
        (one row per sentence) and one of the pre-padding lengths.
    """
    encodedPairs = [convertAndPad(wordDict, sentence, pad) for sentence in data]

    paddedRows = [row for row, _ in encodedPairs]
    trueLengths = [length for _, length in encodedPairs]

    return np.array(paddedRows), np.array(trueLengths)

### Upload the Data

### Build and Train the PyTorch Model

In [9]:
def train(model, trainLoader, epochs, optimizer, criterion, device):
    """Run a plain supervised training loop over ``trainLoader``.

    Args:
        model: the network to optimize; called as ``model(data)``.
        trainLoader: iterable of ``(data, labels)`` batches that supports ``len()``.
        epochs: number of full passes over ``trainLoader``.
        optimizer: optimizer built over ``model``'s parameters.
        criterion: loss called as ``criterion(outputs, labels)``.
        device: device each batch is moved to before the forward pass.

    Prints the current loss every 100 steps. Returns nothing; the model is
    updated in place.
    """
    totalStep = len(trainLoader)
    for epoch in range(epochs):
        for idx, (data, labels) in enumerate(trainLoader):
            data = data.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(data)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Use the loop counter `idx` here, not the builtin `id`
            if (idx+1) % 100 == 0:
                print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                       .format(epoch+1, epochs, idx+1, totalStep, loss.item()))


### Deploy the Model for Testing

### Use the Model for Testing

## Main Function for Calls

In [12]:
def main():
    """Run the whole pipeline: load, tokenize, encode, save, then train on a sample.

    Steps:
        1. Parse ``./data/L3train_nonOneHot.csv`` into per-topic sentences/labels.
        2. Combine and shuffle the training data; load the candidates' test data.
        3. Tokenize the training sentences and build the word dictionary.
        4. Save the dictionary and the padded integer sequences under
           ``../data/pytorch``.
        5. Reload the first 250 rows and train ``ConvNet`` on them for 5 epochs.
    """
    # Initial Variables
    topicLabels = {
        'pro-immigration': 0,
        'anti-immigration': 1,
        'pro-guns': 2,
        'anti-guns': 3,
        'pro-medicare': 4,
        'oppose-medicare': 5
    }
    candidates = ['Biden', 'Buttigieg', 'Klobuchar', 'Sanders', 'Warren', 'Yang']

    # `Out` is IPython's output-history dict and only exists inside a notebook
    # kernel. Clear it when present, but do not crash when run as a plain script.
    outputHistory = globals().get('Out')
    if isinstance(outputHistory, dict):
        outputHistory.clear()
    
    data, labels = parseTrainData(topicLabels, df = pd.read_csv('./data/L3train_nonOneHot.csv'))
    
    # Let's check out the length of our training data for each category
    topicPairs = [
        ("Immigration", 'pro-immigration', 'anti-immigration', "anti"),
        ("Gun", 'pro-guns', 'anti-guns', "anti"),
        ("Medicare", 'pro-medicare', 'oppose-medicare', "oppose"),
    ]
    for title, proTopic, conTopic, conWord in topicPairs:
        print("{} articles: train = {} pro / {} {}".format(
                title, len(data['train'][proTopic]), len(data['train'][conTopic]), conWord))
    
    # Below is our prepared shuffled training data
    trainX, trainY = prepareTrainData(data, labels)
    print(collections.Counter(trainY))
    # Below is our test set
    testX = prepareTestData(candidates)
    
    print("Sentences (combined): train = {}".format(len(trainX)))
    
    # Let's go check out an example from our training data now
    print(trainX[100])
    
    # Applying the tokenizer to one sentence (lower-cased, stopwords removed)
    print(sentenceToWords(trainX[1]))
    # Now we preprocess the Data
    trainX, trainY = preprocessData(trainX, trainY)
    # Build our word Dict
    wordDict = buildDict(trainX)
    print(list(wordDict.keys())[:5])
    
    # Save our wordDict
    dataDir = '../data/pytorch' # folder to store our data
    if not os.path.exists(dataDir): # check to make sure folder exists
        print("making dataDir folder...")
        os.makedirs(dataDir)
    
    with open(os.path.join(dataDir, 'wordDict.pkl'), 'wb') as f:
        pickle.dump(wordDict, f)
    
    # Now we pad all the sentences in our training data
    trainXNum, trainXLen = convertAndPadData(wordDict, trainX)
    
    # TODO: the test sentences in testX are not padded or encoded yet
    
    # Save our data locally; column 0 = label, column 1 = sentence length,
    # remaining columns = padded word codes
    pd.concat([pd.DataFrame(trainY), pd.DataFrame(trainXLen), pd.DataFrame(trainXNum)], axis=1) \
        .to_csv(os.path.join(dataDir, 'train.csv'), header=False, index=False)
    
    # Let's load a small portion of our training dataset for testing
    trainSample = pd.read_csv(os.path.join(dataDir, 'train.csv'), header=None, names=None, nrows=250)
    
    # Turn input df into tensors.
    # nn.CrossEntropyLoss takes class indices as integer (long) targets, so the
    # labels must not be cast to float.
    trainSampleY = torch.from_numpy(trainSample[0].values).long()
    # Only the label column is dropped, so the sentence-length column stays in X
    # as its first feature (hence 501 columns per row).
    trainSampleX = torch.from_numpy(trainSample.drop([0], axis = 1).values).long()
    
    # Build the dataset
    trainSampleDS = torch.utils.data.TensorDataset(trainSampleX, trainSampleY)
    # Build the dataloader
    trainSampleDL = torch.utils.data.DataLoader(trainSampleDS, batch_size=50)
    
    # Train the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): the recorded run of this cell failed with "Expected
    # 4-dimensional input for 4-dimensional weight 6 3 5 5, but got 2-dimensional
    # input of size [50, 501]", so ConvNet (from model.py) appears to expect
    # image-shaped input rather than these integer sequences. The meaning of the
    # argument 10 is also unverified; there are 6 topic labels. Confirm in model.py.
    model = ConvNet(10).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    train(model, trainSampleDL, 5, optimizer, criterion, device)
if __name__ == "__main__":
    main()

Immigration articles: train = 900 pro / 385 anti
Immigration articles: train = 759 pro / 745 anti
Immigration articles: train = 673 pro / 639 anti
Counter({0: 900, 2: 759, 3: 745, 4: 673, 5: 639, 1: 385})
Sentences (combined): train = 4101
That does not, of course, mean that farmers in Kansas and Colorado and California should be stripped of their ability to keep a firearm for pests or hunting on their property.
['along', 'overflowing', 'access', 'guns', 'honduras', 'remain', 'high', 'death', 'rates', 'gun', 'violence']
Wrote preprocessed data to cache file: preprocessedData.pkl
['gun', 'would', 'health', 'immigrants', 'medicare']


RuntimeError: Expected 4-dimensional input for 4-dimensional weight 6 3 5 5, but got 2-dimensional input of size [50, 501] instead