In [1]:
# Reference https://github.com/danwild/sagemaker-sentiment-analysis/blob/master/SageMaker%20Project.ipynb

In [2]:
import os
import glob
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
import numpy as np
import pickle
import sagemaker
import torch
import torch.utils.data
import torch.optim as optim
from train.model import LSTMClassifier
from sagemaker.pytorch import PyTorch
import collections
from sklearn.utils import shuffle

In [3]:
# Integer class id for every stance category. The two sides of a topic sit on
# consecutive ids: immigration 0/1, guns 2/3, medicare 4/5.
label_dict = {
    'pro-immigration': 0,
    'anti-immigration': 1,
    'pro-guns': 2,
    'anti-guns': 3,
    'pro-medicare': 4,
    'oppose-medicare': 5,
}

In [4]:
# Helper function: read the training data and group the texts and labels by category

def read_train_data(path='./data/L3train_nonOneHot.csv'):
    """Load the training CSV and group its rows by stance category.

    The file must provide a ``category`` column (one of the keys of
    ``label_dict``) and a ``text`` column with the sentence.

    Args:
        path: Location of the training CSV file.

    Returns:
        A ``(data, labels)`` tuple of nested dicts:
        ``data['train'][category]`` is the list of texts for that category in
        file order, and ``labels['train'][category]`` is a list of the same
        length holding the category's integer id from ``label_dict``.
    """
    # Parse the file once up front rather than once per category.
    df = pd.read_csv(path)

    data = {}
    labels = {}

    for data_type in ['train']:
        data[data_type] = {}
        labels[data_type] = {}

        for target in ['pro-immigration', 'anti-immigration', 'pro-guns', 'anti-guns', 'pro-medicare', 'oppose-medicare']:
            # A boolean mask keeps the rows in file order, exactly like a
            # row-by-row scan would.
            texts = df.loc[df['category'] == target, 'text'].tolist()

            data[data_type][target] = texts
            labels[data_type][target] = [label_dict[target]] * len(texts)

            assert len(data[data_type][target]) == len(labels[data_type][target]), \
                    "{}/{} data size does not match labels size".format(data_type, target)

    return data, labels

In [5]:
data, labels = read_train_data()

# One summary line per topic: how many pro vs. anti training sentences were loaded.
topic_summaries = [
    ("Immigration Articles: train = {} pro / {} anti", 'pro-immigration', 'anti-immigration'),
    ("Gun Articles: train = {} pro / {} anti", 'pro-guns', 'anti-guns'),
    ("Medicare Articles: train = {} pro / {} anti", 'pro-medicare', 'oppose-medicare'),
]
for summary_template, pro_key, anti_key in topic_summaries:
    print(summary_template.format(
                len(data['train'][pro_key]), len(data['train'][anti_key])))

Immigration Articles: train = 900 pro / 385 anti
Gun Articles: train = 759 pro / 745 anti
Medicare Articles: train = 673 pro / 639 anti


In [6]:
def combine_train_data(data, labels, random_state=None):
    """Prepare training dataset with L3 data. Be sure to shuffle.

    Args:
        data: Nested dict where ``data['train'][category]`` is a list of texts.
        labels: Nested dict with the same layout holding the label ids.
        random_state: Forwarded to ``sklearn.utils.shuffle``. Pass an int to
            get the same ordering on every run; the default ``None`` keeps
            the shuffle non-deterministic.

    Returns:
        ``(data_train, labels_train)``: two lists of equal length, shuffled
        together so each text stays aligned with its label.
    """
    categories = ('pro-immigration', 'anti-immigration', 'pro-guns',
                  'anti-guns', 'pro-medicare', 'oppose-medicare')

    # Combine all categories together (fresh lists; the inputs are not mutated)
    data_train = []
    labels_train = []
    for category in categories:
        data_train += data['train'][category]
        labels_train += labels['train'][category]

    # Shuffle texts and labels with the same permutation
    data_train, labels_train = shuffle(data_train, labels_train, random_state=random_state)

    # Return a unified training data, training labels
    return data_train, labels_train

In [7]:
# Merge every category into one shuffled training set and report its size.
train_X, train_y = combine_train_data(data, labels)
print("Full dataset (combined): ", len(train_X))

Full dataset (combined):  4101


In [8]:
# Spot-check one (sentence, label id) pair from the shuffled training set.
train_X[33], train_y[33]

("The last time that limiting immigration was on the U.S. legislative agenda, in the mid-1990s, Barbara Jordan's commission suggested limiting family immigration and eliminating the visa lottery, which gives out visas based on chance.",
 1)

In [9]:
# Class-balance check: number of training sentences per label id.
collections.Counter(train_y)

Counter({5: 639, 4: 673, 1: 385, 0: 900, 3: 745, 2: 759})

In [283]:
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them at most once."""
    cached = getattr(_english_stopwords, "_cached", None)
    if cached is None:
        nltk.download("stopwords", quiet=True)
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cached = cached
    return cached


def sent_to_words(text):
    """Tokenize one sentence into lower-case words with stopwords removed.

    Args:
        text: The raw sentence as a string.

    Returns:
        A list of tokens: the text is lower-cased, every character outside
        ``[a-zA-Z0-9]`` is replaced by a space, the result is split on
        whitespace, and English stopwords are dropped.
    """
    # Look the stopwords up once per call instead of once per word.
    stop_words = _english_stopwords()

    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stop_words] # Remove stopwords

    return words

In [284]:
# Spot-check the tokenizer on a single training sentence.
sent_to_words(train_X[1])

['immigrants',
 'likely',
 'start',
 'business',
 'native',
 'born',
 'americans',
 'whether',
 'corner',
 'shop',
 'high',
 'tech',
 'startup']

In [285]:
# Tokenize every training sentence up front.
words_train = list(map(sent_to_words, train_X))

In [286]:
# Location of the preprocessing cache. The directory is created as soon as this
# cell runs, and cache_dir is captured as the default argument of
# preprocess_data when that function is defined.
cache_dir = os.path.join("../cache", "lstm_baseline_single_label")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

def preprocess_data(data_train, labels_train, cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each sentence to words; read from cache if available.

    Args:
        data_train: Iterable of raw sentences (strings).
        labels_train: Labels aligned with ``data_train``.
        cache_dir: Directory holding the cache file.
        cache_file: Cache file name, or ``None`` to disable caching.

    Returns:
        ``(words_train, labels_train)``.

    Warning:
        When a readable cache exists, BOTH returned values come from the
        cache and the arguments are ignored. The cache is not checked against
        the inputs, so delete the file whenever the training data or the
        tokenizer changes.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    # If caching is enabled, try to read from the cache file first
    cache_data = None
    if cache_path is not None:
        try:
            with open(cache_path, "rb") as f:
                # NOTE: pickle.load can execute arbitrary code; only point this
                # at cache files written by this notebook.
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except FileNotFoundError:
            pass  # no cache yet, fall through and build it
        except Exception as exc:
            # Still best-effort, but no longer silent, and Ctrl-C
            # (KeyboardInterrupt) is no longer swallowed.
            cache_data = None
            print("Ignoring unreadable cache file {}: {}".format(cache_file, exc))

    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        words_train = [sent_to_words(text) for text in data_train]

        # Write to cache file for future runs
        if cache_path is not None:
            cache_data = dict(words_train=words_train, labels_train=labels_train)
            with open(cache_path, "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, labels_train = (cache_data['words_train'], cache_data['labels_train'])

    return words_train, labels_train

In [287]:
# Preprocess data
# Preprocess data.
# NOTE: on a cache hit preprocess_data returns the cached words AND labels, so
# train_X / train_y then hold the cached ordering, not this run's shuffle.
# NOTE: train_X is rebound from raw strings to token lists, so re-running this
# cell without a cache would pass token lists to the tokenizer.
train_X, train_y= preprocess_data(train_X, train_y)

Read preprocessed data from cache file: preprocessed_data.pkl


# Preprocess Test Data

In [288]:
candidates = ['Biden','Sanders','Buttigieg','Klobuchar','Yang','Warren']

# Tokenize each candidate's test sentences, keyed by candidate name.
test_dict = {
    name: [
        sent_to_words(sentence)
        for sentence in pd.read_csv('./data/' + name + '_test.csv').text.to_list()
    ]
    for name in candidates
}


# Transform Input

In [289]:
def build_dict(data, vocab_size = 50000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer.

    Args:
        data: A list of sentences, where each sentence is a list of words.
        vocab_size: Total vocabulary size, including the two reserved ids
            (0 = 'no word' padding, 1 = 'infrequent' word). At most
            ``vocab_size - 2`` words are kept; for values below 2 the result
            is empty.

    Returns:
        A dict mapping word -> integer id. Ids start at 2 and follow
        descending frequency; words with equal counts keep the order in which
        they first appear in ``data``.
    """
    # Tally how often each word occurs across all sentences.
    word_count = collections.Counter(word for sentence in data for word in sentence)

    # Clamp the slice bound: a negative value would slice from the end of the
    # list and return a vocabulary of the wrong size.
    keep = max(vocab_size - 2, 0)

    # most_common() sorts by descending count with a stable sort.
    frequent_words = [word for word, _ in word_count.most_common()[:keep]]

    # Ids 0 and 1 are reserved, so real words start at 2.
    return {word: idx for idx, word in enumerate(frequent_words, start=2)}

In [290]:

def convert_and_pad(word_dict, sentence, pad=128):
    """Encode one tokenized sentence as a fixed-length list of integer ids.

    Args:
        word_dict: Mapping from word to integer id.
        sentence: List of words.
        pad: Length of the returned id list; longer sentences are truncated.

    Returns:
        ``(ids, length)``: ``ids`` has exactly ``pad`` entries, with 0 in
        unused positions and 1 for words missing from ``word_dict``;
        ``length`` is ``min(len(sentence), pad)``.
    """
    PADDING_ID = 0  # marks positions past the end of the sentence
    UNKNOWN_ID = 1  # marks words that are not in word_dict

    encoded = [PADDING_ID] * pad

    for position, token in enumerate(sentence[:pad]):
        encoded[position] = word_dict.get(token, UNKNOWN_ID)

    used_length = min(len(sentence), pad)
    return encoded, used_length

def convert_and_pad_data(word_dict, data, pad=128):
    """Encode a list of tokenized sentences with ``convert_and_pad``.

    Args:
        word_dict: Mapping from word to integer id.
        data: List of sentences, each a list of words.
        pad: Fixed length of every encoded sentence.

    Returns:
        ``(ids, lengths)`` as numpy arrays: one encoded row per sentence and
        the matching clipped sentence lengths.
    """
    encoded_pairs = [convert_and_pad(word_dict, sentence, pad) for sentence in data]
    id_rows = [ids for ids, _ in encoded_pairs]
    row_lengths = [length for _, length in encoded_pairs]

    return np.array(id_rows), np.array(row_lengths)

In [291]:
# Build the word -> integer id vocabulary from the tokenized training sentences.
word_dict = build_dict(train_X)

In [292]:
# Peek at the first five vocabulary entries; build_dict inserts words in
# descending frequency order, so these are the most frequent training words.
list(word_dict.keys())[:5]

['gun', 'would', 'health', 'immigrants', 'medicare']

In [293]:
## Save progress so far

# data_dir collects the pickled vocabulary and the training CSV written by the
# following cells; the whole folder is later passed to upload_data.
data_dir = '../data/pytorch' # The folder we will use for storing data
if not os.path.exists(data_dir): # Make sure that the folder exists
    print("making folder")
    os.makedirs(data_dir)

In [294]:
# Persist the vocabulary next to the training data so the same word -> id
# mapping can be reloaded later.
with open(os.path.join(data_dir, 'word_dict.pkl'), "wb") as f:
    print("Pickling file")
    pickle.dump(word_dict, f)

Pickling file


In [295]:
# Encode every training sentence as a fixed-length row of word ids and keep
# each sentence's clipped token count alongside it.
train_X_num, train_X_len = convert_and_pad_data(word_dict, train_X)

In [296]:
# Compare one tokenized sentence with its encoded, zero-padded id row.
print(train_X[1])
print(train_X_num[1])

['report', 'national', 'academies', 'sciences', 'engineering', 'medicine', 'found', 'little', 'effect', 'wages', 'employment', 'native', 'born', 'workers', 'long', 'term', 'undocumented', 'immigrants']
[ 417   38 1289 1057 1459  589   50  590  222  199  763  108   49   42
  124  433  362    5    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [297]:
# Confirm that no encoded row is longer than the default pad length (128).
max([len(x) for x in train_X_num])

128

In [298]:
# Encode each candidate's tokenized sentences with the training vocabulary.
test_dict_num, test_dict_len = {}, {}

for candidate_name, candidate_sentences in test_dict.items():
    padded_ids, clipped_lengths = convert_and_pad_data(word_dict, candidate_sentences)
    test_dict_num[candidate_name] = padded_ids
    test_dict_len[candidate_name] = clipped_lengths

# Upload Data to SageMaker

In [299]:
# Write the training set as a header-less CSV. Column order: label id,
# sentence length, then the padded word ids.
pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_len), pd.DataFrame(train_X_num)], axis=1) \
        .to_csv(os.path.join(data_dir, 'lstm_train.csv'), header=False, index=False)

In [300]:
# SageMaker session used for the upload below.
sagemaker_session = sagemaker.Session()

# Upload destination: the session's default bucket under a fixed key prefix.
bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/sentiment_rnn'

# Execution role that is handed to the training estimator.
# NOTE(review): presumably this only resolves inside a SageMaker notebook
# environment — confirm before running elsewhere.
role = sagemaker.get_execution_role()

In [301]:
# Upload everything in data_dir to the bucket under the prefix; input_data is
# later passed to estimator.fit as the 'training' channel.
# NOTE(review): presumably the returned value is the S3 URI of the upload — confirm.
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

# Build and Train PyTorch Model

In [302]:
# Display the model definition with syntax highlighting (shell command; needs
# the pygmentize executable on PATH).
!pygmentize train/model_lstm.py

[34mimport[39;49;00m [04m[36mtorch.nn[39;49;00m [34mas[39;49;00m [04m[36mnn[39;49;00m

[34mclass[39;49;00m [04m[32mLSTMClassifier[39;49;00m(nn.Module):
    [33m"""[39;49;00m
[33m    This is the simple RNN model we will be using to perform Sentiment Analysis.[39;49;00m
[33m    """[39;49;00m

    [34mdef[39;49;00m [32m__init__[39;49;00m([36mself[39;49;00m, embedding_dim, hidden_dim, vocab_size):
        [33m"""[39;49;00m
[33m        Initialize the model by settingg up the various layers.[39;49;00m
[33m        """[39;49;00m
        [36msuper[39;49;00m(LSTMClassifier, [36mself[39;49;00m).[32m__init__[39;49;00m()

        [36mself[39;49;00m.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=[34m0[39;49;00m)
        [36mself[39;49;00m.lstm = nn.LSTM(embedding_dim, hidden_dim)
        [36mself[39;49;00m.dense = nn.Linear(in_features=hidden_dim, out_features=[34m6[39;49;00m)
        [36mself[39;49;00m.sm = nn.Softm

In [303]:
import torch
import torch.utils.data

# Smoke-test subset: load just the first 500 rows of the training CSV
sample_csv_path = os.path.join(data_dir, 'lstm_train.csv')
train_sample = pd.read_csv(sample_csv_path, header=None, names=None, nrows=500)

# Column 0 is the label; every remaining column is a model input
sample_label_values = train_sample[[0]].values
sample_input_values = train_sample.drop([0], axis=1).values
train_sample_y = torch.from_numpy(sample_label_values).long().squeeze()
train_sample_X = torch.from_numpy(sample_input_values).long()

# Wrap the tensors in a dataset and serve them in batches of 50
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=50)

In [304]:
def train(model, train_loader, epochs, optimizer, loss_fn, device):
    """Run a plain training loop and print the mean batch loss per epoch.

    Args:
        model: The PyTorch module to optimise (updated in place).
        train_loader: Iterable of ``(inputs, targets)`` batches; must support ``len()``.
        epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer bound to the model's parameters.
        loss_fn: Callable ``loss_fn(output, targets)`` returning a scalar tensor.
        device: Device that every batch is moved to before the forward pass.
    """
    for epoch_index in range(epochs):
        model.train()
        running_loss = 0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Standard step: clear old gradients, forward, backward, update.
            optimizer.zero_grad()
            batch_loss = loss_fn(model(inputs), targets)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

        mean_loss = running_loss / len(train_loader)
        print("Epoch: {}, Cross Entropy Loss: {}".format(epoch_index + 1, mean_loss))

In [305]:
# Sanity check: push every sample batch through an untrained model and look at
# the raw output and its shape.
# `device` is defined here so the cell runs on a fresh kernel (Restart & Run All),
# and the model is built once instead of once per batch. No optimizer is
# involved because nothing is trained in this cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(100, 100, 50000).to(device)

for bt in train_sample_dl:
    bX, bY = bt

    bX = bX.to(device)
    bY = bY.to(device)

    output = model(bX)
    print(output)
    print(output.shape)

tensor([0.5399, 0.4844, 0.5216, 0.5143, 0.5339, 0.5095, 0.5003, 0.5215, 0.4973,
        0.5348, 0.5091, 0.5351, 0.4904, 0.4757, 0.4780, 0.5096, 0.5162, 0.5270,
        0.5009, 0.5387, 0.4974, 0.5172, 0.5114, 0.4826, 0.5006, 0.5193, 0.5252,
        0.4919, 0.4701, 0.5444, 0.5078, 0.4917, 0.5150, 0.4935, 0.5174, 0.5008,
        0.5335, 0.5035, 0.4914, 0.4994, 0.5127, 0.5253, 0.4919, 0.4972, 0.5186,
        0.5327, 0.5170, 0.5286, 0.5063, 0.5044], grad_fn=<SigmoidBackward>)
torch.Size([50])
tensor([0.5326, 0.5108, 0.5407, 0.5556, 0.5119, 0.5275, 0.5047, 0.4966, 0.5418,
        0.4911, 0.5362, 0.5223, 0.5032, 0.4934, 0.5104, 0.4970, 0.5074, 0.4871,
        0.4821, 0.5383, 0.5282, 0.4995, 0.5558, 0.5327, 0.4777, 0.4990, 0.4908,
        0.5090, 0.5227, 0.5202, 0.5367, 0.5060, 0.4865, 0.5290, 0.5072, 0.5074,
        0.5037, 0.5131, 0.4944, 0.5165, 0.5249, 0.4974, 0.5202, 0.5186, 0.4827,
        0.4931, 0.5206, 0.5326, 0.4893, 0.5257], grad_fn=<SigmoidBackward>)
torch.Size([50])
tensor([0.4467

In [306]:
import torch.optim as optim
# NOTE(review): this rebinds LSTMClassifier, which the imports cell at the top
# took from train.model; from here on the class comes from train.model_lstm.
from train.model_lstm import LSTMClassifier

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Positional arguments are embedding_dim, hidden_dim, vocab_size, per the
# __init__ signature shown by the pygmentize cell; 50000 matches build_dict's
# default vocab_size.
model = LSTMClassifier(100, 100, 50000).to(device)
optimizer = optim.Adam(model.parameters())
# NOTE(review): CrossEntropyLoss applies log-softmax itself, and the model
# appears to apply its own softmax (self.sm) before returning — confirm the
# double normalisation is intended.
loss_fn = torch.nn.CrossEntropyLoss()

# Smoke test: 5 epochs on the 500-row sample loader.
train(model, train_sample_dl, 5, optimizer, loss_fn, device)

  return self.sm(out.squeeze())


Epoch: 1, Cross Entropy Loss: 1.7716382980346679
Epoch: 2, Cross Entropy Loss: 1.669167399406433
Epoch: 3, Cross Entropy Loss: 1.2528348803520202
Epoch: 4, Cross Entropy Loss: 1.0678372859954834
Epoch: 5, Cross Entropy Loss: 1.0534244656562806


# Now do it on the full training dataset!

In [307]:
# Configure the SageMaker training job: entry point train_lstm.py from the
# local train/ directory, one ml.p2.xlarge instance, 10 epochs, hidden_dim=200.
# NOTE(review): train_instance_count / train_instance_type look like SageMaker
# Python SDK v1 parameter names — confirm against the installed SDK version.
estimator = PyTorch(entry_point="train_lstm.py",
                    source_dir="train",
                    role=role,
                    framework_version='0.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    hyperparameters={
                        'epochs': 10,
                        'hidden_dim': 200,
                    })

In [308]:
# Launch the training job, passing the uploaded data as the 'training' channel.
estimator.fit({'training': input_data})

2020-02-27 02:26:05 Starting - Starting the training job...
2020-02-27 02:26:06 Starting - Launching requested ML instances...
2020-02-27 02:27:05 Starting - Preparing the instances for training......
2020-02-27 02:28:05 Downloading - Downloading input data...
2020-02-27 02:28:35 Training - Downloading the training image.....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-02-27 02:29:13,549 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-02-27 02:29:13,574 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-02-27 02:29:16,677 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-02-27 02:29:16,932 sagemaker-containers INFO     Module train_lstm does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-02-27 02:29:16,932 sagemaker-containers INF

In [309]:
# Deploy the trained model to an endpoint on one ml.m4.xlarge instance.
# The endpoint stays up until the final cell calls predictor.delete_endpoint().
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

-----------------!

In [310]:
# Prep test datasets for evaluation: one row per sentence, with the sentence
# length in the first column followed by the padded word ids.

def _lengths_then_ids(lengths, ids):
    """Place the length column in front of the padded id columns."""
    return pd.concat([pd.DataFrame(lengths), pd.DataFrame(ids)], axis=1)

biden_X = _lengths_then_ids(test_dict_len["Biden"], test_dict_num["Biden"])
buttigieg_X = _lengths_then_ids(test_dict_len["Buttigieg"], test_dict_num["Buttigieg"])
sanders_X = _lengths_then_ids(test_dict_len["Sanders"], test_dict_num["Sanders"])
warren_X = _lengths_then_ids(test_dict_len["Warren"], test_dict_num["Warren"])
yang_X = _lengths_then_ids(test_dict_len["Yang"], test_dict_num["Yang"])
klobuchar_X = _lengths_then_ids(test_dict_len["Klobuchar"], test_dict_num["Klobuchar"])
train_check_X = _lengths_then_ids(train_X_len, train_X_num)

In [367]:
def get_stance(candidate_data):
    """Query the deployed endpoint and print per-topic stance and importance.

    For each topic, stance is ``((pro - anti) / (pro + anti) + 1) * 0.5``
    (1.0 = every prediction on the "pro" side, 0.0 = every prediction on the
    other side) and relative importance is the share of all predictions that
    fall on that topic. A value is reported as ``nan`` when its denominator
    is zero, instead of raising ZeroDivisionError.

    Args:
        candidate_data: Model inputs sent to ``predictor.predict`` in one call.
            NOTE(review): presumably one row of six class scores comes back per
            input row — the argmax over axis 1 relies on that; confirm.
    """
    predictions = predictor.predict(candidate_data)
    pred = np.argmax(predictions, axis=1)
    agg = collections.Counter(pred)
    total = pred.shape[0]

    def stance(pro_id, anti_id):
        mentions = agg[pro_id] + agg[anti_id]
        if mentions == 0:
            return float('nan')  # topic never predicted: stance is undefined
        return (((agg[pro_id] - agg[anti_id]) * 1.0 / mentions) + 1) * 0.5

    def importance(pro_id, anti_id):
        if total == 0:
            return float('nan')  # no predictions at all
        return (agg[pro_id] + agg[anti_id]) * 1.0 / total

    # Label ids: immigration 0/1, guns 2/3, medicare 4/5.
    immigration_stance = stance(0, 1)
    gun_stance = stance(2, 3)
    medicare_stance = stance(4, 5)

    immigration_importance = importance(0, 1)
    guns_importance = importance(2, 3)
    medicare_importance = importance(4, 5)

    print('Candidate stance on immigration:', immigration_stance, ".... Relative Importance:", immigration_importance)
    print('Candidate stance on guns:', gun_stance, ".... Relative Importance:", guns_importance)
    print('Candidate stance on medicare:', medicare_stance, ".... Relative Importance:", medicare_importance)

In [368]:
# Report Biden's stance and relative-importance scores for each topic.
print('Biden')

get_stance(biden_X)

Biden
Candidate stance on immigration: 1.0 .... Relative Importance: 0.2729007633587786
Candidate stance on guns: 0.20431893687707642 .... Relative Importance: 0.38295165394402036
Candidate stance on medicare: 0.011090573012938976 .... Relative Importance: 0.344147582697201


In [369]:
# Report Sanders's stance and relative-importance scores for each topic.
print('Sanders')

get_stance(sanders_X)

Sanders
Candidate stance on immigration: 1.0 .... Relative Importance: 0.3139050791007494
Candidate stance on guns: 0.1939252336448598 .... Relative Importance: 0.3563696919233972
Candidate stance on medicare: 0.012626262626262652 .... Relative Importance: 0.32972522897585343


In [370]:
# Report Buttigieg's stance and relative-importance scores for each topic.
print('Buttigieg')

get_stance(buttigieg_X)

Buttigieg
Candidate stance on immigration: 1.0 .... Relative Importance: 0.28063241106719367
Candidate stance on guns: 0.11671087533156499 .... Relative Importance: 0.3725296442687747
Candidate stance on medicare: 0.011396011396011374 .... Relative Importance: 0.3468379446640316


In [373]:
# Report Warren's stance and relative-importance scores for each topic.
print('Warren')

get_stance(warren_X)

Warren
Candidate stance on immigration: 0.9928315412186379 .... Relative Importance: 0.2818181818181818
Candidate stance on guns: 0.16969696969696968 .... Relative Importance: 0.3333333333333333
Candidate stance on medicare: 0.005249343832021025 .... Relative Importance: 0.38484848484848483


In [374]:
# Report Yang's stance and relative-importance scores for each topic.
print('Yang')

get_stance(yang_X)

Yang
Candidate stance on immigration: 1.0 .... Relative Importance: 0.2835820895522388
Candidate stance on guns: 0.14321608040201006 .... Relative Importance: 0.39601990049751246
Candidate stance on medicare: 0.012422360248447228 .... Relative Importance: 0.32039800995024875


In [375]:
# Report Klobuchar's stance and relative-importance scores for each topic.
print('Klobuchar')

get_stance(klobuchar_X)

Klobuchar
Candidate stance on immigration: 0.9893048128342246 .... Relative Importance: 0.31140716069941715
Candidate stance on guns: 0.13235294117647056 .... Relative Importance: 0.33971690258118237
Candidate stance on medicare: 0.01670644391408116 .... Relative Importance: 0.3488759367194005


# Compare to Mr. President

In [381]:
# Load the tweets, tokenize each one, then encode them with the training vocabulary.
df_trump = pd.read_csv('./data/trump_tweets.csv')
words_test_trump = [sent_to_words(tweet) for tweet in df_trump.text.to_list()]

trump_dict_num, trump_dict_len = convert_and_pad_data(word_dict, words_test_trump)

In [387]:
def predict(data, rows=512):
    """Send ``data`` to the deployed endpoint in chunks and stack the results.

    Args:
        data: Model inputs with one row per sentence; must expose ``shape[0]``.
        rows: Approximate upper bound on rows per request. The data is split
            into ``int(len / rows + 1)`` roughly equal chunks.

    Returns:
        An array whose FIRST row is a placeholder of six ``None`` values,
        followed by one prediction row per input row. The placeholder is kept
        for backward compatibility: callers drop it with ``result[1:]``.
    """
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))

    # Collect the per-chunk results and concatenate once; concatenating inside
    # the loop would re-copy everything accumulated so far on every iteration.
    pieces = [np.array([[None, None, None, None, None, None]])]
    pieces.extend(predictor.predict(array) for array in split_array)

    return np.concatenate(pieces, axis=0)

In [388]:
def get_stance_from_pred(predictions):
    """Print per-topic stance and relative importance from class scores.

    For each topic, stance is ``((pro - anti) / (pro + anti) + 1) * 0.5``
    (1.0 = every prediction on the "pro" side, 0.0 = every prediction on the
    other side) and relative importance is the share of all predictions that
    fall on that topic. A value is reported as ``nan`` when its denominator
    is zero, instead of raising ZeroDivisionError.

    Args:
        predictions: 2-D array-like with one row per sentence and one score
            per class; the predicted class is the argmax over axis 1.
    """
    pred = np.argmax(predictions, axis=1)
    agg = collections.Counter(pred)
    total = pred.shape[0]

    def stance(pro_id, anti_id):
        mentions = agg[pro_id] + agg[anti_id]
        if mentions == 0:
            return float('nan')  # topic never predicted: stance is undefined
        return (((agg[pro_id] - agg[anti_id]) * 1.0 / mentions) + 1) * 0.5

    def importance(pro_id, anti_id):
        if total == 0:
            return float('nan')  # no predictions at all
        return (agg[pro_id] + agg[anti_id]) * 1.0 / total

    # Label ids: immigration 0/1, guns 2/3, medicare 4/5.
    immigration_stance = stance(0, 1)
    gun_stance = stance(2, 3)
    medicare_stance = stance(4, 5)

    immigration_importance = importance(0, 1)
    guns_importance = importance(2, 3)
    medicare_importance = importance(4, 5)

    print('Candidate stance on immigration:', immigration_stance, ".... Relative Importance:", immigration_importance)
    print('Candidate stance on guns:', gun_stance, ".... Relative Importance:", guns_importance)
    print('Candidate stance on medicare:', medicare_stance, ".... Relative Importance:", medicare_importance)

In [391]:
# Same input layout as the candidate frames: length column first, padded ids after.
trump_X = pd.concat([pd.DataFrame(trump_dict_len), pd.DataFrame(trump_dict_num)], axis=1)

# Batched prediction; row 0 of the result is the placeholder row that predict() prepends.
agg_trump = predict(trump_X)

Trump


In [393]:
# Report Trump's stance and relative-importance scores; [1:] skips the
# placeholder row that predict() puts in front of the real predictions.
print('Trump')
get_stance_from_pred(agg_trump[1:])

Trump
Candidate stance on immigration: 0.9959083469721768 .... Relative Importance: 0.180112017293898
Candidate stance on guns: 0.04207369323050558 .... Relative Importance: 0.5733516753463692
Candidate stance on medicare: 0.04942208051016339 .... Relative Importance: 0.24653630735973273


# Don't Forget to Delete!!!

In [394]:
# Tear down the deployed endpoint; predictor cannot be used after this call.
predictor.delete_endpoint()