In [None]:
# import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import os
import matplotlib.pyplot as plt
from models import *
from configs import cfg
import pandas as pd
from nltk.translate import bleu_score
import pickle


def saveVocabulary(vocabulary):
    """Persist *vocabulary* to ``vocabulary.pickle`` in the working directory.

    The mapping is wrapped in a dict under the key ``'vocabulary'`` before it
    is pickled, which is the layout ``getVocabulary`` reads back. Progress
    messages are printed before and after the write.
    """
    # Note: the data loaders are saved elsewhere; only the vocabulary is
    # written here.
    payload = {'vocabulary': vocabulary}
    print("start saving vocabulary to pickle")
    with open('vocabulary.pickle', 'wb') as sink:
        pickle.dump(payload, sink, protocol=pickle.HIGHEST_PROTOCOL)
        print("finished writing to pickle")

def getVocabulary():
    """Load and return the vocabulary stored in ``vocabulary.pickle``.

    The pickle is expected to hold a dict whose ``'vocabulary'`` entry is the
    mapping to return. Only load pickle files you created yourself:
    unpickling untrusted data can execute arbitrary code.
    """
    with open('vocabulary.pickle', 'rb') as source:
        payload = pickle.load(source)
    vocabulary = payload['vocabulary']
    return vocabulary


# Token -> index mapping loaded from disk, and its inverse (index -> token).
VOCABULARY = getVocabulary()
VOCABULARY_FLIP = {position: token for token, position in VOCABULARY.items()}

# Every beer style known to the model. The position of a style in this list
# is its index in the one-hot style vector, so the order must not change.
BEER_STYLE_ARRAY = [
    'Old Ale',
    'Bière de Champagne / Bière Brut',
    'American Amber / Red Ale',
    'Oatmeal Stout',
    'Belgian Dark Ale',
    'Schwarzbier',
    'Witbier',
    'Weizenbock',
    'English Brown Ale',
    'Irish Dry Stout',
    'Fruit / Vegetable Beer',
    'Japanese Rice Lager',
    'English Dark Mild Ale',
    'Maibock / Helles Bock',
    'Czech Pilsener',
    'German Pilsener',
    'American Pale Ale (APA)',
    'Rauchbier',
    'American Malt Liquor',
    'American Amber / Red Lager',
    'American Pale Wheat Ale',
    'Märzen / Oktoberfest',
    'English Porter',
    'Euro Pale Lager',
    'Scotch Ale / Wee Heavy',
    'American Stout',
    'Belgian Strong Pale Ale',
    'American Brown Ale',
    'Pumpkin Ale',
    'Lambic - Fruit',
    'Altbier',
    'Bière de Garde',
    'Lambic - Unblended',
    'English Strong Ale',
    'Sahti',
    'Eisbock',
    'Dortmunder / Export Lager',
    'English Pale Ale',
    'Gose',
    'Kölsch',
    'American Dark Wheat Ale',
    'Berliner Weissbier',
    'Euro Strong Lager',
    'Low Alcohol Beer',
    'English Stout',
    'Rye Beer',
    'American IPA',
    'Happoshu',
    'American Blonde Ale',
    'American Adjunct Lager',
    'American Black Ale',
    'Black & Tan',
    'California Common / Steam Beer',
    'Munich Dunkel Lager',
    'Munich Helles Lager',
    'English Barleywine',
    'Kristalweizen',
    'Vienna Lager',
    'Wheatwine',
    'English India Pale Ale (IPA)',
    'Braggot',
    'Smoked Beer',
    'Doppelbock',
    'Milk / Sweet Stout',
    'Scottish Ale',
    'Cream Ale',
    'Belgian Strong Dark Ale',
    'Scottish Gruit / Ancient Herbed Ale',
    'Faro',
    'Hefeweizen',
    'Dunkelweizen',
    'Russian Imperial Stout',
    'American Porter',
    'American Strong Ale',
    'Gueuze',
    'Euro Dark Lager',
    'Roggenbier',
    'Keller Bier / Zwickel Bier',
    'Extra Special / Strong Bitter (ESB)',
    'American Double / Imperial Stout',
    'Irish Red Ale',
    'Foreign / Export Stout',
    'Belgian IPA',
    'English Bitter',
    'English Pale Mild Ale',
    'American Pale Lager',
    'Baltic Porter',
    'Kvass',
    'Light Lager',
    'Tripel',
    'Flanders Red Ale',
    'American Wild Ale',
    'Saison / Farmhouse Ale',
    'Belgian Pale Ale',
    'American Double / Imperial Pilsner',
    'Dubbel',
    'American Double / Imperial IPA',
    'Bock',
    # This literal was previously broken across two physical lines, which
    # left an unterminated string in the source.
    'Chile Beer',
    'Herbed / Spiced Beer',
    'Flanders Oud Bruin',
    'Winter Warmer',
    'Quadrupel (Quad)',
    'American Barleywine',
]
# Style name -> one-hot position,
# e.g. {'Old Ale': 0, 'Bière de Champagne / Bière Brut': 1, ...}
BEER_STYLE_RECORD = {style: position for position, style in enumerate(BEER_STYLE_ARRAY)}

def char2oh(c):
    """Return the one-hot encoding of token *c* as a list of ints.

    The list has one slot per ``VOCABULARY`` entry; the slot at the token's
    vocabulary index is 1 and every other slot is 0. Tokens missing from the
    vocabulary are encoded as ``'< UNK >'``.
    """
    if c not in VOCABULARY:
        hot_position = VOCABULARY['< UNK >']
    else:
        hot_position = VOCABULARY[c]
    encoding = [0 for _ in range(len(VOCABULARY))]
    encoding[hot_position] = 1
    return encoding

def oh2char(oh):
    """Return the token encoded by the one-hot sequence *oh*, or ``None``.

    ``None`` is returned when *oh* is longer than the vocabulary or when no
    element equals 1. Otherwise the token at the first position holding a 1
    is looked up in ``VOCABULARY_FLIP``.
    """
    if len(oh) > len(VOCABULARY_FLIP):
        return None
    hot_positions = (position for position, value in enumerate(oh) if value == 1)
    first_hot = next(hot_positions, None)
    if first_hot is None:
        return None
    return VOCABULARY_FLIP[first_hot]
    

def load_data(fname):
    """Read the CSV at *fname* and return it as a pandas DataFrame."""
    frame = pd.read_csv(fname)
    return frame

# save the vocabulary
"""
# save the vocabulary to pickle
vocabularySet = set()
vocabularySet.add('< SOS >')
vocabularySet.add('< EOS >')
vocabularySet.add('< UNK >')
for index, row in data.iterrows():
    review_text = list((str(row['review/text'])))
    for c in review_text:
        vocabularySet.add(c)
vocabulary_record = {k: v for v, k in enumerate(vocabularySet)}
saveVocabulary(vocabulary_record)
"""

# The data should be processed on a per-mini-batch basis; see https://piazza.com/class/jml6wogpji0o3?cid=450
def process_train_data(chunk, cfg):
    """Encode one mini-batch of reviews as model inputs and next-char labels.

    Encoding is done per mini-batch on purpose: one-hot encoding the whole
    DataFrame at once would exhaust memory.

    Each review in ``chunk['review/text']`` becomes the token sequence
    ``< SOS >``, its characters, ``< EOS >``. Every token is one-hot encoded
    with ``char2oh`` and all sequences are padded to the longest review in
    the batch with ``pad_data``. Each input step is the character one-hot
    followed by the metadata vector ``[review/overall] + one-hot(beer/style)``,
    which is constant for a given review. The label for a step is the
    one-hot of the next character; the final step is labelled ``< EOS >``.

    Args:
        chunk: pandas DataFrame holding one mini-batch, with the columns
            ``'review/text'``, ``'beer/style'`` and ``'review/overall'``.
        cfg: configuration dict. It is not read here; the parameter is kept
            so existing callers keep working.

    Returns:
        A pair of float tensors ``(inputs, labels)`` of shape N x m x d and
        N x m x v, where N is the number of encoded reviews, m the padded
        sequence length, v the vocabulary size and d = v + 1 + number of
        beer styles. The original authors recorded (50, 2075, 217) and
        (50, 2075, 112) for one batch of their data set.

    Rows whose beer style is not in ``BEER_STYLE_RECORD`` are skipped, so N
    can be smaller than ``len(chunk)``. Previously the first such row
    discarded every row after it as well.
    """
    # Encode and pad the character sequences for the whole batch.
    batch_c_one_hot_list = []
    for review_text in chunk['review/text']:
        tokens = ['< SOS >'] + list(str(review_text)) + ['< EOS >']
        batch_c_one_hot_list.append([char2oh(c) for c in tokens])
    batch_c_one_hot_list = pad_data(batch_c_one_hot_list)

    eos_one_hot = char2oh('< EOS >')  # label of the final step of each review

    batch_concat_one_hot_list = []  # model inputs for the whole batch
    batch_label_one_hot_list = []   # the same sequences shifted by one step

    # Pair reviews with their metadata by row position. The DataFrame index
    # label is not used because chunks sliced from a larger frame (for
    # example the validation split) do not start at a multiple of the batch
    # size, which previously attached metadata to the wrong review.
    rows = zip(batch_c_one_hot_list, chunk['beer/style'], chunk['review/overall'])
    for review_one_hot_list, beer_style, review_overall in rows:
        if beer_style not in BEER_STYLE_RECORD:
            continue

        # The metadata feature vector is fixed for the whole review.
        beer_style_one_hot = [0] * len(BEER_STYLE_ARRAY)
        beer_style_one_hot[BEER_STYLE_RECORD[beer_style]] = 1
        meta_data_feature_vector_one_hot = [review_overall] + beer_style_one_hot

        # Input: current character followed by the metadata vector.
        batch_concat_one_hot_list.append(
            [c_one_hot + meta_data_feature_vector_one_hot
             for c_one_hot in review_one_hot_list]
        )
        # Label: the next character, i.e. the sequence shifted left by one.
        batch_label_one_hot_list.append(review_one_hot_list[1:] + [eos_one_hot])

    return (
        torch.tensor(np.array(batch_concat_one_hot_list)).float(),
        torch.tensor(np.array(batch_label_one_hot_list)).float(),
    )
    

    
def train_valid_split(data, labels):
    """Split already-encoded data and labels into training and validation.

    The first 80% of the samples (rounded down) form the training set and
    the remainder the validation set. The order is preserved; nothing is
    shuffled.

    NOTE: a later definition of ``train_valid_split`` in this file, which
    takes a single DataFrame, replaces this one when the module is loaded.

    Args:
        data: sliceable sequence of samples (list, numpy array or tensor).
        labels: sliceable sequence of labels aligned with *data*.

    Returns:
        The tuple ``(X_train, y_train, X_valid, y_valid)``.
    """
    training_percentage = 0.8  # 80 / 20 split
    # Slice bounds must be integers; the product with a float is not.
    training_size = int(len(data) * training_percentage)

    X_train = data[:training_size]
    y_train = labels[:training_size]

    X_valid = data[training_size:]
    y_valid = labels[training_size:]

    return X_train, y_train, X_valid, y_valid


def train_valid_split(df):
    """Split *df* into a training part and a validation part.

    The first 90% of the rows (rounded down) are returned as the training
    part and the remaining rows as the validation part, in their original
    order.
    """
    boundary = int(len(df) * 0.9)
    return df[:boundary], df[boundary:]

    
    
def process_test_data(data):
    """Build the metadata feature vector for every row of the test data.

    Test data carries no review text, so only the metadata is encoded. Each
    row becomes ``[review/overall] + one-hot(beer/style)``, the same layout
    that ``process_train_data`` appends to every character.

    Args:
        data: pandas DataFrame with the columns ``'beer/style'`` and
            ``'review/overall'``.

    Returns:
        A float32 numpy array of shape (number of rows, 1 + number of beer
        styles). Every input row yields exactly one output row. A style that
        is not in ``BEER_STYLE_RECORD`` is encoded as an all-zero style
        vector.
    """
    feature_size = 1 + len(BEER_STYLE_ARRAY)
    features = []
    # The previous implementation called the non-existent
    # ``DataFrame.iteraterow`` and discarded the vectors it computed.
    for beer_style, review_overall in zip(data['beer/style'], data['review/overall']):
        beer_style_one_hot = [0] * len(BEER_STYLE_ARRAY)
        if beer_style in BEER_STYLE_RECORD:
            beer_style_one_hot[BEER_STYLE_RECORD[beer_style]] = 1
        features.append([review_overall] + beer_style_one_hot)
    # The reshape keeps the result two-dimensional for an empty DataFrame.
    return np.array(features, dtype=np.float32).reshape(-1, feature_size)


# this function should be called inside the train function
def pad_data(orig_data):
    """Pad every review in a batch to the length of the longest review.

    Reviews in a batch have different lengths, so shorter ones are extended
    with the one-hot encoding of ``< EOS >`` until all are equally long.

    Example input (``oh(x)`` stands for the one-hot list of ``x``)::

        [
            [oh(< SOS >), oh(a), oh(b), oh(c), oh(< EOS >)],                # review 1
            [oh(< SOS >), oh(h), oh(e), oh(l), oh(l), oh(o), oh(< EOS >)],  # review 2
            [oh(< SOS >), oh(y), oh(e), oh(< EOS >)],                       # review 3
        ]

    Args:
        orig_data: list of reviews, each a list of one-hot encoded
            characters. The inner lists are extended in place.

    Returns:
        The same *orig_data* object after padding. An empty batch is
        returned unchanged instead of raising.
    """
    if not orig_data:
        return orig_data

    max_len = max(len(element) for element in orig_data)
    eos_oh = None  # encoded once, and only if some review needs padding
    for element in orig_data:
        missing = max_len - len(element)
        if missing > 0:
            if eos_oh is None:
                eos_oh = char2oh('< EOS >')
            # All padding slots share one list object; callers must treat
            # the encoded characters as read-only.
            element.extend([eos_oh] * missing)
    return orig_data


def train(model, X_train, y_train, cfg):
    """Run one optimisation step on a single mini-batch and return its loss.

    Relies on the module-level ``loss_function`` and ``optimizer`` that the
    script creates before training starts.

    Args:
        model: network to train. It must provide ``init_hidden()`` and accept
            ``X_train`` when called.
        X_train: input batch, passed to the model unchanged.
        y_train: one-hot label tensor. The class index is taken along
            dimension 2 and the result is flattened to one target per
            output row.
        cfg: configuration dict; not read here.

    Returns:
        The loss of this batch as a tensor detached from the autograd graph,
        so callers can accumulate it without keeping the graph alive.
    """
    model.zero_grad()
    model.hidden = model.init_hidden()  # reset the hidden state

    train_outputs = model(X_train)
    # Collapse the first two dimensions so each row is one prediction.
    train_outputs = train_outputs.view(train_outputs.shape[0] * train_outputs.shape[1], -1)

    # Put the targets on whatever device the model produced its outputs on;
    # a hard-coded .cuda() fails when the script runs on the CPU.
    targets = y_train.argmax(dim=2).view(-1).to(train_outputs.device)

    traing_loss = loss_function(train_outputs, targets)
    traing_loss.backward()
    optimizer.step()  # weight update

    return traing_loss.detach()



    
def generate(model, X_test, cfg):
    """Generate one review string per row of the test data.

    TODO: given n rows in *X_test*, return a list of n strings, where each
    string is the review generated for the corresponding input row.

    Raises:
        NotImplementedError: always; generation is not implemented yet.
    """
    raise NotImplementedError()
    
    
def save_to_file(outputs, fname):
    """Write the generated reviews to a text file.

    TODO: given the list of generated review *outputs* and the output file
    name *fname*, save all reviews to that file in .txt format.

    Raises:
        NotImplementedError: always; saving is not implemented yet.
    """
    raise NotImplementedError()



def _split_into_batches(df, batch_size):
    """Split *df* into consecutive chunks of exactly *batch_size* rows.

    Trailing rows that do not fill a whole batch are dropped. Returns an
    empty list when *df* has fewer than *batch_size* rows.
    """
    usable_rows = (len(df) // batch_size) * batch_size
    return [df.iloc[start:start + batch_size] for start in range(0, usable_rows, batch_size)]


def _average_validation_loss(model, loss_function, val_data_chunks, cfg):
    """Return the mean loss of *model* over *val_data_chunks* as a float.

    Runs without gradient tracking and in evaluation mode, and restores the
    model's previous training mode afterwards. *val_data_chunks* must not be
    empty.
    """
    was_training = model.training
    model.eval()
    val_loss_total = 0.0
    try:
        with torch.no_grad():
            for val_chunk in val_data_chunks:
                X_valid, y_valid = process_train_data(val_chunk, cfg)
                # Start from a fresh hidden state, as train() does, instead
                # of the state left behind by the last training batch.
                model.hidden = model.init_hidden()
                val_outputs = model(X_valid)
                val_outputs = val_outputs.view(val_outputs.shape[0] * val_outputs.shape[1], -1)
                # Keep targets on the same device as the model outputs so
                # the CPU configuration works too.
                targets = y_valid.argmax(dim=2).view(-1).to(val_outputs.device)
                val_loss_total += loss_function(val_outputs, targets).item()
    finally:
        model.train(was_training)
    return val_loss_total / len(val_data_chunks)


if __name__ == "__main__":

    # Declare the model.
    model = baselineLSTM(cfg)  # Replace this with model = <your model name>(cfg)
    # loss_function and optimizer are module-level names read by train().
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    if cfg['cuda']:
        print("cuda availabe")
        computing_device = torch.device("cuda")
    else:
        print("cuda not availabe")
        computing_device = torch.device("cpu")
    model.to(computing_device)

    train_data_fname = "Beeradvocate_Train.csv"
    test_data_fname = "Beeradvocate_Test.csv"
    out_fname = ""

    # Load the CSV files into DataFrames.
    train_data = load_data(train_data_fname)
    test_data = load_data(test_data_fname)
    # Split the training DataFrame into training and validation parts.
    train_df, val_df = train_valid_split(train_data)

    # Encoding happens per mini-batch, so only the raw DataFrames are split
    # here. Only the first 100 validation rows are used to keep validation
    # cheap.
    train_data_chunks = _split_into_batches(train_df, cfg['batch_size'])
    val_data_chunks = _split_into_batches(val_df.head(n=100), cfg['batch_size'])

    print(len(train_data_chunks))

    # Report ten times per epoch; never less often than every batch, so the
    # modulo below cannot divide by zero on small data sets.
    record_size = max(1, len(train_data_chunks) // 10)

    # Train over each epoch.
    for epoch in range(cfg['epochs']):
        train_loss_total = 0.0
        batches_since_report = 0

        for index, chunk in enumerate(train_data_chunks):
            # Pre-process the data on-line, one mini-batch at a time.
            X_batch, y_batch = process_train_data(chunk, cfg)
            train_loss = train(model, X_batch, y_batch, cfg)
            # Accumulate a plain float so no autograd graph is kept alive.
            train_loss_total += train_loss.item()
            batches_since_report += 1

            if index != 0 and index % record_size == 0:
                train_loss_avg = train_loss_total / batches_since_report
                print("average train_loss: ", train_loss_avg)
                train_loss_total = 0.0
                batches_since_report = 0

                # Skip validation when there is no complete validation batch.
                if val_data_chunks:
                    val_loss_avg = _average_validation_loss(
                        model, loss_function, val_data_chunks, cfg
                    )
                    print("average val_loss: ", val_loss_avg)
                
    """
    # TODO: Work on the train_valid_split
    X_train, y_train, X_valid, y_valid = train_valid_split(train_data, train_labels) # Splitting the train data into train-valid data
    
    
    # TODO: Work on the process_test_data
    X_test = process_test_data(test_data) # Converting DataFrame to numpy array
    
    model = baselineLSTM(cfg) # Replace this with model = <your model name>(cfg)
    if cfg['cuda']:
        computing_device = torch.device("cuda")
    else:
        computing_device = torch.device("cpu")
    model.to(computing_device)
    
    train(model, X_train, y_train, X_valid, y_valid, cfg) # Train the model, we should perform the encoding per mini-batch basis
    outputs = generate(model, X_test, cfg) # Generate the outputs for test data
    
    # TODO: Work on the save_to_file
    save_to_file(outputs, out_fname) # Save the generated outputs to a file
    """


init from baselineLSTM
cuda availabe
22847
