# In [None]:
import torch
import torch.nn as nn
import numpy as np
import re

class Net(nn.Module):
    """Character-level 1-D convolutional text classifier.

    Six ``Conv1d`` layers (max-pooling after the 1st, 2nd and 6th) are followed
    by three ``Linear`` layers with dropout in between. ``forward`` returns
    softmax probabilities over the classes.

    Args:
        input_feature: Number of input channels (size of the one-hot alphabet).
        input_length: Number of character positions per sample.
        num_classes: Number of output classes.
        drop_prob: Dropout probability used between the linear layers.
        mode: ``'small'`` (256 conv channels, 1024 hidden units) or
            ``'large'`` (1024 conv channels, 2048 hidden units).

    Raises:
        ValueError: If ``mode`` is not supported, or ``input_length`` is too
            short to leave at least one frame after the last pooling layer.
    """

    # mode -> (conv channels, hidden units of the linear layers)
    _LAYER_WIDTHS = {'small': (256, 1024), 'large': (1024, 2048)}

    def __init__(self, input_feature, input_length, num_classes, drop_prob, mode='small'):
        super(Net, self).__init__()
        # Kept from the original implementation: building a Net reseeds
        # NumPy's global random generator.
        np.random.seed(99)

        # Fail fast: previously an unknown mode produced a model without
        # conv/fc layers that only failed later, inside forward().
        if mode not in self._LAYER_WIDTHS:
            raise ValueError(
                "unsupported mode %r; expected one of %s"
                % (mode, sorted(self._LAYER_WIDTHS))
            )
        self.mode = mode
        channels, hidden = self._LAYER_WIDTHS[mode]

        # Frames left after the conv stack: the two kernel-7 convs, four
        # kernel-3 convs and three stride-3 poolings reduce the length to
        # floor((input_length - 96) / 27).
        l6_frame_length = (input_length - 96) // 27
        if l6_frame_length < 1:
            raise ValueError(
                "input_length=%r is too short for the convolution stack "
                "(needs at least 123)" % (input_length,)
            )

        # The layer order must stay as is: it fixes the state_dict keys
        # (conv.0, conv.3, ...) that saved checkpoints rely on.
        self.conv = nn.Sequential(
            nn.Conv1d(input_feature, channels, 7, 1),
            nn.ReLU(),
            nn.MaxPool1d(3, 3),

            nn.Conv1d(channels, channels, 7, 1),
            nn.ReLU(),
            nn.MaxPool1d(3, 3),

            nn.Conv1d(channels, channels, 3, 1),
            nn.ReLU(),

            nn.Conv1d(channels, channels, 3, 1),
            nn.ReLU(),

            nn.Conv1d(channels, channels, 3, 1),
            nn.ReLU(),

            nn.Conv1d(channels, channels, 3, 1),
            nn.ReLU(),
            nn.MaxPool1d(3, 3)
        )

        self.fc = nn.Sequential(
            nn.Linear(l6_frame_length * channels, hidden),
            nn.Dropout(p=drop_prob),
            nn.Linear(hidden, hidden),
            nn.Dropout(p=drop_prob),
            nn.Linear(hidden, num_classes)
        )
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class probabilities for a batch ``x``.

        ``x`` is fed to ``Conv1d`` layers, i.e. it is expected as
        ``(batch, input_feature, input_length)``. The result has shape
        ``(batch, num_classes)`` and each row sums to 1.

        NOTE: because the output is already softmax-normalised, it must not be
        passed unchanged to a loss that applies softmax itself (such as
        ``nn.CrossEntropyLoss``).
        """
        out = self.conv(x)
        out = out.view(out.size(0), -1)  # flatten channels x frames per sample
        out = self.fc(out)
        return self.softmax(out)


# In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from nltk.corpus import wordnet
from collections import OrderedDict
import nltk
import pickle
import os
import copy
import re

# Root directory of the datasets; get_data() appends a per-dataset sub-path to it.
data_addr = "/content/drive/MyDrive/Data_CharacterConvNet"
# Fetch the WordNet corpus used through nltk.corpus.wordnet for synonym lookup
# in get_data()'s augmentation step.
nltk.download('wordnet')

# Quantisation alphabet: a character's position in this string is its one-hot
# channel index. '-' deliberately appears twice (positions 36 and 59) to keep
# every index identical to the original hand-written table; the later position
# wins, so channel 36 is never set.
_alphabet_chars = (
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    "\n"
)
alphabet_dic = {char: index for index, char in enumerate(_alphabet_chars)}
# The original table keyed channel 53 on U+02C6 (MODIFIER LETTER CIRCUMFLEX
# ACCENT) instead of the ASCII caret, so '^' in real text was never encoded.
# The old key is kept as an alias for backward compatibility.
alphabet_dic['\u02c6'] = alphabet_dic['^']

# Number of one-hot channels. This must be larger than the highest index (69,
# the newline); len(alphabet_dic) would undercount because of the duplicate '-'
# and made the newline index fall outside the encoded tensor.
alphabet_length = len(_alphabet_chars)
print(alphabet_length)
# Number of character positions encoded per sample.
char_length = 1014

# Supported CSV layouts, keyed by column count:
# (column names, text columns merged into one string, separator used to merge).
_CSV_LAYOUTS = {
    2: (['label', 'data'], ['data'], ''),
    3: (['label', 'title', 'data'], ['title', 'data'], ''),
    4: (['label', 'title', 'question', 'answer'], ['title', 'question', 'answer'], ' '),
}


def _read_split(csv_path, col_num, doLower):
    """Read one header-less CSV split into a list of ``[text, label]`` pairs."""
    names, text_cols, sep = _CSV_LAYOUTS[col_num]
    frame = pd.read_csv(csv_path, names=names)
    # str() every cell so that missing values (NaN) cannot break the join or
    # the lowercasing below.
    text = frame[text_cols[0]].map(str)
    for col in text_cols[1:]:
        text = text + sep + frame[col].map(str)
    if doLower:
        text = text.str.lower()
    # Labels are shifted down by one (presumably 1-based in the CSV files).
    labels = frame['label'] - 1
    return [[sample, label] for sample, label in zip(text.tolist(), labels.tolist())]


def _find_synonyms(word, cache):
    """Return the de-duplicated WordNet lemma names of ``word`` in lookup order."""
    if word not in cache:
        lemmas = []
        for synset in wordnet.synsets(word):
            lemmas.extend(synset.lemma_names())
        cache[word] = list(OrderedDict.fromkeys(lemmas))
    return cache[word]


def _augment_sample(text, cache):
    """Return ``text`` with some words swapped for synonyms, or None if no word has any."""
    words = text.strip().split()
    candidates = []
    for position, word in enumerate(words):
        synonyms = _find_synonyms(word, cache)
        if synonyms:
            candidates.append((position, synonyms))

    # Number of replacements to attempt (geometric, so always >= 1).
    replace_num = np.random.geometric(p=0.5)
    if not candidates:
        return None

    # `words` stays a plain list: a NumPy string array has a fixed item width
    # and would silently truncate any synonym longer than the longest word.
    for pick in np.random.choice(len(candidates), replace_num):
        position, synonyms = candidates[pick]
        rank = np.random.geometric(p=0.5)
        # Use the synonym at the sampled rank, falling back to the last one
        # when the rank runs past the list. (The original compared the list
        # length with itself and therefore always took the last synonym.)
        words[position] = synonyms[rank] if rank < len(synonyms) else synonyms[-1]
    return ' '.join(words)


def get_data(path, col_num, augment_data, doLower):
    """Load a dataset's train/test splits as lists of ``[text, label]`` pairs.

    The CSV files ``train.csv`` and ``test.csv`` under ``data_addr + path`` are
    parsed once, shuffled and cached as pickle files; later calls load the
    cache. The cache is keyed on ``path`` only, so ``col_num`` and ``doLower``
    have no effect once a cache exists.

    Args:
        path: Dataset sub-path appended to the module-level ``data_addr``
            (e.g. ``"/yahoo_answers_csv"``).
        col_num: Number of CSV columns: 2 (label, data), 3 (label, title,
            data) or 4 (label, title, question, answer).
        augment_data: If true, return the training set extended with
            synonym-replaced copies of its samples (WordNet lookup).
        doLower: If true, lowercase the text when parsing the CSV files.

    Returns:
        ``(train, test)``; each is a list of ``[text, label]`` with labels
        shifted down by one relative to the CSV.

    Raises:
        ValueError: If the CSV files must be parsed and ``col_num`` is not
            2, 3 or 4.
    """
    full_addr = data_addr + path
    # NOTE(review): there is no '/' after 'pickles', so the caches are files
    # named 'picklestrain_basic.pickle' etc. directly inside the dataset
    # folder. Kept unchanged so that existing caches are still found.
    cache_prefix = full_addr + '/pickles'
    train_cache = cache_prefix + 'train_basic.pickle'
    test_cache = cache_prefix + 'test_basic.pickle'
    augmented_cache = cache_prefix + 'train_augmented.pickle'

    # NOTE: pickle.load executes arbitrary code from the file; only load
    # cache files that were written by this function.
    if os.path.isfile(train_cache):
        with open(train_cache, 'rb') as handle:
            train_df = pickle.load(handle)
        with open(test_cache, 'rb') as handle:
            test_df = pickle.load(handle)
    else:
        if col_num not in _CSV_LAYOUTS:
            raise ValueError(
                "col_num must be one of %s, got %r" % (sorted(_CSV_LAYOUTS), col_num)
            )
        train_df = _read_split(full_addr + '/train.csv', col_num, doLower)
        test_df = _read_split(full_addr + '/test.csv', col_num, doLower)
        np.random.shuffle(train_df)
        np.random.shuffle(test_df)
        with open(train_cache, 'wb') as handle:
            pickle.dump(train_df, handle)
        with open(test_cache, 'wb') as handle:
            pickle.dump(test_df, handle)

    if not augment_data:
        return train_df, test_df

    if os.path.isfile(augmented_cache):
        with open(augmented_cache, 'rb') as handle:
            train_augmented_df = pickle.load(handle)
    else:
        # Original samples first, followed by one augmented copy of every
        # sample that contains at least one word with WordNet synonyms.
        train_augmented_df = copy.deepcopy(train_df)
        synonym_cache = {}
        for text, label in train_df:
            augmented_text = _augment_sample(text, synonym_cache)
            if augmented_text is not None:
                train_augmented_df.append([augmented_text, label])
        with open(augmented_cache, 'wb') as handle:
            pickle.dump(train_augmented_df, handle)
    return train_augmented_df, test_df


# ---- Batch encoding, weight initialisation and checkpoint helpers ----
def onehot_encode(batch, alphabet=None, num_symbols=None, max_length=None):
    """Collate ``(text, label)`` samples into a one-hot tensor and a label tensor.

    Each text is read backwards, starting at its last character, and at most
    ``max_length`` characters are encoded. Characters missing from the
    alphabet leave an all-zero column, as do unused trailing positions.

    Args:
        batch: Sequence of samples; ``sample[0]`` is the text and
            ``sample[1]`` the integer label.
        alphabet: Mapping from character to channel index. Defaults to the
            module-level ``alphabet_dic``.
        num_symbols: Number of channels. Defaults to the module-level
            ``alphabet_length``.
        max_length: Number of encoded positions. Defaults to the module-level
            ``char_length``.

    Returns:
        ``(features, labels)``: a float tensor of shape
        ``(len(batch), num_symbols, max_length)`` and a ``LongTensor`` of
        shape ``(len(batch),)``.
    """
    if alphabet is None:
        alphabet = alphabet_dic
    if num_symbols is None:
        num_symbols = alphabet_length
    if max_length is None:
        max_length = char_length

    out = torch.zeros(len(batch), num_symbols, max_length)
    out_label = []
    for row, item in enumerate(batch):
        out_label.append(item[1])
        # Reverse first, then truncate: this keeps the LAST max_length
        # characters of the text (the original hard-coded this as -1015).
        for position, char in enumerate(item[0][::-1][:max_length]):
            channel = alphabet.get(char)
            # Skip unknown characters, and indices that do not fit the tensor
            # (which previously raised an uncaught IndexError).
            if channel is None or not 0 <= channel < num_symbols:
                continue
            out[row, channel, position] = 1
    return out, torch.LongTensor(out_label)

def weights_init(m):
    """Draw the weights of ``Conv1d`` / ``Linear`` modules from N(0, 0.05**2).

    Intended for ``Module.apply``: every other module type is left untouched,
    and biases keep whatever values they already have.

    Args:
        m: A module visited by ``apply``.
    """
    # isinstance (rather than an exact type comparison) also covers
    # subclasses of the two layer types.
    if isinstance(m, (nn.Conv1d, nn.Linear)):
        nn.init.normal_(m.weight, mean=0, std=0.05)

def save_checkpoint(epoch, model, opt, path):
    """Save the epoch number and the model/optimizer state dicts to ``path``.

    The stored dictionary has the keys ``'Epoch'``, ``'State_dict'`` and
    ``'optimizer'``.

    Args:
        epoch: Epoch counter to store.
        model: Module whose ``state_dict()`` is saved.
        opt: Optimizer whose ``state_dict()`` is saved.
        path: Destination file path (or a file-like object accepted by
            ``torch.save``).
    """
    state = {
        'Epoch': epoch,
        'State_dict': model.state_dict(),
        'optimizer': opt.state_dict()
    }
    # Create the target directory when it is missing, so that a finished
    # training run is not lost to a FileNotFoundError at save time.
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)
    torch.save(state, path)

# Dataset layouts (the col_num argument of get_data):
#   ag, amazon, dbpedia, sogou -> 3 cols: label, title, data
#   yahoo                      -> 4 cols: label, title, question, answer
#   yelp                       -> 2 cols: label, data
# The model below is configured for 10 output classes.

# Seed PyTorch before the model is built: seeding afterwards (as was done
# originally) leaves the random weight initialisation unreproducible.
torch.manual_seed(99)

net = Net(alphabet_length, char_length, 10, 0.5, 'small')
net.apply(weights_init)
dev = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('current device: ', dev)
net.to(dev)

# Training hyper-parameters.
num_epochs = 21
batch_size = 128
loss_func = nn.CrossEntropyLoss()
lr = 1e-2
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9)

# Running statistics, reset and updated by the training script.
running_loss = 0
correct = 0
total = 0

def _log_probs(probs):
    """Turn the softmax probabilities returned by the network into log-probabilities."""
    # Clamp so that a probability of exactly 0 cannot produce -inf.
    return torch.log(probs.clamp_min(1e-12))


def _evaluate(model, loader, criterion, device):
    """Print per-batch and overall test accuracy; return ``(correct, total)``."""
    model.eval()
    n_correct = 0
    n_total = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            out = model(x)
            loss = criterion(_log_probs(out), y)
            print("test loss: ", loss.item())
            batch_correct = (out.argmax(dim=1) == y).sum().item()
            n_correct += batch_correct
            n_total += len(y)
            print('Accuracy for test_batch: %.3f %%' % (100.0 * batch_correct / len(y)))
            print('--------------------------------')
    if n_total:
        print('Accuracy for test: %.3f %%' % (100.0 * n_correct / n_total))
    else:
        # Avoid a ZeroDivisionError on an empty test set.
        print('Accuracy for test: n/a (no test samples)')
    print('--------------------------------')
    return n_correct, n_total


if __name__ == '__main__':
    train_data, test_data = get_data("/yahoo_answers_csv", 4, False, True)

    # The training set is split in two halves that are used on alternating epochs.
    half = len(train_data) // 2
    trainloader_list = [
        torch.utils.data.DataLoader(part, batch_size=batch_size, shuffle=True, collate_fn=onehot_encode)
        for part in (train_data[:half], train_data[half:])
    ]
    # Evaluation does not depend on sample order, so the test set is not shuffled.
    testloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=onehot_encode)

    for epoch in range(num_epochs):
        if epoch != 0 and epoch % 3 == 0:
            # Halve the learning rate every 3 epochs. The existing optimizer
            # is updated in place: building a new one here would throw away
            # its momentum buffers.
            lr *= 0.5
            for group in optimizer.param_groups:
                group['lr'] = lr

        running_loss = 0
        net.train()
        trainloader = trainloader_list[epoch % 2]

        for idx, (x, y) in enumerate(trainloader):
            x, y = x.to(dev), y.to(dev)
            optimizer.zero_grad()
            out = net(x)
            # Net.forward already ends in a softmax, while nn.CrossEntropyLoss
            # applies (log-)softmax itself. Passing log-probabilities makes
            # the loss the true cross-entropy of the network's output instead
            # of a softmax-of-softmax.
            loss = loss_func(_log_probs(out), y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if idx % 50 == 49:
                # Accuracy of the current batch only; divide by the real batch
                # length, which can be smaller than batch_size.
                correct = (out.argmax(dim=1) == y).sum().item()
                total = len(y)
                print('Training Accuracy: %d ' % (100.0 * correct / total))
                print('%d/%d' % (correct, total))
                print('RunningLoss %5d: %.3f' % (idx + 1, running_loss))
                running_loss = 0

        if epoch in (10, 20, 31):
            print("epoch: ", epoch)
            correct, total = _evaluate(net, testloader, loss_func, dev)

    # Final evaluation after the last epoch, then persist the trained model.
    correct, total = _evaluate(net, testloader, loss_func, dev)
    save_checkpoint(num_epochs, net, optimizer, data_addr+'/model'+ '/yahoo_answers_basic_21epoch')
