In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import nltk

class W2V(nn.Module):
    """Two-layer feed-forward network ending in a softmax over ``word_size`` units.

    ``fc_one`` projects the input to ``projection_size`` features and
    ``fc_two`` maps that projection back to ``word_size`` scores.
    ``get_vector`` exposes the raw ``fc_one`` projection.
    """

    def __init__(self, word_size, projection_size, drop_prob):
        """Build the layers.

        Args:
            word_size: Width of the input and of the output layer.
            projection_size: Width of the hidden projection.
            drop_prob: Accepted for signature compatibility only; no dropout
                layer is created, so the value is ignored.
        """
        super().__init__()
        # Side effect: NumPy's global RNG is re-seeded on every construction.
        np.random.seed(99)
        self.fc_one = nn.Linear(in_features=word_size, out_features=projection_size)
        self.relu = nn.ReLU()
        self.fc_two = nn.Linear(in_features=projection_size, out_features=word_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return softmax probabilities (normalised over dim 1) for ``x``."""
        hidden = self.relu(self.fc_one(x))
        return self.softmax(self.fc_two(hidden))

    def get_vector(self, x):
        """Return the ``fc_one`` projection of ``x`` (no activation applied)."""
        return self.fc_one(x)

class Char2WV(nn.Module):
    """Two-layer feed-forward network whose output has ``projection_size`` units.

    ``fc_one`` maps a ``word_size``-wide input to ``projection_size`` features,
    ``fc_two`` keeps that width, and the result is passed through a softmax
    over dim 1. ``get_vector`` exposes the raw ``fc_one`` projection.
    """

    def __init__(self, word_size, vocab_size, projection_size, drop_prob):
        """Build the layers.

        Args:
            word_size: Width of the input vectors.
            vocab_size: Accepted for signature compatibility; not used.
            projection_size: Width of the hidden and output layers.
            drop_prob: Accepted for signature compatibility; no dropout layer
                is created, so the value is ignored.
        """
        super().__init__()
        # Side effect: NumPy's global RNG is re-seeded on every construction.
        np.random.seed(99)
        self.fc_one = nn.Linear(in_features=word_size, out_features=projection_size)
        self.relu = nn.ReLU()
        self.fc_two = nn.Linear(in_features=projection_size, out_features=projection_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the softmax (over dim 1) of the two-layer projection of ``x``."""
        hidden = self.relu(self.fc_one(x))
        return self.softmax(self.fc_two(hidden))

    def get_vector(self, x):
        """Return the ``fc_one`` projection of ``x`` (no activation applied)."""
        return self.fc_one(x)

class Net(nn.Module):
    """Sentence classifier: embedding lookup, parallel convolutions, max-over-time, linear.

    Supported ``mode`` values:

    * ``'rand'``: a frozen table drawn from uniform(-1, 1) with the shape of
      ``lookup_table``.
    * ``'static'``: ``lookup_table`` used as a frozen embedding.
    * ``'non-static'``: ``lookup_table`` used as a trainable embedding.
    * ``'multi-channel'``: ``lookup_table`` and ``lookup_char`` are both
      frozen and stacked as two input channels.
    """

    def __init__(self, lookup_table, lookup_char,kernel_size, num_filt, num_classes, drop_prob, mode='static'):
        """Build the network.

        Args:
            lookup_table: 2-D float32 NumPy array of word vectors.
            lookup_char: 2-D float32 NumPy array used as the second channel in
                ``'multi-channel'`` mode (ignored otherwise). It is indexed
                with the same token ids, so it presumably needs the same shape
                as ``lookup_table`` - verify against the caller.
            kernel_size: Sequence of convolution heights (in tokens).
            num_filt: Number of filters per convolution.
            num_classes: Width of the output layer.
            drop_prob: Dropout probability applied before the output layer.
            mode: One of the modes listed in the class docstring.

        Raises:
            ValueError: If ``mode`` is not a supported value.
        """
        super(Net, self).__init__()
        # Side effect: NumPy's global RNG is re-seeded on every construction.
        np.random.seed(99)
        self.mode = mode
        if mode == 'rand':
            ch_num = 1
            new_lookup_table = np.random.uniform(-1,1,lookup_table.shape).astype(np.float32)
            self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(new_lookup_table), freeze=True)
        elif mode == 'static':
            ch_num = 1
            self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(lookup_table), freeze=True)
        elif mode == 'non-static':
            ch_num = 1
            self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(lookup_table), freeze=False)
        elif mode == 'multi-channel':
            ch_num = 2
            self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(lookup_table), freeze=True)
            self.embedding_mult = nn.Embedding.from_pretrained(torch.from_numpy(lookup_char), freeze=True)
        else:
            # Previously an unknown mode surfaced later as a NameError on ch_num.
            raise ValueError(
                "mode must be 'rand', 'static', 'non-static' or 'multi-channel', got %r" % (mode,)
            )
        # Each filter spans the full embedding width; this used to be
        # hard-coded to 100, which only worked for 100-dimensional tables.
        embed_dim = self.embedding.embedding_dim
        self.convs = nn.ModuleList([nn.Conv2d(ch_num,num_filt,(size,embed_dim),1) for size in kernel_size])
        # Seed torch after the layers are created so the explicit
        # initialisation below is reproducible.
        torch.manual_seed(99)
        for conv in self.convs:
            torch.nn.init.kaiming_uniform_(conv.weight)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)
        self.fc = nn.Linear(len(kernel_size)*num_filt, num_classes)
        torch.nn.init.uniform_(self.fc.weight)
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, x):
        """Return class probabilities for a batch of token-index sequences.

        Args:
            x: Integer index tensor; assumed to be (batch, seq_len) with
                seq_len at least ``max(kernel_size)`` - TODO confirm with caller.

        Returns:
            Tensor of softmax probabilities with ``num_classes`` columns.
        """
        # Add a channel axis so the embeddings can be fed to Conv2d.
        embed_out = self.embedding(x).unsqueeze(1)
        if self.mode == 'multi-channel':
            embed_mult = self.embedding_mult(x).unsqueeze(1)
            embed_out = torch.cat((embed_out, embed_mult), 1)
        # Max over the sequence axis (dim 2), then drop the width-1 last axis.
        conv_out = [torch.max(self.relu(conv(embed_out)), dim=2)[0].squeeze(-1) for conv in self.convs]
        flatten = torch.cat(conv_out, dim=1)
        dropouts = self.dropout(flatten)
        out = self.fc(dropouts)
        return self.softmax(out)

class Net_Deep(nn.Module):
    """Variant of the sentence CNN with a per-token convolution in front.

    ``forward`` applies ``wordchar_conv`` (height 1, full embedding width) to
    the embedded sentence, then the ``wordchar_conv2`` convolutions, a
    max over the sequence axis, dropout and a linear layer.

    ``convs``, ``convs2``, ``convs3``, ``convs4``, ``maxpool`` and ``fc2`` are
    created and initialised but are not used by ``forward``; they are kept so
    the set of registered parameters stays the same.

    Supported ``mode`` values: ``'rand'``, ``'static'``, ``'non-static'`` and
    ``'multi-channel'`` (two frozen tables stacked as input channels).
    """

    def __init__(self, lookup_table, lookup_char,kernel_size, num_filt, num_classes, drop_prob, mode='static'):
        """Build the network.

        Args:
            lookup_table: 2-D float32 NumPy array of word vectors.
            lookup_char: 2-D float32 NumPy array used as the second channel in
                ``'multi-channel'`` mode (ignored otherwise).
            kernel_size: Sequence of convolution heights (in tokens).
            num_filt: Number of filters per convolution.
            num_classes: Width of the output layer.
            drop_prob: Dropout probability applied before the output layer.
            mode: Embedding mode, see the class docstring.

        Raises:
            ValueError: If ``mode`` is not a supported value.
        """
        super(Net_Deep, self).__init__()
        # Side effect: NumPy's global RNG is re-seeded on every construction.
        np.random.seed(99)
        self.mode = mode
        if mode == 'rand':
            ch_num = 1
            new_lookup_table = np.random.uniform(-1,1,lookup_table.shape).astype(np.float32)
            self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(new_lookup_table), freeze=True)
        elif mode == 'static':
            ch_num = 1
            self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(lookup_table), freeze=True)
        elif mode == 'non-static':
            ch_num = 1
            self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(lookup_table), freeze=False)
        elif mode == 'multi-channel':
            ch_num = 2
            self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(lookup_table), freeze=True)
            self.embedding_mult = nn.Embedding.from_pretrained(torch.from_numpy(lookup_char), freeze=True)
        else:
            raise ValueError(
                "mode must be 'rand', 'static', 'non-static' or 'multi-channel', got %r" % (mode,)
            )
        # Filters that span the embedding axis use its real width instead of
        # the previously hard-coded 100.
        embed_dim = self.embedding.embedding_dim
        self.convs = nn.ModuleList([nn.Conv2d(ch_num,num_filt,(size,1),1) for size in kernel_size])
        self.maxpool = nn.MaxPool2d((2,1))
        self.convs2 = nn.ModuleList([nn.Conv2d(num_filt,num_filt,(size,1),1) for size in kernel_size])
        self.convs3 = nn.ModuleList([nn.Conv2d(num_filt,num_filt,(size,1),1) for size in kernel_size])
        self.convs4 = nn.ModuleList([nn.Conv2d(num_filt,num_filt,(size,embed_dim),1) for size in kernel_size])
        # forward() always needs these two layers. They used to be created
        # only for 'static' and 'multi-channel', so the other modes failed
        # with AttributeError on the first forward pass.
        self.wordchar_conv = nn.Conv2d(ch_num, num_filt, (1,embed_dim), 1)
        self.wordchar_conv2 = nn.ModuleList([nn.Conv2d(num_filt,num_filt,(size,1),1) for size in kernel_size])
        torch.manual_seed(99)
        # Initialise layer by layer within each kernel size; the interleaved
        # order determines which random draws each weight receives.
        for conv, conv2, conv3, conv4 in zip(self.convs, self.convs2, self.convs3, self.convs4):
            torch.nn.init.kaiming_uniform_(conv.weight)
            torch.nn.init.kaiming_uniform_(conv2.weight)
            torch.nn.init.kaiming_uniform_(conv3.weight)
            torch.nn.init.kaiming_uniform_(conv4.weight)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)
        self.fc = nn.Linear(len(kernel_size)*num_filt, num_classes)
        torch.nn.init.uniform_(self.fc.weight)
        self.fc2 = nn.Linear(len(kernel_size)*num_filt, num_classes)
        torch.nn.init.uniform_(self.fc2.weight)
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, x):
        """Return class probabilities for a batch of token-index sequences.

        Args:
            x: Integer index tensor; assumed to be (batch, seq_len) with
                seq_len at least ``max(kernel_size)`` - TODO confirm with caller.

        Returns:
            Tensor of softmax probabilities with ``num_classes`` columns.
        """
        # Add a channel axis so the embeddings can be fed to Conv2d.
        embed_out = self.embedding(x).unsqueeze(1)
        if self.mode == 'multi-channel':
            embed_mult = self.embedding_mult(x).unsqueeze(1)
            embed_out = torch.cat((embed_out, embed_mult), 1)
        # Collapse the embedding axis token by token (no activation here).
        out = self.wordchar_conv(embed_out)
        # Max over the sequence axis (dim 2), then drop the width-1 last axis.
        conv_out = [torch.max(self.relu(conv(out)), dim=2)[0].squeeze(-1) for conv in self.wordchar_conv2]
        flatten = torch.cat(conv_out, dim=1)
        dropouts = self.dropout(flatten)
        out = self.fc(dropouts)
        return self.softmax(out)

class Net_1D(nn.Module):
    """Six-layer 1-D convolutional classifier over embedded token sequences.

    Embedding features are treated as Conv1d input channels. In
    ``'multi-channel'`` mode the vectors from ``lookup_table`` and
    ``lookup_char`` are concatenated along the feature axis, which is why the
    first convolution takes ``input_feature * 2`` channels in that mode.
    """

    def __init__(self, lookup_table, lookup_char, input_feature, input_length, num_classes, drop_prob, embed='multi-channel'):
        """Build the network.

        Args:
            lookup_table: 2-D float32 NumPy array of word vectors; its width
                must equal ``input_feature``.
            lookup_char: 2-D float32 NumPy array used as the second embedding
                in ``'multi-channel'`` mode (ignored otherwise).
            input_feature: Embedding width (channels per embedding table).
            input_length: Fixed sequence length fed to ``forward``.
            num_classes: Width of the output layer.
            drop_prob: Dropout probability in the fully connected head.
            embed: ``'static'`` or ``'multi-channel'``.

        Raises:
            ValueError: If ``embed`` is unsupported, or ``input_length`` is too
                short for the convolution stack to produce any output frame.
        """
        # Was super(Net, self), which raises TypeError because Net_1D does not
        # derive from Net.
        super(Net_1D, self).__init__()
        # Side effect: NumPy's global RNG is re-seeded on every construction.
        np.random.seed(99)
        if embed not in ('static', 'multi-channel'):
            raise ValueError("embed must be 'static' or 'multi-channel', got %r" % (embed,))
        self.embed = embed

        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(lookup_table), freeze=True)
        if embed == 'multi-channel':
            self.embedding_mult = nn.Embedding.from_pretrained(torch.from_numpy(lookup_char), freeze=True)
            in_channels = input_feature * 2
        else:
            in_channels = input_feature

        # Number of frames left after the stack below: two kernel-7 convs,
        # four kernel-3 convs and three stride-3 poolings give (L - 96) / 27.
        l6_frame_length = int((input_length - 96)/27)
        if l6_frame_length < 1:
            raise ValueError(
                "input_length=%r is too short; the convolution stack needs at least 123 positions"
                % (input_length,)
            )
        self.conv = nn.Sequential(
            nn.Conv1d(in_channels, 256, 7, 1),
            nn.ReLU(),
            nn.MaxPool1d(3, 3),

            nn.Conv1d(256, 256, 7, 1),
            nn.ReLU(),
            nn.MaxPool1d(3, 3),

            nn.Conv1d(256, 256, 3, 1),
            nn.ReLU(),

            nn.Conv1d(256, 256, 3, 1),
            nn.ReLU(),

            nn.Conv1d(256, 256, 3, 1),
            nn.ReLU(),

            nn.Conv1d(256, 256, 3, 1),
            nn.ReLU(),
            nn.MaxPool1d(3, 3)
        )
        # The head used to be created only in the multi-channel branch, so the
        # static variant had no self.fc.
        self.fc = nn.Sequential(
            nn.Linear(l6_frame_length * 256, 1024),
            nn.Dropout(p=drop_prob),
            nn.Linear(1024, 1024),
            nn.Dropout(p=drop_prob),
            nn.Linear(1024, num_classes)
        )
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, x):
        """Return class probabilities for a batch of token-index sequences.

        Args:
            x: Integer index tensor of shape (batch, input_length).

        Returns:
            Tensor of softmax probabilities with ``num_classes`` columns.
        """
        embed_out = self.embedding(x)
        # The original checked self.mode, an attribute this class never sets.
        if self.embed == 'multi-channel':
            embed_out = torch.cat((embed_out, self.embedding_mult(x)), 2)
        # Conv1d expects (batch, channels, length): move features to axis 1.
        # The original passed the raw index tensor x to the conv stack.
        out = self.conv(embed_out.permute(0, 2, 1))
        out = out.reshape(len(x), -1)
        out = self.fc(out)
        return self.softmax(out)

In [None]:
!pip install gensim

In [None]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
from gensim.models import Word2Vec
import nltk
from nltk.cluster import KMeansClusterer
from sklearn.cluster import KMeans
import pickle
import re
import matplotlib.pyplot as plt
import os
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import time

# Character -> index table (indices 0-68): lowercase letters, digits, then
# punctuation and newline. The character-level encoders below use it to build
# multi-hot vectors of length len(alphabet_dic); characters that are not keys
# are skipped by those encoders.
# NOTE(review): the key mapped to 53 looks like 'ˆ' (U+02C6) rather than the
# ASCII caret '^' - confirm which one was intended.
alphabet_dic = {'a':0, 'b':1, 'c':2, 'd':3, 'e':4, 'f':5, 'g':6, 'h':7, 'i':8, 'j':9, 'k':10, 'l':11, 'm':12, 'n':13, 'o':14, 'p':15, 'q':16, 'r':17, 's':18, 't':19, 'u':20, 'v':21, 'w':22, 'x':23, 'y':24, 'z':25,
 '0':26, '1':27, '2':28, '3':29, '4':30, '5':31, '6':32, '7':33, '8':34, '9':35,
 "-":36, ',':37, ';':38, '.':39, '!':40, '?':41, ':':42, '\'':43, '"':44, '/':45, '\\':46, '|':47, '_':48, '@':49, '#':50, '$':51, '%':52, 'ˆ':53, '&':54, '*':55, '~':56, '`':57, '+':58,'=':59, '<':60, '>':61, '(':62, ')':63, '[':64, ']':65, '{':66, '}':67, '\n': 68}
# Root directory of the datasets on the mounted Drive.
# NOTE(review): the loaders visible in this part of the file hard-code their
# paths instead of using this constant.
data_addr = "/content/drive/MyDrive/Data"
def strCleanup(string):
    """Normalise a raw sentence into lowercase, space-separated tokens.

    Characters outside ``A-Za-z0-9(),!?'`` and the backtick become spaces,
    common English contractions are split off as their own tokens, and
    ``,`` / ``!`` are surrounded by spaces. Parentheses and ``?`` are replaced
    by a backslash-escaped form (``\\(``, ``\\)``, ``\\?``) surrounded by
    spaces, because ``re.sub`` leaves those unknown escapes in the replacement
    untouched. Runs of whitespace are collapsed and the result is stripped
    and lower-cased.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    rewrites = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        (r"\s{2,}", " "),
    )
    cleaned = string
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip().lower()

def makeIdx(data, w2i):
    """Replace every word in ``data`` with its index from ``w2i``, in place.

    Args:
        data: Sequence of mutable token sequences; modified in place.
        w2i: Mapping from word to integer index.

    Raises:
        KeyError: If a word is missing from ``w2i``.
    """
    for sentence in data:
        for position, token in enumerate(sentence):
            sentence[position] = w2i[token]

def batch_onehot(line):
    """Build (one-hot centre, context index) training pairs with a window of 2.

    Reads the module-level ``embedding`` (word -> index) and ``vocablen``.

    For every position in ``line`` a one-hot row vector of shape
    ``(1, vocablen)`` is created for the centre word and paired with the index
    of each neighbour at offsets -2, -1, +1, +2 that falls inside the
    sentence. Pairs for the same centre share one vector object.

    The original middle loop enumerated ``line[2:-2]`` but indexed ``line``
    with the un-shifted counter, so it emitted pairs for the wrong centre
    words and wrapped around to the end of the sentence. Sentences shorter
    than four words raised IndexError; they are now handled by the same
    bounds check.

    Args:
        line: Sequence of words, all present in ``embedding``.

    Returns:
        List of ``(one_hot, context_index)`` tuples, ordered by centre
        position and then by offset.
    """
    inoutvector_list = list()
    word_ids = [embedding[word] for word in line]
    length = len(word_ids)
    for center, center_id in enumerate(word_ids):
        tmp_vector = np.zeros((1,vocablen))
        tmp_vector[0][center_id] = 1
        for offset in (-2, -1, 1, 2):
            neighbour = center + offset
            # Skip context positions that fall off either end of the sentence.
            if 0 <= neighbour < length:
                inoutvector_list.append((tmp_vector, word_ids[neighbour]))
    return inoutvector_list

def batch_onehot_window1(line):
    """Build one-hot centre vectors and context indices with a window of 1.

    Reads the module-level ``embedding`` (word -> index) and ``vocablen``.

    For every position in ``line`` a one-hot vector of length ``vocablen`` is
    created for the centre word; it is emitted once per existing neighbour
    (previous word, then next word) together with that neighbour's index.

    The original middle loop enumerated ``line[1:-1]`` but indexed ``line``
    with the un-shifted counter, so centres were off by one and the first
    context wrapped around to the last word. Single-word and empty sentences
    raised IndexError; they now yield no pairs.

    Args:
        line: Sequence of words, all present in ``embedding``.

    Returns:
        Tuple ``(invector_list, outvector_list)`` of equal length: one-hot
        centre vectors and the matching context word indices.
    """
    invector_list = list()
    outvector_list = list()
    word_ids = [embedding[word] for word in line]
    length = len(word_ids)
    for center, center_id in enumerate(word_ids):
        tmp_vector = np.zeros(vocablen)
        tmp_vector[center_id] = 1
        for neighbour in (center - 1, center + 1):
            # Skip context positions that fall off either end of the sentence.
            if 0 <= neighbour < length:
                invector_list.append(tmp_vector)
                outvector_list.append(word_ids[neighbour])
    return invector_list, outvector_list

#char_to_char

def batch_onehot_window1_char(line):
    """Build character-bag (input, context) vector pairs with a window of 1.

    Reads the module-level ``alphabet_dic`` (character -> index). Each word is
    encoded as a vector of length ``len(alphabet_dic)`` with a 1 for every
    character of the word that is a key of ``alphabet_dic``; other characters
    are ignored.

    For every position the word's vector is paired with the vector of the
    previous word and then the next word, where those exist.

    The original middle loop enumerated ``line[1:-1]`` but indexed with the
    un-shifted counter, so centres were off by one and the first context
    wrapped around to the last word. Sentences with fewer than two words
    raised IndexError; they now yield no pairs.

    Args:
        line: Sequence of words (strings).

    Returns:
        Tuple ``(invector_list, outvector_list)`` of equal length.
    """
    char_len = len(alphabet_dic)
    vec_line = list()
    for word in line:
        tmp_vector = np.zeros(char_len)
        for alphabet in word:
            char_index = alphabet_dic.get(alphabet)
            if char_index is not None:
                tmp_vector[char_index] = 1
        vec_line.append(tmp_vector)

    invector_list = list()
    outvector_list = list()
    length = len(vec_line)
    for center, center_vec in enumerate(vec_line):
        for neighbour in (center - 1, center + 1):
            # Skip context positions that fall off either end of the sentence.
            if 0 <= neighbour < length:
                invector_list.append(center_vec)
                outvector_list.append(vec_line[neighbour])

    return invector_list, outvector_list

def batch_onehot_window1_chartoword(line):
    """Pair character-bag inputs with projected context-word vectors (window 1).

    Reads the module-level ``alphabet_dic``, ``embedding``, ``vocablen`` and
    ``w2v``. Each word is encoded as a vector of length ``len(alphabet_dic)``
    with a 1 for every character that is a key of ``alphabet_dic``. The
    target for a context word is ``w2v.get_vector`` applied to that word's
    one-hot vector (length ``vocablen``), converted to a float32 NumPy array.

    For every position the word's character vector is paired with the target
    of the previous word and then the next word, where those exist.

    The original middle loop enumerated ``line[1:-1]`` but indexed with the
    un-shifted counter, so centres were off by one and the first context
    wrapped around to the last word. Sentences with fewer than two words
    raised IndexError; they now yield no pairs.

    Args:
        line: Sequence of words, all present in ``embedding``.

    Returns:
        Tuple ``(invector_list, outvector_list)`` of equal length.
    """
    def _target_vector(word):
        # One-hot encode the context word and project it with the word model.
        tmp_vector = np.zeros(vocablen)
        tmp_vector[embedding[word]] = 1
        out_vec = w2v.get_vector(torch.FloatTensor(tmp_vector))
        return np.asarray(out_vec.detach().numpy(), dtype='float32')

    char_len = len(alphabet_dic)
    vec_line = list()
    for word in line:
        tmp_vector = np.zeros(char_len)
        for alphabet in word:
            char_index = alphabet_dic.get(alphabet)
            if char_index is not None:
                tmp_vector[char_index] = 1
        vec_line.append(tmp_vector)

    invector_list = list()
    outvector_list = list()
    length = len(vec_line)
    for center, center_vec in enumerate(vec_line):
        for neighbour in (center - 1, center + 1):
            # Skip context positions that fall off either end of the sentence.
            if 0 <= neighbour < length:
                invector_list.append(center_vec)
                outvector_list.append(_target_vector(line[neighbour]))

    return invector_list, outvector_list

def batchPad(batch):
    """Collate ``(token_ids, label)`` samples into padded tensors.

    Every sequence is right-padded with the value 1 up to the longest
    sequence in the batch, with a minimum width of 5.

    Args:
        batch: Non-empty sequence of samples where ``sample[0]`` is the token
            index sequence and ``sample[1]`` the label.

    Returns:
        Tuple ``(tokens, labels)`` of ``torch.LongTensor``; ``tokens`` has one
        row per sample.
    """
    width = max(max([len(sample[0]) for sample in batch]), 5)
    padded_rows = [
        # Over-pad by `width`, then cut back so every row is exactly `width` long.
        np.pad(np.array(sample[0][:]), (0, width), 'constant', constant_values=1)[:width]
        for sample in batch
    ]
    labels = [sample[1] for sample in batch]
    token_matrix = np.concatenate(padded_rows, axis=0).reshape(-1, width)
    return torch.LongTensor(token_matrix), torch.LongTensor(labels)

def batchPad_1d(batch):
    """Collate ``(token_ids, label)`` samples into fixed-width tensors.

    Every sequence is right-padded with the value 1, or truncated, to exactly
    20 positions.

    Args:
        batch: Sequence of samples where ``sample[0]`` is the token index
            sequence and ``sample[1]`` the label.

    Returns:
        Tuple ``(tokens, labels)`` of ``torch.LongTensor``; ``tokens`` has 20
        columns.
    """
    width = 20
    padded_rows = [
        # Over-pad by `width`, then cut back so every row is exactly `width` long.
        np.pad(np.array(sample[0][:]), (0, width), 'constant', constant_values=1)[:width]
        for sample in batch
    ]
    labels = [sample[1] for sample in batch]
    token_matrix = np.concatenate(padded_rows, axis=0).reshape(-1, width)
    return torch.LongTensor(token_matrix), torch.LongTensor(labels)

def weights_init(m):
    """Re-initialise the weight of an ``nn.Linear`` with Kaiming-uniform values.

    Modules whose type is not exactly ``nn.Linear`` (including subclasses) are
    left untouched. The bias is not modified.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.kaiming_uniform_(m.weight)

def save_checkpoint(epoch, model, opt, path):
    """Serialise the epoch, model weights and optimizer state with ``torch.save``.

    The saved object is a dict with the keys ``'Epoch'``, ``'State_dict'``
    and ``'optimizer'``.

    Args:
        epoch: Value stored under ``'Epoch'``.
        model: Object providing ``state_dict()``.
        opt: Optimizer providing ``state_dict()``.
        path: Destination passed straight to ``torch.save``.
    """
    torch.save(
        {
            'Epoch': epoch,
            'State_dict': model.state_dict(),
            'optimizer': opt.state_dict(),
        },
        path,
    )

def MR_Preprocess(embed_kinds = 0):
    """Load the MR polarity corpus and build a word -> index vocabulary.

    Both files are read with latin-1 encoding, cleaned with ``strCleanup``
    and split on whitespace. Positive sentences get label 0 and negative
    sentences label 1. Vocabulary indices are assigned in order of first
    appearance, starting at 0.

    The files are now opened in a ``with`` block so the handles are closed
    even if reading fails.

    Args:
        embed_kinds: Unused; kept for signature compatibility.

    Returns:
        Tuple ``(tokenized, label, embedding_dic)``: list of token lists, list
        of integer labels, and the word -> index dict.
    """
    with open("/content/drive/MyDrive/Data/rt-polaritydata/rt-polarity.pos", 'r', encoding='latin-1') as f_pos, \
            open("/content/drive/MyDrive/Data/rt-polaritydata/rt-polarity.neg", 'r', encoding='latin-1') as f_neg:
        lines_pos = f_pos.readlines()
        lines_neg = f_neg.readlines()
    tokenized_pos = [strCleanup(line).split() for line in lines_pos]
    tokenized_neg = [strCleanup(line).split() for line in lines_neg]
    tokenized = tokenized_pos+tokenized_neg
    label = [0] * len(tokenized_pos) + [1] * len(tokenized_neg)

    embedding_dic = {}
    for line in tokenized:
        for word in line:
            # First occurrence wins; the next free index is the current size.
            if word not in embedding_dic:
                embedding_dic[word] = len(embedding_dic)
    return tokenized, label, embedding_dic


def MR_Preprocess_CNN(embedder, embed_kinds = 0):
    """Load the MR corpus as index sequences plus an embedding matrix.

    Reads the module-level ``embedding`` (word -> index), ``vocablen`` and
    ``projectionlen``.

    The word -> row dictionary and the embedding matrix are cached as two
    pickle files next to the corpus. When the cache is missing they are
    rebuilt (only supported for ``embed_kinds == 2``): every word of
    ``embedding`` is one-hot encoded, projected with ``embedder.get_vector``
    and L2-normalised. Rows 0 and 1 of the matrix are random unit vectors that
    are not assigned to any word; words start at row 2.

    Previously a missing cache with ``embed_kinds != 2`` wrote an empty
    dictionary pickle and then failed with NameError, leaving a corrupt cache
    behind. That case now raises before anything is written, and the cache is
    rebuilt when either of the two files is missing.

    Args:
        embedder: Object whose ``get_vector`` maps a one-hot float tensor of
            length ``vocablen`` to a ``projectionlen``-long tensor.
        embed_kinds: Must be 2 when the cache has to be built.

    Returns:
        Tuple ``(tokenized_df, tokenized_set, embedding_arr)``: list of
        ``[index_list, label]`` samples (label 0 = positive, 1 = negative),
        list of distinct words, and the float32 embedding matrix.

    Raises:
        ValueError: If the cache is missing and ``embed_kinds`` is not 2.
        KeyError: If a corpus word is missing from the dictionary.
    """
    dic_path = '/content/drive/MyDrive/Data/rt-polaritydata/MR_dic_self.pickle'
    arr_path = '/content/drive/MyDrive/Data/rt-polaritydata/MR_arr_self.pickle'

    with open("/content/drive/MyDrive/Data/rt-polaritydata/rt-polarity.pos", 'r', encoding='latin-1') as f_pos, \
            open("/content/drive/MyDrive/Data/rt-polaritydata/rt-polarity.neg", 'r', encoding='latin-1') as f_neg:
        lines_pos = f_pos.readlines()
        lines_neg = f_neg.readlines()
    tokenized_pos = [strCleanup(line).split() for line in lines_pos]
    tokenized_neg = [strCleanup(line).split() for line in lines_neg]
    tokenized = tokenized_pos+tokenized_neg
    label = [0] * len(tokenized_pos) + [1] * len(tokenized_neg)
    tokenized_set = list(set(word for sent in tokenized for word in sent))

    if os.path.isfile(dic_path) and os.path.isfile(arr_path):
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only safe because these are caches written by this function.
        with open(dic_path, 'rb') as fw:
            embedding_dic = pickle.load(fw)
        with open(arr_path, 'rb') as fa:
            embedding_arr = pickle.load(fa)
    else:
        if embed_kinds != 2:
            raise ValueError(
                "embedding cache not found and embed_kinds=%r is not supported; use embed_kinds=2"
                % (embed_kinds,)
            )
        tmp_dic = {}
        for word, val in embedding.items():
            tmp_vector = np.zeros(vocablen)
            tmp_vector[val] = 1
            vec = embedder.get_vector(torch.FloatTensor(tmp_vector))
            coefs = np.asarray(vec.detach().numpy(), dtype='float32')
            tmp_dic[word] = (coefs/np.linalg.norm(coefs))
        embedding_arr = np.zeros((len(embedding)+2, projectionlen), dtype=np.float32)
        # Rows 0 and 1: random unit vectors, not tied to any word.
        rand = np.random.uniform(-1,1,size = projectionlen)
        embedding_arr[0] = rand/np.linalg.norm(rand)
        rand = np.random.uniform(-1,1,size = projectionlen)
        embedding_arr[1] = rand/np.linalg.norm(rand)
        embedding_dic = {}
        for idx, (word_in, vec_in) in enumerate(tmp_dic.items()):
            embedding_arr[idx+2,:] = vec_in
            embedding_dic[word_in] = idx+2
        with open(dic_path, 'wb') as fw:
            pickle.dump(embedding_dic,fw)
        with open(arr_path, 'wb') as fa:
            pickle.dump(embedding_arr,fa)

    print(embedding_dic)
    makeIdx(tokenized, embedding_dic)
    print(len(tokenized))
    print(len(tokenized[0]))
    tokenized_df = [[sent, sent_label] for sent, sent_label in zip(tokenized, label)]
    print(embedding_arr.shape)
    return tokenized_df, tokenized_set, embedding_arr

def MR_Process_CharWord2channel(word_embedder, char_embedder):
    """Load the MR corpus with parallel word-level and character-level embeddings.

    Reads the module-level ``embedding`` (word -> index), ``vocablen``,
    ``projectionlen`` and ``alphabet_dic``.

    For every word of ``embedding`` two L2-normalised vectors are computed:

    * word channel: ``word_embedder.get_vector`` of the word's one-hot vector;
    * character channel: ``char_embedder.get_vector`` of a vector of length
      ``len(alphabet_dic)`` with a 1 for each character of the word that is a
      key of ``alphabet_dic``.

    Both matrices share the same row layout: rows 0 and 1 hold random unit
    vectors not assigned to any word, and words start at row 2.

    The corpus files are now opened in a ``with`` block so the handles are
    closed on error; the always-true ``embed_kinds`` guard and unused locals
    were removed.

    Args:
        word_embedder: Model providing ``get_vector`` for one-hot word tensors.
        char_embedder: Model providing ``get_vector`` for character-bag tensors.

    Returns:
        Tuple ``(tokenized_df, tokenized_set, embedding_arr,
        embedding_char_arr)``: list of ``[index_list, label]`` samples
        (label 0 = positive, 1 = negative), list of distinct words, and the
        two float32 embedding matrices.

    Raises:
        KeyError: If a corpus word is missing from ``embedding``.
    """
    with open("/content/drive/MyDrive/Data/rt-polaritydata/rt-polarity.pos", 'r', encoding='latin-1') as f_pos, \
            open("/content/drive/MyDrive/Data/rt-polaritydata/rt-polarity.neg", 'r', encoding='latin-1') as f_neg:
        lines_pos = f_pos.readlines()
        lines_neg = f_neg.readlines()
    tokenized_pos = [strCleanup(line).split() for line in lines_pos]
    tokenized_neg = [strCleanup(line).split() for line in lines_neg]
    tokenized = tokenized_pos+tokenized_neg
    label = [0] * len(tokenized_pos) + [1] * len(tokenized_neg)
    tokenized_set = list(set(word for sent in tokenized for word in sent))

    char_len = len(alphabet_dic)
    tmp_dic = {}
    tmp_dic_char = {}
    for word, val in embedding.items():
        tmp_vector = np.zeros(vocablen)
        tmp_vector[val] = 1
        vec = word_embedder.get_vector(torch.FloatTensor(tmp_vector))
        coefs = np.asarray(vec.detach().numpy(), dtype='float32')
        tmp_dic[word] = (coefs/np.linalg.norm(coefs))

        tmp_vector_char = np.zeros(char_len)
        for alphabet in word:
            char_index = alphabet_dic.get(alphabet)
            if char_index is not None:
                tmp_vector_char[char_index] = 1
        vec_char = char_embedder.get_vector(torch.FloatTensor(tmp_vector_char))
        coefs_char = np.asarray(vec_char.detach().numpy(), dtype='float32')
        tmp_dic_char[word] = (coefs_char/np.linalg.norm(coefs_char))

    # Word channel. The four random draws below keep their original order.
    embedding_dic = {}
    embedding_arr = np.zeros((len(embedding)+2, projectionlen), dtype=np.float32)
    rand = np.random.uniform(-1,1,size = projectionlen)
    embedding_arr[0] = rand/np.linalg.norm(rand)
    rand = np.random.uniform(-1,1,size = projectionlen)
    embedding_arr[1] = rand/np.linalg.norm(rand)
    for idx, (word_in, vec_in) in enumerate(tmp_dic.items()):
        embedding_arr[idx+2,:] = vec_in
        embedding_dic[word_in] = idx+2

    # Character channel; tmp_dic_char has the same key order as tmp_dic, so
    # row idx+2 refers to the same word in both matrices.
    embedding_char_arr = np.zeros((len(embedding)+2, projectionlen), dtype=np.float32)
    rand = np.random.uniform(-1,1,size = projectionlen)
    embedding_char_arr[0] = rand/np.linalg.norm(rand)
    rand = np.random.uniform(-1,1,size = projectionlen)
    embedding_char_arr[1] = rand/np.linalg.norm(rand)
    for idx, vec_in in enumerate(tmp_dic_char.values()):
        embedding_char_arr[idx+2,:] = vec_in

    print(embedding_dic)
    makeIdx(tokenized, embedding_dic)
    print(len(tokenized))
    print(len(tokenized[0]))
    tokenized_df = [[sent, sent_label] for sent, sent_label in zip(tokenized, label)]
    print(embedding_arr.shape)
    return tokenized_df, tokenized_set, embedding_arr, embedding_char_arr

def MR_Preprocess_GlobalVec(embed_kinds = 1):
    """Load the MR corpus as index sequences with gensim-based embeddings.

    ``embed_kinds`` selects the source of the vectors:

    * ``0``: train a 100-dimensional ``Word2Vec`` model on the corpus itself.
      Rows 0 and 1 of the matrix stay zero; words start at row 2.
    * ``1``: take 300-dimensional vectors from the module-level ``w2v``
      object for corpus words it contains and draw uniform(-1, 1) vectors for
      the rest. Rows 0 and 1 are random unit vectors; words start at row 2.

    All word vectors are L2-normalised.

    Changes from the original: integers are compared with ``==`` instead of
    ``is``; corpus membership is tested against a set instead of a list; an
    unsupported ``embed_kinds`` raises ValueError up front; the corpus files
    are closed via ``with``.

    NOTE(review): the original declared ``global WordVec`` but reads ``w2v``,
    and other functions in this file call ``w2v.get_vector`` with a tensor;
    confirm which object is meant to provide ``.vocab`` / ``.vectors``.

    Args:
        embed_kinds: 0 or 1, see above.

    Returns:
        Tuple ``(tokenized_df, tokenized_set, embedding_arr)``: list of
        ``[index_list, label]`` samples (label 0 = positive, 1 = negative),
        list of distinct words, and the float32 embedding matrix.

    Raises:
        ValueError: If ``embed_kinds`` is neither 0 nor 1.
    """
    if embed_kinds not in (0, 1):
        raise ValueError("embed_kinds must be 0 or 1, got %r" % (embed_kinds,))

    with open("/content/drive/MyDrive/Data/rt-polaritydata/rt-polarity.pos", 'r', encoding='latin-1') as f_pos, \
            open("/content/drive/MyDrive/Data/rt-polaritydata/rt-polarity.neg", 'r', encoding='latin-1') as f_neg:
        lines_pos = f_pos.readlines()
        lines_neg = f_neg.readlines()
    tokenized_pos = [strCleanup(line).split() for line in lines_pos]
    tokenized_neg = [strCleanup(line).split() for line in lines_neg]
    tokenized = tokenized_pos+tokenized_neg
    label = [0] * len(tokenized_pos) + [1] * len(tokenized_neg)
    tokenized_set = list(set(word for sent in tokenized for word in sent))
    embedding_dic = {}

    if embed_kinds == 0:
        wv_0 = Word2Vec(tokenized, size=100, window=5, min_count=1)
        embedding_arr = np.zeros((len(wv_0.wv.vocab)+2, 100), dtype=np.float32)
        # NOTE(review): assumes iterating wv.vocab yields words in the same
        # order as the rows of wv.vectors - confirm for the gensim version in use.
        for idx, (word,vec) in enumerate(zip(wv_0.wv.vocab, wv_0.wv.vectors)):
            coefs = np.asarray(vec, dtype='float32')
            embedding_arr[idx+2,:] = (coefs/np.linalg.norm(coefs))
            embedding_dic[word] = idx+2
    else:
        # Set lookup: the pretrained vocabulary is scanned once, so a list
        # membership test here would be quadratic.
        corpus_words = set(tokenized_set)
        tmp_dic = {}
        for word,vec in zip(w2v.vocab, w2v.vectors):
            if word in corpus_words:
                tmp_dic[word] = vec
        noword = 0
        # Iterate the list (not the set) so random vectors are drawn in the
        # same order as before.
        for word in tokenized_set:
            if word not in tmp_dic:
                tmp_dic[word] = np.random.uniform(-1, 1, size=300)
                noword +=1
        embedding_arr = np.zeros((len(tmp_dic)+2, 300), dtype=np.float32)
        print("PRE words: ",len(tokenized_set) - noword)
        rand = np.random.uniform(-1,1,size = 300)
        embedding_arr[0] = rand/np.linalg.norm(rand)
        rand = np.random.uniform(-1,1,size = 300)
        embedding_arr[1] = rand/np.linalg.norm(rand)
        for idx, (word, vec) in enumerate(tmp_dic.items()):
            coefs = np.asarray(vec, dtype='float32')
            embedding_arr[idx+2,:] = (coefs/np.linalg.norm(coefs))
            embedding_dic[word] = idx+2

    print(embedding_dic)
    makeIdx(tokenized, embedding_dic)
    print(len(tokenized))
    print(len(tokenized[0]))
    tokenized_df = [[sent, sent_label] for sent, sent_label in zip(tokenized, label)]
    print(embedding_arr.shape)
    return tokenized_df, tokenized_set, embedding_arr

def SUBJ_Preprocess(embed_kinds = 0):
    """Load the SUBJ corpus and build a word -> index vocabulary.

    Both files are read with latin-1 encoding, cleaned with ``strCleanup``
    and split on whitespace. Sentences from the ``quote`` file get label 0
    and sentences from the ``plot`` file label 1. Vocabulary indices are
    assigned in order of first appearance, starting at 0.

    The files are now opened in a ``with`` block so the handles are closed
    even if reading fails; the distinct-word list that was built and then
    discarded has been removed.

    Args:
        embed_kinds: Unused; kept for signature compatibility.

    Returns:
        Tuple ``(tokenized, label, embedding_dic)``: list of token lists, list
        of integer labels, and the word -> index dict.
    """
    with open("/content/drive/MyDrive/Data/rotten_imdb.tar/quote.tok.gt9.5000", 'r', encoding='latin-1') as f_sub, \
            open("/content/drive/MyDrive/Data/rotten_imdb.tar/plot.tok.gt9.5000", 'r', encoding='latin-1') as f_obj:
        lines_sub = f_sub.readlines()
        lines_obj = f_obj.readlines()
    tokenized_sub = [strCleanup(line).split() for line in lines_sub]
    tokenized_obj = [strCleanup(line).split() for line in lines_obj]
    tokenized = tokenized_sub+tokenized_obj
    label = [0] * len(tokenized_sub) + [1] * len(tokenized_obj)

    embedding_dic = {}
    for line in tokenized:
        for word in line:
            # First occurrence wins; the next free index is the current size.
            if word not in embedding_dic:
                embedding_dic[word] = len(embedding_dic)
    return tokenized, label, embedding_dic

def SUBJ_Preprocess_CharWord2channel(word_embedder, char_embedder):
    """Load the SUBJ corpus with parallel word-level and character-level embeddings.

    Reads the module-level ``embedding`` (word -> index), ``vocablen``,
    ``projectionlen`` and ``alphabet_dic``.

    For every word of ``embedding`` two L2-normalised vectors are computed:

    * word channel: ``word_embedder.get_vector`` of the word's one-hot vector;
    * character channel: ``char_embedder.get_vector`` of a vector of length
      ``len(alphabet_dic)`` with a 1 for each character of the word that is a
      key of ``alphabet_dic``.

    Both matrices share the same row layout: rows 0 and 1 hold random unit
    vectors not assigned to any word, and words start at row 2.

    The corpus files are now opened in a ``with`` block so the handles are
    closed on error; the always-true ``embed_kinds`` guard and unused locals
    were removed.

    Args:
        word_embedder: Model providing ``get_vector`` for one-hot word tensors.
        char_embedder: Model providing ``get_vector`` for character-bag tensors.

    Returns:
        Tuple ``(tokenized_df, tokenized_set, embedding_arr,
        embedding_char_arr)``: list of ``[index_list, label]`` samples
        (label 0 = ``quote`` file, 1 = ``plot`` file), list of distinct words,
        and the two float32 embedding matrices.

    Raises:
        KeyError: If a corpus word is missing from ``embedding``.
    """
    with open("/content/drive/MyDrive/Data/rotten_imdb.tar/quote.tok.gt9.5000", 'r', encoding='latin-1') as f_sub, \
            open("/content/drive/MyDrive/Data/rotten_imdb.tar/plot.tok.gt9.5000", 'r', encoding='latin-1') as f_obj:
        lines_sub = f_sub.readlines()
        lines_obj = f_obj.readlines()
    tokenized_sub = [strCleanup(line).split() for line in lines_sub]
    tokenized_obj = [strCleanup(line).split() for line in lines_obj]
    tokenized = tokenized_sub+tokenized_obj
    label = [0] * len(tokenized_sub) + [1] * len(tokenized_obj)
    tokenized_set = list(set(word for sent in tokenized for word in sent))

    char_len = len(alphabet_dic)
    tmp_dic = {}
    tmp_dic_char = {}
    for word, val in embedding.items():
        tmp_vector = np.zeros(vocablen)
        tmp_vector[val] = 1
        vec = word_embedder.get_vector(torch.FloatTensor(tmp_vector))
        coefs = np.asarray(vec.detach().numpy(), dtype='float32')
        tmp_dic[word] = (coefs/np.linalg.norm(coefs))

        tmp_vector_char = np.zeros(char_len)
        for alphabet in word:
            char_index = alphabet_dic.get(alphabet)
            if char_index is not None:
                tmp_vector_char[char_index] = 1
        vec_char = char_embedder.get_vector(torch.FloatTensor(tmp_vector_char))
        coefs_char = np.asarray(vec_char.detach().numpy(), dtype='float32')
        tmp_dic_char[word] = (coefs_char/np.linalg.norm(coefs_char))

    # Word channel. The four random draws below keep their original order.
    embedding_dic = {}
    embedding_arr = np.zeros((len(embedding)+2, projectionlen), dtype=np.float32)
    rand = np.random.uniform(-1,1,size = projectionlen)
    embedding_arr[0] = rand/np.linalg.norm(rand)
    rand = np.random.uniform(-1,1,size = projectionlen)
    embedding_arr[1] = rand/np.linalg.norm(rand)
    for idx, (word_in, vec_in) in enumerate(tmp_dic.items()):
        embedding_arr[idx+2,:] = vec_in
        embedding_dic[word_in] = idx+2

    # Character channel; tmp_dic_char has the same key order as tmp_dic, so
    # row idx+2 refers to the same word in both matrices.
    embedding_char_arr = np.zeros((len(embedding)+2, projectionlen), dtype=np.float32)
    rand = np.random.uniform(-1,1,size = projectionlen)
    embedding_char_arr[0] = rand/np.linalg.norm(rand)
    rand = np.random.uniform(-1,1,size = projectionlen)
    embedding_char_arr[1] = rand/np.linalg.norm(rand)
    for idx, vec_in in enumerate(tmp_dic_char.values()):
        embedding_char_arr[idx+2,:] = vec_in

    print(embedding_dic)
    makeIdx(tokenized, embedding_dic)
    print(len(tokenized))
    print(len(tokenized[0]))
    tokenized_df = [[sent, sent_label] for sent, sent_label in zip(tokenized, label)]
    print(embedding_arr.shape)
    return tokenized_df, tokenized_set, embedding_arr, embedding_char_arr

def TREC_Preprocess_CharWord2channel(word_embedder, char_embedder):
    """Load TREC and build word-level and char-level embedding lookup tables.

    Reads the TREC train/test label files, tokenizes every question, maps the
    coarse category to an integer id, and builds two ``(len(embedding) + 2,
    projectionlen)`` float32 tables: one from ``word_embedder`` (one-hot word
    input) and one from ``char_embedder`` (bag-of-characters input).  Rows 0
    and 1 of each table are random unit vectors (presumably reserved for
    padding / unknown tokens -- confirm against ``makeIdx``); the remaining
    rows are L2-normalised ``get_vector`` outputs.

    Relies on module-level names defined elsewhere in this file:
    ``embedding`` (word -> index), ``vocablen``, ``projectionlen``,
    ``alphabet_dic``, ``strCleanup`` and ``makeIdx``.

    Args:
        word_embedder: model exposing ``get_vector`` for a one-hot vector of
            length ``vocablen``.
        char_embedder: model exposing ``get_vector`` for a multi-hot vector of
            length ``len(alphabet_dic)``.

    Returns:
        ``(train_df, test_df, tokenized_set, embedding_arr,
        embedding_char_arr)`` where ``train_df`` / ``test_df`` are lists of
        ``[tokens, label]`` pairs and ``tokenized_set`` is always an empty
        list (kept for interface compatibility).

    Raises:
        ValueError: if a line has no ``:`` separator or an unknown category.
    """
    label_ids = {'ABBR': 0, 'ENTY': 1, 'DESC': 2, 'HUM': 3, 'LOC': 4, 'NUM': 5}

    def read_split(path):
        """Return (category_names, token_lists) for one TREC label file."""
        categories, sentences = [], []
        # `with` guarantees the handle is closed even if parsing fails.
        with open(path, 'r', encoding='latin-1') as handle:
            for raw in handle:
                if not raw.strip():
                    continue
                # Split on the FIRST ':' only: a question may itself contain
                # ':' and must not be truncated there.
                category, rest = raw.split(":", 1)
                categories.append(category)
                # Drop the first token (presumably the fine-grained sub-label
                # -- confirm against the data) and the last one
                # ("?" is in every sentence so delete).
                sentences.append(strCleanup(rest).split()[1:-1])
        return categories, sentences

    def random_unit_vector():
        """Draw a uniform(-1, 1) vector of length projectionlen, L2-normalised."""
        rand = np.random.uniform(-1, 1, size=projectionlen)
        return rand / np.linalg.norm(rand)

    train_label, train_tokenized = read_split(
        "/content/drive/MyDrive/Data/TREC/train_5500.label.txt")
    test_label, test_tokenized = read_split(
        "/content/drive/MyDrive/Data/TREC/TREC_10.label.txt")
    tokenized = train_tokenized + test_tokenized

    try:
        label = np.array([label_ids[cat] for cat in train_label + test_label])
    except KeyError as err:
        raise ValueError("unknown TREC category: %r" % (err.args[0],)) from err

    tokenized_set = []
    embedding_dic = {}

    # Project every vocabulary word through both embedders.
    word_vectors = {}
    char_vectors = {}
    for word, val in embedding.items():
        one_hot = np.zeros(vocablen)
        one_hot[val] = 1
        vec = word_embedder.get_vector(torch.FloatTensor(one_hot))
        coefs = np.asarray(vec.detach().numpy(), dtype='float32')
        word_vectors[word] = coefs / np.linalg.norm(coefs)

        # Multi-hot over the characters of the word; characters missing from
        # alphabet_dic are ignored.
        char_hot = np.zeros(len(alphabet_dic))
        for alphabet in word:
            try:
                char_hot[alphabet_dic[alphabet]] = 1
            except KeyError:
                continue
        vec_char = char_embedder.get_vector(torch.FloatTensor(char_hot))
        coefs_char = np.asarray(vec_char.detach().numpy(), dtype='float32')
        char_vectors[word] = coefs_char / np.linalg.norm(coefs_char)

    # NOTE: the four random draws below happen in the same order as before
    # (word rows 0, 1, then char rows 0, 1) so seeded runs stay reproducible.
    embedding_arr = np.zeros((len(embedding) + 2, projectionlen), dtype=np.float32)
    embedding_arr[0] = random_unit_vector()
    embedding_arr[1] = random_unit_vector()
    for row, (word_in, vec_in) in enumerate(word_vectors.items(), start=2):
        embedding_arr[row, :] = vec_in
        embedding_dic[word_in] = row

    embedding_char_arr = np.zeros((len(embedding) + 2, projectionlen), dtype=np.float32)
    embedding_char_arr[0] = random_unit_vector()
    embedding_char_arr[1] = random_unit_vector()
    for row, vec_in in enumerate(char_vectors.values(), start=2):
        embedding_char_arr[row, :] = vec_in

    print(embedding_dic)
    # Return value is not used; presumably rewrites `tokenized` in place with
    # table indices -- confirm against makeIdx.
    makeIdx(tokenized, embedding_dic)
    print(len(tokenized))
    print(len(tokenized[0]))

    # The first len(train_tokenized) examples came from the training file.
    n_train = len(train_tokenized)
    examples = [[tokens, lab] for tokens, lab in zip(tokenized, label)]
    train_df = examples[:n_train]
    test_df = examples[n_train:]
    return train_df, test_df, tokenized_set, embedding_arr, embedding_char_arr

def TREC_Preprocess(embed_kinds = 0):
    """Load the TREC train/test files and build a word -> index vocabulary.

    Relies on ``strCleanup`` defined elsewhere in this file.

    Args:
        embed_kinds: unused; kept so existing callers keep working.

    Returns:
        ``(tokenized, label, embedding_dic)``: the token lists of the train
        questions followed by the test questions, a NumPy array with the
        integer category id of each question, and a dict mapping every
        distinct token to a consecutive index (in order of first appearance,
        starting at 0).

    Raises:
        ValueError: if a line has no ``:`` separator or an unknown category.
    """
    label_ids = {'ABBR': 0, 'ENTY': 1, 'DESC': 2, 'HUM': 3, 'LOC': 4, 'NUM': 5}

    def read_split(path):
        """Return (category_names, token_lists) for one TREC label file."""
        categories, sentences = [], []
        # `with` guarantees the handle is closed even if parsing fails.
        with open(path, 'r', encoding='latin-1') as handle:
            for raw in handle:
                if not raw.strip():
                    continue
                # Split on the FIRST ':' only: a question may itself contain
                # ':' and must not be truncated there.
                category, rest = raw.split(":", 1)
                categories.append(category)
                # Drop the first token (presumably the fine-grained sub-label
                # -- confirm against the data) and the last one
                # ("?" is in every sentence so delete).
                sentences.append(strCleanup(rest).split()[1:-1])
        return categories, sentences

    train_label, train_tokenized = read_split(
        "/content/drive/MyDrive/Data/TREC/train_5500.label.txt")
    test_label, test_tokenized = read_split(
        "/content/drive/MyDrive/Data/TREC/TREC_10.label.txt")
    tokenized = train_tokenized + test_tokenized

    try:
        label = np.array([label_ids[cat] for cat in train_label + test_label])
    except KeyError as err:
        raise ValueError("unknown TREC category: %r" % (err.args[0],)) from err

    # Assign each new token the next free index; known tokens keep theirs.
    embedding_dic = {}
    for sentence in tokenized:
        for word in sentence:
            embedding_dic.setdefault(word, len(embedding_dic))
    return tokenized, label, embedding_dic

# ---------------------------------------------------------------------------
# Vocabulary and pretrained embedders (MR dataset)
# ---------------------------------------------------------------------------
MR_tokenized, MR_label, embedding = MR_Preprocess(0)
vocablen = len(embedding)

print(vocablen)
projectionlen = 100
window_size = 2  # 2 tokens on each side (before and after)
#print(len(AG_tokenized))
print(len(embedding))

# Both embedders stay on the CPU (the .to(dev) calls are disabled and the
# preprocessing code converts get_vector output with .numpy()), so map the
# checkpoints to the CPU.  This also lets checkpoints saved on a GPU load on
# a CPU-only machine.
w2v = W2V(vocablen, projectionlen, 0.5)
checkpoint_word = torch.load("/content/drive/MyDrive/model/w2v_MR.pt",
                             map_location=torch.device('cpu'))
w2v.load_state_dict(checkpoint_word['State_dict'])
dev = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('current device', dev)
batch_size = 64
running_loss = 0
correct = 0
total = 0

loss_func = nn.CrossEntropyLoss()
loss_func_reg = nn.MSELoss()
#w2v.apply(weights_init)
#w2v.to(dev)
checkpoint = torch.load("/content/drive/MyDrive/model/char2v(chartochar)_MR.pt",
                        map_location=torch.device('cpu'))
#char2v = Char2WV(len(alphabet_dic), vocablen, projectionlen, 0.5)
char2v = W2V(len(alphabet_dic), projectionlen, 0.5)
# A list (not a one-shot filter iterator) so the parameters can be reused.
parameters = [p for p in char2v.parameters() if p.requires_grad]
#char2v.apply(weights_init)
char2v.load_state_dict(checkpoint['State_dict'])
#char2v.to(dev)

# ---------------------------------------------------------------------------
# Dataset selection: exactly one preprocessing call should be active
# ---------------------------------------------------------------------------
#tokenized_df, tokenized_set, lookup_table = MR_Preprocess_CNN(char2v, 2)
tokenized_df, tokenized_set, lookup_table_word, lookup_table_char = MR_Process_CharWord2channel(w2v, char2v)
#tokenized_df, tokenized_set, lookup_table_word, lookup_table_char = SUBJ_Preprocess_CharWord2channel(w2v, char2v)
#tokenized_Train, tokenized_Test, tokenized_set, lookup_table_word, lookup_table_char = TREC_Preprocess_CharWord2channel(w2v, char2v)
#tokenized_Train = tokenized_df[:4500]+tokenized_df[5000:9500]
#tokenized_Test = tokenized_df[4500:5000]+tokenized_df[9500:]

# Optimizer for char2v; only used by the disabled embedder-training loop.
opt = optim.SGD(parameters, lr = 1e-2, momentum = 0.9)

#data_addr = "/content/drive/MyDrive/Data_CharacterConvNet"
#WordVec = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/GoogleNews-vectors-negative300.bin', binary=True)

#save_checkpoint(num_epochs, w2v, optimizer,"/content/drive/MyDrive/model/w2v_MR.pt")
#checkpoint = torch.load("/content/drive/MyDrive/model/w2v_MR.pt", map_location=torch.device('cpu'))
#w2v.load_state_dict(checkpoint['State_dict'])
#opt.load_state_dict(checkpoint['optimizer'])
#tokenized_df, tokenized_set, lookup_table = MR_Preprocess_CNN(w2v, 2)

# Train/test split by fixed index ranges: the test set is
# tokenized_df[5000:5331] plus everything from index 10331 on.
tokenized_Train = tokenized_df[:5000]+tokenized_df[5331:10331]
tokenized_Test = tokenized_df[5000:5331]+tokenized_df[10331:]


# ---------------------------------------------------------------------------
# Classifier
# ---------------------------------------------------------------------------
net = Net_Deep(lookup_table_word, lookup_table_char, [3,4,5],100, 2, 0.5, mode='multi-channel')

# HyperParameters

net.to(dev)
num_epochs = 100
batch_size = 64
# Must be a list: the optimizer consumes its argument once, and the same
# parameters are needed again for gradient clipping on every step.  With a
# filter() iterator the clipping call received an exhausted iterator and
# silently clipped nothing.
param = [p for p in net.parameters() if p.requires_grad]
optimizer = optim.Adadelta(param)

torch.manual_seed(99)

trainloader = torch.utils.data.DataLoader(tokenized_Train, batch_size=batch_size, shuffle=True, collate_fn=batchPad)
testloader = torch.utils.data.DataLoader(tokenized_Test, batch_size=batch_size, shuffle= True, collate_fn=batchPad)

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
start = time.time()
for epoch in range(num_epochs):
    running_loss = 0.0
    correct = 0
    total = 0
    net.train()
    for idx, (x, y) in enumerate(trainloader):
        x, y = x.to(dev), y.to(dev)

        out = net(x)
        loss = loss_func(out, y)
        loss.backward()
        nn.utils.clip_grad_norm_(param, max_norm=3)
        optimizer.step()
        optimizer.zero_grad()

        running_loss += loss.item()
        if idx % 50 == 49:
            # Accuracy of the current mini-batch only; use its real size so a
            # short final batch is not under-reported.
            correct = (torch.max(out, 1)[1] == y).sum().item()
            total = len(y)
            print('Training Accuracy: %d ' % (100.0 * correct / total))
            print('%d/%d' % (correct, total))
            print('RunningLoss %5d: %.3f' % (idx + 1, running_loss))

print("training time :", time.time() - start)

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
correct = 0
total = 0
net.eval()
with torch.no_grad():
    for (x, y) in testloader:
        x, y = x.to(dev), y.to(dev)
        out = net(x)
        predicted = torch.max(out, 1)[1]
        loss = loss_func(out, y)
        print("test loss: ", loss.item())
        correct_eachbatch = (predicted == y).sum().item()
        total_eachbatch = len(y)
        correct += correct_eachbatch
        total += total_eachbatch
        # Running accuracy over all test batches seen so far.
        print('Accuracy for test_batch: %.3f %%' % (100.0 * correct / total))
        print('--------------------------------')

print('Accuracy for test: %.3f %%' % (100.0 * correct / total))
print('--------------------------------')
# Disabled code kept as a bare string literal (it is never executed): a
# training loop for char2v over MR_tokenized using loss_func_reg (MSE) and the
# SGD optimizer `opt`, followed by saving a checkpoint.
"""

for epoch in range(num_epochs):
    for idx, line in enumerate(MR_tokenized):
        if len(line) < 3:
            continue
        in_list,out_list = batch_onehot_window1_chartoword(line)

        char2v.train()
        in_list, out_list = torch.FloatTensor(in_list), torch.from_numpy(np.array(out_list)).float()
        in_list, out_list = in_list.to(dev), out_list.to(dev)

        out = char2v(in_list)
        loss = loss_func_reg(out, out_list)
        loss.backward()
        opt.step()
        opt.zero_grad()
        running_loss += loss.item()
        if idx%100 == 0:
            print(idx)
    print('RunningLoss %5d: %.3f' %(epoch, running_loss))
    running_loss = 0
save_checkpoint(num_epochs, char2v, opt,"/content/drive/MyDrive/model/char2v(chartoword)_MR.pt")
"""
