In [1]:
import torch
from torch import nn
from torch import optim
import pandas as pd
import os
from gensim.models import Word2Vec
import warnings
import numpy as np
warnings.filterwarnings('ignore')
import time

In [None]:
# configurations

# Seed torch's global RNG so that torch-driven randomness in this notebook
# (randomly initialised vectors/weights, DataLoader shuffling) is repeatable.
seed = 0
torch.manual_seed(seed)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Input data files.
train_file_label = '../input/ml2020spring-hw4/training_label.txt'
train_file_nolabel = '../input/ml2020spring-hw4/training_nolabel.txt'
test_file = '../input/ml2020spring-hw4/testing_data.txt'

# Pre-trained Word2Vec model (input) and classifier checkpoint (output).
w2v_path = '../input/w2v-model/w2v.model'
model_path = 'hw4.model'

sen_len = 20            # every sentence is truncated / padded to this many tokens
fix_embedding = True    # keep the pre-trained embedding frozen during training
batch_size = 128
epoch = 20              # number of training epochs
lr = 0.001              # learning rate


In [None]:
class DataReader:
    '''
    Reads the labelled / unlabelled training files and the test file.

    Every loader returns sentences as lists of tokens, obtained by splitting
    each line on single spaces.
    '''
    def __init__(self,train_file_label = 'data/training_label.txt',
                 train_file_nolabel = 'data/training_nolabel.txt',
                 test_file = 'data/testing_data.txt'):
        self.train_file_label = train_file_label
        self.train_file_nolabel = train_file_nolabel
        self.test_file = test_file

    def train_data_label(self):
        '''Return (X, Y) read from the labelled training file.'''
        return self.load_train_data(self.train_file_label,labeled=True)

    def train_data_nolabel(self):
        '''Return X read from the unlabelled training file.'''
        return self.load_train_data(self.train_file_nolabel,labeled=False)

    def test_data(self):
        '''Return X read from the test file.'''
        return self.load_test_data(self.test_file)

    def load_train_data(self,path,labeled=False):
        '''
        Read a training file, one sentence per line.

        labeled=True:  token 0 of each line is the label (returned as a
                       string), token 1 is skipped (presumably a separator
                       -- confirm against the data file) and the remaining
                       tokens are the words. Returns (X, Y).
        labeled=False: every token is a word. Returns X only.
        '''
        with open(path,'r') as f:
            lines = [line.strip('\n').split(' ') for line in f]
        if not labeled:
            return lines
        X = [line[2:] for line in lines]
        Y = [line[0] for line in lines]
        return X,Y

    def load_test_data(self,path):
        '''
        Read the test file: a header line followed by "<id>,<text>" rows.

        Only the FIRST comma separates the id from the text. Splitting on
        every comma (and re-joining with '') would delete commas that belong
        to the sentence and leave empty tokens behind, so the test tokens
        would no longer match the tokenisation of the training files.
        A row without any comma yields [''].
        '''
        X = []
        with open(path,'r') as f:
            next(f,None)  # skip the header row
            for line in f:
                fields = line.strip('\n').split(',',1)
                text = fields[1] if len(fields) > 1 else ''
                X.append(text.split(' '))
        return X

In [None]:
# Load the labelled training sentences together with their labels

dr = DataReader(
    train_file_label=train_file_label,
    train_file_nolabel=train_file_nolabel,
    test_file=test_file,
)
X_train_label, Y_train_label = dr.train_data_label()

In [None]:
class Preprocess():
    '''
    Turns tokenised sentences into fixed-length index tensors, using the
    vocabulary of a pre-trained Word2Vec model.

    sentences: list of sentences, each one a list of word strings
    sen_len:   length every sentence is truncated / padded to
    w2v_path:  path handed to Word2Vec.load

    Call make_embedding() before sentence_word2idx(): it is what fills
    word2idx / idx2word and registers the '<PAD>' and '<UNK>' entries.
    '''
    def __init__(self,sentences,sen_len,w2v_path):
        self.sentences = sentences
        self.sen_len = sen_len
        self.embedding = Word2Vec.load(w2v_path)
        self.embedding_dim = self.embedding.vector_size
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []

    def random_vector(self):
        '''Return a (1, embedding_dim) tensor filled by torch.nn.init.uniform_.'''
        vector = torch.empty(1,self.embedding_dim)
        torch.nn.init.uniform_(vector)
        return vector

    def add_embedding(self,word):
        '''
        Register `word` with a random vector appended to the embedding matrix.

        word will be '<PAD>' or '<UNK>' ONLY. embedding_matrix must already
        be a tensor when this is called.
        '''
        vector = self.random_vector()
        self.word2idx[word] = len(self.word2idx)
        # idx2word maps an index back to its word, so the word (not the
        # vector) has to be stored here.
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat((self.embedding_matrix,vector),0)

    def make_embedding(self):
        '''
        Build word2idx / idx2word and return the embedding matrix as a float
        tensor with one row per vocabulary word plus '<PAD>' and '<UNK>'.
        '''
        print('Get embedding ...')
        # Start from a clean slate so the method can safely be called again.
        self.idx2word = []
        self.word2idx = {}
        wv = self.embedding.wv
        # gensim >= 4 exposes the vocabulary as `key_to_index`,
        # gensim 3.x as `vocab`; both iterate over the words.
        vocab = getattr(wv,'key_to_index',None)
        if vocab is None:
            vocab = wv.vocab
        vectors = []
        for word in vocab:
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            vectors.append(wv[word])
        print('')
        # Convert through a single ndarray: building a tensor straight from a
        # list of arrays is very slow. The explicit view keeps the matrix 2-D
        # even for an empty vocabulary.
        self.embedding_matrix = torch.tensor(
            np.asarray(vectors,dtype=np.float32)
        ).view(len(vectors),self.embedding_dim)
        self.add_embedding('<PAD>')
        self.add_embedding('<UNK>')
        print(f'total words: {len(self.embedding_matrix)}')
        return self.embedding_matrix

    def pad_sequence(self,sentence):
        '''
        Return a list of exactly sen_len indices: longer input is truncated,
        shorter input is right-padded with the '<PAD>' index.
        The argument itself is left untouched.
        '''
        sentence = sentence[:self.sen_len]
        pad_len = self.sen_len - len(sentence)
        if pad_len > 0:
            sentence = sentence + [self.word2idx['<PAD>']] * pad_len
        return sentence

    def sentence_word2idx(self):
        '''
        Map every word of self.sentences to its index ('<UNK>' for words
        missing from the vocabulary) and return a LongTensor with one
        sen_len-long row per sentence.
        '''
        unk = self.word2idx['<UNK>']
        sentence_list = [
            self.pad_sequence([self.word2idx.get(word,unk) for word in sen])
            for sen in self.sentences
        ]
        return torch.LongTensor(sentence_list)

    def labels_to_tensor(self,labels):
        '''Convert labels (anything int() accepts) into a LongTensor.'''
        y = [int(label) for label in labels]
        return torch.LongTensor(y)

In [None]:
# Preprocessing data: build the embedding matrix, then index the labelled
# sentences and convert their labels to a tensor

preprocess = Preprocess(
    X_train_label,
    sen_len=sen_len,
    w2v_path=w2v_path,
)
embedding = preprocess.make_embedding()
train_x = preprocess.sentence_word2idx()
train_y = preprocess.labels_to_tensor(Y_train_label)

In [None]:
from torch.utils.data import Dataset,DataLoader

class TwitterDataset(Dataset):
    '''
    Dataset over already-indexed sentences with optional labels.

    X: indexable collection of samples
    y: indexable collection of labels aligned with X, or None for
       unlabelled data

    Indexing yields (sample, label) when labels were given and the bare
    sample otherwise.
    '''
    def __init__(self,X,y):
        self.data, self.label = X, y

    def __getitem__(self, index):
        sample = self.data[index]
        if self.label is None:
            return sample
        return sample, self.label[index]

    def __len__(self):
        return len(self.data)

In [None]:
# split train valid set: the first 180000 samples train, the remainder validates
# (for a quick smoke run slice a subset instead, e.g. [:18000] and [18000:20000])

X_train, y_train = train_x[:180000], train_y[:180000]
X_val, y_val = train_x[180000:], train_y[180000:]

In [None]:
# data loader: wrap both splits as datasets first, then build their loaders

train_dataset = TwitterDataset(X_train, y_train)
val_dataset = TwitterDataset(X_val, y_val)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,
)

In [None]:
# Model define

class LSTM_Net(nn.Module):
    '''
    RNN sentence classifier: embedding -> LSTM -> fully connected head.

    embedding:     2-D float tensor used as the initial embedding table;
                   size(0) is the vocabulary size, size(1) the vector size
    hidden_dim:    hidden size of the LSTM
    num_layers:    number of stacked LSTM layers
    dropout:       dropout probability used inside the classifier head
    fix_embedding: when True the embedding weights receive no gradient

    forward(x) takes word indices of shape (batch, seq_len) and returns a
    (batch, 1) tensor of probabilities.
    '''
    def __init__(self,embedding,hidden_dim,num_layers,
                dropout=0.5,fix_embedding=True):
        super(LSTM_Net,self).__init__()
        self.embedding_dim = embedding.size(1)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout

        # make embedding layer, initialised with the given vectors
        self.embedding = nn.Embedding(embedding.size(0),embedding.size(1))
        self.embedding.weight = nn.Parameter(embedding)
        self.embedding.weight.requires_grad = not fix_embedding

        # define LSTM layer
        self.lstm = nn.LSTM(self.embedding_dim,hidden_dim,num_layers,batch_first=True)

        # define classifier, which is a fc nn
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim,150),
            nn.ReLU(),

            nn.Dropout(dropout),
            nn.Linear(150,150),
            nn.ReLU(),

            nn.Dropout(dropout),
            nn.Linear(150,150),
            nn.ReLU(),

            nn.Dropout(dropout),
            nn.Linear(150,1),
            # The output is a probability: it is thresholded at 0.5 and fed
            # to nn.BCELoss, which only accepts values in [0, 1]. A ReLU here
            # is unbounded above and has zero gradient for negative logits.
            nn.Sigmoid()
        )

    def forward(self,x):
        inputs = self.embedding(x)
        x, _ = self.lstm(inputs,None)
        # keep only the LSTM output of the last time step
        x = x[:,-1,:]
        out = self.classifier(x)
        return out

In [None]:
# Build the classifier on top of the pre-trained embedding. Whether the
# embedding is frozen comes from the `fix_embedding` setting in the
# configuration cell instead of a second hard-coded value.
model = LSTM_Net(embedding=embedding,hidden_dim=150,num_layers=1,dropout=0.5,fix_embedding=fix_embedding)
model = model.to(device)

In [None]:
# train method

import matplotlib.pyplot as plt

def evaluation(output,labels):
    '''
    return the number of right predictions

    output: tensor of predicted probabilities; values >= 0.5 count as class 1
    labels: tensor of 0/1 targets with the same shape as output

    The threshold is applied to a new tensor, so the caller's `output` is
    left unmodified.
    '''
    predictions = (output >= 0.5).to(labels.dtype)
    return torch.sum(torch.eq(predictions,labels)).item()

def training(batch_size,n_epoch,lr,train,valid,model,model_path,device,best_acc=0):
    '''
    Train `model` with Adam + BCELoss, validate after every epoch, save the
    best model and plot the loss / accuracy curves.

    batch_size: kept for interface compatibility; accuracy is computed from
                the real number of samples in each batch
    n_epoch:    number of passes over `train`
    lr:         learning rate for Adam
    train:      iterable of (inputs, labels) batches used for optimisation
    valid:      iterable of (inputs, labels) batches used for validation
    model:      network returning one probability per sample
    model_path: file the whole model is written to with torch.save
    device:     device the batches are moved to
    best_acc:   validation accuracy in percent that has to be beaten before
                the model is saved

    Returns the best validation accuracy in percent (the incoming `best_acc`
    if no epoch beat it).
    '''
    start = time.time()
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'Parameters total num:{total} trainable:{trainable}')

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(),lr=lr)
    t_len, v_len = (len(train),len(valid))

    train_loss_history =[]
    train_acc_history = []

    val_loss_history =[]
    val_acc_history = []

    for epoch in range(n_epoch):
        # ---- training pass ----
        model.train()
        total_loss,total_correct,total_seen = 0,0,0
        for i,(inputs,labels) in enumerate(train):
            inputs = inputs.to(device,dtype=torch.long)
            labels = labels.to(device,dtype=torch.float)
            optimizer.zero_grad()
            # reshape(-1) keeps a 1-D vector even when the batch holds a
            # single sample, where squeeze() would collapse to a scalar
            outputs = model(inputs).reshape(-1)
            loss = criterion(outputs,labels)
            loss.backward()
            optimizer.step()
            # The last batch is usually smaller than batch_size, so count
            # the samples that were actually seen.
            n_samples = labels.size(0)
            correct = evaluation(outputs.detach(),labels)
            total_correct += correct
            total_seen += n_samples
            total_loss += loss.item()
            print('Epoch {} {}/{} acc {:.3f} loss {:.5f}'.format(epoch+1,i+1,t_len,
                                                         correct*100/n_samples,loss.item()),end='\r')
        train_acc = total_correct*100/max(total_seen,1)
        train_loss = total_loss/max(t_len,1)
        print('\nTrain total acc {:.3f} total loss {:.5f}'.format(train_acc,train_loss))

        train_loss_history.append(train_loss)
        train_acc_history.append(train_acc)

        # ---- validation pass ----
        model.eval()
        with torch.no_grad():
            total_loss,total_correct,total_seen = 0,0,0
            for inputs,labels in valid:
                inputs = inputs.to(device,dtype=torch.long)
                labels = labels.to(device,dtype=torch.float)
                outputs = model(inputs).reshape(-1)
                loss = criterion(outputs,labels)
                total_correct += evaluation(outputs,labels)
                total_seen += labels.size(0)
                total_loss += loss.item()
        val_acc = total_correct*100/max(total_seen,1)
        val_loss = total_loss/max(v_len,1)
        print('\nValid total acc {:.3f} total loss {:.5f}'.format(val_acc,val_loss))

        # best_acc and val_acc are both percentages, so the comparison and
        # the final report use the same unit.
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model,model_path)
            print(f'Saving model with acc: {val_acc}')

        val_loss_history.append(val_loss)
        val_acc_history.append(val_acc)

        print('-------------------------------------------------------------------------')

    plt.figure()
    plt.plot(range(n_epoch),train_loss_history,range(n_epoch),val_loss_history)
    plt.legend(['train','val'])
    plt.title('loss')

    plt.figure()
    plt.plot(range(n_epoch),train_acc_history,range(n_epoch),val_acc_history)
    plt.legend(['train','val'])
    plt.title('acc')

    print('======================================================================')
    print(f'Best acc: {best_acc:.3f}')
    print(f'Training used time: {time.time()-start:2.2f}')

    return best_acc

In [None]:
best_acc = 0

# Keep whatever best validation accuracy training() reports, so that a later
# training stage only overwrites the saved model when it actually improves
# on it. If training() returns nothing, best_acc simply stays as it was.
stage_best_acc = training(batch_size=batch_size,
        n_epoch=epoch,
        lr=lr,
        train=train_loader,
        valid=val_loader,
        model=model,
        model_path=model_path,
        device=device,
        best_acc=best_acc)
if stage_best_acc is not None:
    best_acc = stage_best_acc

In [None]:
# semi-supervised learning
# self-training

# Read the unlabelled sentences and index them the same way as the labelled ones
X_train_nolabel = dr.train_data_nolabel()

preprocess = Preprocess(X_train_nolabel, sen_len, w2v_path)
embedding = preprocess.make_embedding()
train_nolabel_x = preprocess.sentence_word2idx()


# data loader (no labels; order is kept because shuffle is off)
train_nolabel_dataset = TwitterDataset(X=train_nolabel_x, y=None)
train_nolabel_loader = DataLoader(
    train_nolabel_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=8,
)


def getSemiSuperviseData(test_loader, model, device, threshold=0.8):
    '''
    Pseudo-label unlabelled samples the model is confident about.

    test_loader: iterable of input batches (index tensors, no labels)
    model:       network returning one probability per sample
    device:      device the model and the batches are moved to
    threshold:   a sample is labelled 1 when its probability is >= threshold
                 and 0 when it is < 1 - threshold; anything in between is
                 dropped

    Returns (labeled_X, label): the kept samples as lists of indices and
    their 0/1 pseudo-labels, in loader order.
    '''
    labeled_X = []
    label = []
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        for inputs in test_loader:
            inputs = inputs.to(device,dtype=torch.long)
            # reshape(-1) keeps a 1-D vector even for a batch of one sample;
            # squeeze() would give a scalar there and tolist() a bare float.
            probs = model(inputs).reshape(-1).tolist()
            # one device-to-host copy per batch instead of one per sample
            rows = inputs.cpu().tolist()
            for prob,row in zip(probs,rows):
                if prob >= threshold:
                    label.append(1)
                    labeled_X.append(row)
                elif prob < 1 - threshold:
                    label.append(0)
                    labeled_X.append(row)
    return labeled_X, label

# load model
# NOTE(review): torch.load unpickles a complete model object here, so only
# point it at checkpoints produced by this notebook. Recent PyTorch releases
# may need weights_only=False for this -- confirm for the installed version.
model = torch.load(model_path)

labeled_X, label = getSemiSuperviseData(train_nolabel_loader,model,device,threshold=0.8)

print(f'New labeled train data: {len(labeled_X)} label: {len(label)}')

print('Train the model used new labeled data ...')


# Give the pseudo-labelled tensors an explicit dtype and shape: when no
# sample passes the threshold, torch.tensor([]) would be a 1-D float tensor
# that cannot be concatenated with the 2-D integer training data.
pseudo_X = torch.tensor(labeled_X,dtype=torch.long).reshape(len(labeled_X),X_train.size(1))
pseudo_y = torch.tensor(label,dtype=torch.long)

train_newlabeled_dataset = TwitterDataset(torch.cat((pseudo_X,X_train),0),torch.cat((pseudo_y,y_train),0))
train_newlabeled_loader = DataLoader(train_newlabeled_dataset,batch_size=batch_size,shuffle=True)

training(batch_size=batch_size,
        n_epoch=epoch,
        lr=lr,
        train=train_newlabeled_loader,
        valid=val_loader,
        model=model,
        model_path=model_path,
        device=device,
        best_acc=best_acc)

In [None]:
# Testing

# Read the test sentences and turn them into index tensors
X_test = dr.load_test_data(test_file)

preprocess = Preprocess(X_test, sen_len, w2v_path)
embedding = preprocess.make_embedding()
test_x = preprocess.sentence_word2idx()


# data loader (unlabelled; shuffle stays off so predictions keep file order)
test_dataset = TwitterDataset(X=test_x, y=None)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=8,
)

def testing(test_loader, model, device):
    outputs_list = []
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        for inputs in test_loader:
            inputs = inputs.to(device,dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            outputs[outputs>=0.5] = 1
            outputs[outputs<0.5] = 0
            outputs_list += outputs.int().tolist()
    return outputs_list

# load the saved model and predict the test set
model = torch.load(model_path)

test_res = testing(test_loader, model, device)

# saving results: ids are the row positions 0..n-1, labels are the predictions
row_ids = list(range(len(test_x)))
df = pd.DataFrame({'id': row_ids, 'label': test_res})

print('saving result ...')
df.to_csv('predict.csv', index=False)