In [None]:
# _*_ coding: utf-8 _*_
# import
import numpy as np # linear algebra
import time
import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import numpy as np
import os
from torchtext.data import Field, Dataset, Example
import pandas as pd
from sklearn.metrics import f1_score
from torchtext import data
print(os.listdir("../input/"))
from tqdm import tqdm
tqdm.pandas()
import operator 
import re 
import pickle as pkl
from sklearn.model_selection import train_test_split
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without a usable CUDA device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# const
# Number of token slots per question in the index arrays built by
# stc2idx / TextDataset.
MAX_Q_LEN = 128
# Mini-batch size of the training DataLoader.
BATCH_SIZE = 128
# Upper bound on the number of label==0 rows kept per label==1 row when the
# training set is downsampled.
DOWNSAMPLE_RATE = 5 # ratio of label==0 to label==1

In [None]:
# read dataframe
# Load the train/test CSVs. Later cells read the "question_text" column of
# both frames and the "target" column of `train`.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
# Print (rows, columns) of each frame as a quick sanity check.
print("Train shape : ",train.shape)
print("Test shape : ",test.shape)

In [None]:
# load embedding
from gensim.models import KeyedVectors

news_path = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
# Parse the binary word2vec file into a gensim KeyedVectors object.
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)
# Copy the raw vector matrix into a float32 torch tensor.
embeddings_weight = torch.FloatTensor(embeddings_index.vectors)
# Append one extra row holding the mean of all existing rows. stc2idx maps
# words that are not in the vocabulary to this last row.
# NOTE(review): padded positions use index 0, which is a real vocabulary row,
# not a dedicated padding vector - confirm this is intended.
embeddings_weight = torch.cat((embeddings_weight, (torch.sum(embeddings_weight, dim=0)/embeddings_weight.shape[0]).unsqueeze(0)), dim=0)

In [None]:
# clean functions
def clean_text(x):
    """Normalise punctuation in a question.

    The input is converted with ``str``. ``/``, ``-`` and ``'`` each become a
    single space, ``&`` is surrounded by spaces, and every other listed
    punctuation character is deleted.
    """
    # Start with "delete" for every punctuation mark ...
    table = dict.fromkeys(map(ord, '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'), '')
    # ... then override the separators, which are turned into spaces before
    # the deletion pass would ever see them.
    for punct in "/-'":
        table[ord(punct)] = ' '
    for punct in '&':
        table[ord(punct)] = f' {punct} '
    return str(x).translate(table)

def clean_numbers(x):
    """Mask runs of digits with ``#`` characters.

    Runs of five or more digits become ``#####``; runs of four, three and two
    digits become the same number of ``#``. Single digits are left alone.
    """
    # Order matters: longer runs must be masked before shorter patterns run.
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, replacement in masks:
        x = re.sub(pattern, replacement, x)
    return x

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Replacement table used by replace_typical_misspell: each key is rewritten to
# its value. The keys are joined into a plain regex alternation without word
# boundaries, so they also match inside longer words.
mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'

                }
# Compile the regex once; `mispellings` is the same dict object as mispell_dict.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Return ``text`` with every match of ``mispellings_re`` replaced by the
    corresponding entry of ``mispellings``."""
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

def remove_stopword(sentences):
    """Strip the words 'a', 'to', 'of' and 'and' from tokenised sentences.

    ``sentences`` is an iterable of word lists; it is wrapped in ``tqdm`` so
    progress is shown. A new list of filtered word lists is returned.
    """
    to_remove = ['a','to','of','and']
    filtered = []
    for sentence in tqdm(sentences):
        filtered.append(list(filter(lambda word: word not in to_remove, sentence)))
    return filtered

In [None]:
# clean
# Apply the three normalisation passes, in order, to the question text of the
# train frame and then of the test frame.
for frame in (train, test):
    for cleaner in (clean_text, clean_numbers, replace_typical_misspell):
        frame["question_text"] = frame["question_text"].progress_apply(lambda x: cleaner(x))
to_remove = ['a','to','of','and']

# Split each question on whitespace and drop the stop words.
train_txt = remove_stopword(train["question_text"].progress_apply(lambda x: x.split()))
test_stc = remove_stopword(test["question_text"].progress_apply(lambda x: x.split()))
# Training labels as a plain Python list, in row order.
label = list(train["target"].progress_apply(lambda x: x))

In [None]:
# Dataset
def stc2idx(stc):
    """Encode a tokenised sentence as a fixed-length array of embedding rows.

    Args:
        stc: sequence of word strings.

    Returns:
        ``(indices, length)`` where ``indices`` is an int64 array of shape
        ``(MAX_Q_LEN,)`` (unused trailing slots stay 0) and ``length`` is the
        number of filled slots, i.e. ``min(len(stc), MAX_Q_LEN)``.

    Words that are not in the embedding vocabulary map to the last row of
    ``embeddings_weight``. Sentences longer than ``MAX_Q_LEN`` are truncated;
    previously they raised an IndexError from inside the exception handler.
    """
    oov_index = embeddings_weight.shape[0] - 1
    # gensim >= 4 exposes a word -> row dict as `key_to_index`; older releases
    # keep per-word objects with an `.index` attribute in `vocab`.
    key_to_index = getattr(embeddings_index, 'key_to_index', None)
    n_tokens = min(len(stc), MAX_Q_LEN)
    # np.int was removed from NumPy; int64 is also what torch embeddings expect.
    m = np.zeros((MAX_Q_LEN,), dtype=np.int64)
    for i in range(n_tokens):
        word = stc[i]
        if key_to_index is not None:
            m[i] = key_to_index.get(word, oov_index)
        else:
            try:
                m[i] = embeddings_index.vocab[word].index
            except KeyError:
                # Only a missing word is expected here; let other errors surface.
                m[i] = oov_index
    return m, n_tokens
                 
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset of index-encoded sentences with optional labels.

    Items are ``(indices, length)`` for unlabeled data and
    ``(indices, length, label)`` when labels were supplied.
    """

    def __init__(self, txt, label=None):
        """Encode every sentence up front.

        Args:
            txt: sequence of tokenised sentences; each is passed to stc2idx.
            label: optional sequence of integer labels, one per sentence.

        Raises:
            ValueError: if ``label`` is given but its length differs from
                ``txt``.
        """
        # np.int was removed from NumPy; use an explicit 64-bit integer type.
        self.txt = np.zeros((len(txt), MAX_Q_LEN), dtype=np.int64)
        self.len = np.zeros((len(txt),), dtype=np.int64)
        for i, stc in enumerate(txt):
            self.txt[i], self.len[i] = stc2idx(stc)
        # Test against None instead of truthiness: a numpy array of labels
        # would raise on `if label:` and an empty list would be dropped.
        if label is not None:
            self.label = np.array(label, dtype=np.int64)
            if len(self.label) != len(txt):
                raise ValueError(
                    "label has %d entries but txt has %d sentences"
                    % (len(self.label), len(txt)))
        else:
            self.label = None

    def __len__(self):
        """Number of sentences in the dataset."""
        return self.txt.shape[0]

    def __getitem__(self, idx):
        """Return the encoded sentence, its length and (if present) its label."""
        if self.label is None:
            return self.txt[idx], self.len[idx]
        return self.txt[idx], self.len[idx], self.label[idx]

In [None]:
# dataset
# Downsample the majority class: keep every label==1 row and at most
# DOWNSAMPLE_RATE label==0 rows per label==1 row.
# NOTE(review): the label==0 rows kept are the first ones in file order, not a
# random sample - confirm the CSV is not ordered in a way that biases this.
label_array = np.asarray(label)
label_1 = np.where(label_array == 1)[0]
label_0 = np.where(label_array == 0)[0]
label_0 = label_0[:DOWNSAMPLE_RATE * len(label_1)]  # slicing past the end is safe
train_idx = np.concatenate((label_1, label_0))

train_stc = [train_txt[idx] for idx in train_idx]
train_label = [label[idx] for idx in train_idx]
train_dataset = TextDataset(train_stc, train_label)
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)

test_dataset = TextDataset(test_stc)
# Inference needs no tiny batches: reuse BATCH_SIZE (was 2, i.e. 64x more
# forward passes) and use the public DataLoader path like the train loader.
# Order is preserved (shuffle=False) so predictions line up with test rows.
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class HighLinear(nn.Module):
    """Two-layer classifier over frozen pretrained word embeddings.

    Computes ``softmax(avgpool(tanh(E @ W_0 + b_0)) @ W_1 + b_1)`` where ``E``
    is the embedded input. The input must be an integer tensor of shape
    ``(batch, txt_length)``; the output has shape ``(batch, 2)`` and each row
    sums to 1.

    The weights are registered as ``nn.Parameter`` so that ``.to(device)``,
    ``state_dict()`` and ``named_parameters()`` see them. They are created on
    the default device; move the module with ``.to(device)``.
    """

    def __init__(self, txt_length, embeddings_length):
        """
        Args:
            txt_length: number of token positions per input sentence.
            embeddings_length: width of one embedding vector.
        """
        super(HighLinear, self).__init__()

        self.txt_length = txt_length
        self.embeddings_length = embeddings_length
        self.embed = torch.nn.Embedding.from_pretrained(embeddings_weight, freeze=True)
        self.W_0 = nn.Parameter(torch.randn(embeddings_length, txt_length))
        self.b_0 = nn.Parameter(torch.randn(txt_length, txt_length))
        self.act_0 = nn.Tanh()
        self.avgpool = nn.AvgPool1d(kernel_size=txt_length)
        self.W_1 = nn.Parameter(torch.randn(txt_length, 2))
        self.b_1 = nn.Parameter(torch.randn(2))
        self.act_1 = nn.Softmax(dim=1)

    def forward(self, stc):
        """Return class probabilities of shape ``(batch, 2)``."""
        # (batch, txt_length, emb) @ (emb, txt_length) -> (batch, txt_length, txt_length)
        tmp = self.act_0(self.embed(stc) @ self.W_0 + self.b_0)
        # Pool away only the last axis. A bare squeeze() would also drop the
        # batch axis when the batch holds a single sentence.
        tmp = self.avgpool(tmp).squeeze(-1)
        tmp = self.act_1(tmp @ self.W_1 + self.b_1)
        return tmp

    def parameters(self, recurse=True):
        """Return the four trainable tensors (the frozen embedding is excluded).

        ``recurse`` is accepted only for signature compatibility with
        ``nn.Module.parameters`` and is ignored.
        """
        return self.W_0, self.b_0, self.W_1, self.b_1

In [None]:
class CNN(nn.Module):
    """Pointwise-convolution classifier over frozen pretrained embeddings.

    Pipeline: embedding lookup -> Conv1d (kernel size 1, 300 filters) -> ReLU
    -> max over the sequence -> Linear(300, 2) -> softmax.

    Input: integer tensor of shape ``(batch, MAX_Q_LEN)``.
    Output: class probabilities of shape ``(batch, 2)``.
    """

    def __init__(self):
        super(CNN, self).__init__()
        # Width of one embedding vector; drives the conv input channels
        # instead of a hard-coded 300.
        channels = embeddings_weight.shape[-1]

        self.embed = torch.nn.Embedding.from_pretrained(embeddings_weight, freeze=True)
        self.CNN = torch.nn.Sequential(nn.Conv1d(in_channels=channels, out_channels=300, kernel_size=1),
                                       nn.ReLU(),
                                       # Pool over the whole padded sentence.
                                       nn.MaxPool1d(MAX_Q_LEN))
        self.fc0 = nn.Linear(in_features=300, out_features=2)
        # Explicit class axis; the implicit-dim form is deprecated.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, stc):
        """Return class probabilities of shape ``(batch, 2)``."""
        # Conv1d wants (batch, channels, length).
        tmp = self.embed(stc).permute(0, 2, 1)
        # Drop only the pooled length axis. A bare squeeze() would also remove
        # the batch axis for a batch of one and break argmax(dim=1) downstream.
        tmp = self.CNN(tmp).squeeze(-1)
        tmp = self.fc0(tmp)
        tmp = self.softmax(tmp)

        return tmp

In [None]:
class CNN_1(nn.Module):
    """Wider variant of the pointwise-convolution classifier.

    Pipeline: embedding lookup -> Conv1d (kernel size 1, 1200 filters) -> ReLU
    -> max over the sequence -> Linear(1200, 300) -> Sigmoid -> Linear(300, 2)
    -> softmax.

    Input: integer tensor of shape ``(batch, MAX_Q_LEN)``.
    Output: class probabilities of shape ``(batch, 2)``.
    """

    def __init__(self):
        super(CNN_1, self).__init__()
        # Width of one embedding vector; drives the conv input channels
        # instead of a hard-coded 300.
        channels = embeddings_weight.shape[-1]

        self.embed = torch.nn.Embedding.from_pretrained(embeddings_weight, freeze=True)
        self.CNN = torch.nn.Sequential(nn.Conv1d(in_channels=channels, out_channels=1200, kernel_size=1),
                                       nn.ReLU(),
                                       # Pool over the whole padded sentence.
                                       nn.MaxPool1d(MAX_Q_LEN))
        self.fc = torch.nn.Sequential(nn.Linear(in_features=1200, out_features=300),
                                      nn.Sigmoid(),
                                      nn.Linear(in_features=300, out_features=2))
        # Explicit class axis; the implicit-dim form is deprecated.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, stc):
        """Return class probabilities of shape ``(batch, 2)``."""
        # Conv1d wants (batch, channels, length).
        tmp = self.embed(stc).permute(0, 2, 1)
        # Drop only the pooled length axis. A bare squeeze() would also remove
        # the batch axis for a batch of one and break argmax(dim=1) downstream.
        tmp = self.CNN(tmp).squeeze(-1)
        tmp = self.fc(tmp)
        tmp = self.softmax(tmp)

        return tmp

In [None]:
# Build the CNN_1 classifier and move its registered parameters to `device`.
model = CNN_1().to(device)

In [None]:
# train
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
CE = nn.CrossEntropyLoss()
# The models in this notebook end in a Softmax layer, i.e. they return
# probabilities. nn.CrossEntropyLoss applies log-softmax itself, so feeding it
# probabilities squashes them twice and flattens the gradients. Cross-entropy
# on probabilities is NLLLoss applied to their logarithm, so `CE` is kept
# defined but not used for training.
nll_loss = nn.NLLLoss()
PROB_EPS = 1e-12  # keeps log() finite when a probability underflows to 0
factor = 10000  # only used by the optional schedule extension noted below
# One learning rate per epoch; extend with e.g. [1e-6]*factor for longer runs.
learning_rate = [1e-3]*3+[1e-4]*15+[1e-5]*2
total_loss = []  # per-batch training losses over all epochs
start_time = time.time()

# Create the optimizer once so Adam's moment estimates survive across epochs;
# only the learning rate is changed per epoch.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate[0])
model.train()

for epoch, lr in enumerate(learning_rate):
    for group in optimizer.param_groups:
        group['lr'] = lr
    epoch_loss = []
    for stc, stc_len, target in train_dataloader:
        # Drop questions that became empty after cleaning instead of skipping
        # the whole batch because of them.
        keep = stc_len > 0
        if not keep.any():
            continue
        stc = stc[keep].to(device)
        target = target[keep].to(device).long()

        pred = model(stc)
        loss = nll_loss(torch.log(torch.clamp(pred, min=PROB_EPS)), target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss.append(loss.item())
    total_loss.extend(epoch_loss)
    if epoch_loss:
        print('time:', time.time()-start_time, ' epoch:', epoch, ' loss:', sum(epoch_loss)/len(epoch_loss))

In [None]:
# answer
ans = []
threshold = 0.5  # not needed here: argmax already yields hard 0/1 labels

# Inference only: switch to eval mode and skip autograd bookkeeping so no
# graph is built for every test batch. The previous mode is restored after.
was_training = model.training
model.eval()
with torch.no_grad():
    for stc, stc_len in test_dataloader:
        stc = stc.to(device)
        pred = model(stc)

        # Predicted class per question, in test-set order.
        ans.extend(torch.argmax(pred, dim=1).cpu().tolist())
model.train(was_training)

In [None]:
# output
threshold = 0.5
# `ans` already holds 0/1 labels; the comparison keeps the result strictly
# binary. np.int was removed from NumPy, so use an explicit integer dtype.
tmp_ans = (np.array(ans)>threshold).astype(np.int64)
# Re-read only the ids so the submission does not depend on earlier in-place
# edits of the `test` frame; row order matches the unshuffled test loader.
test_df = pd.read_csv("../input/test.csv", usecols=["qid"])
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = tmp_ans
out_df.to_csv('submission.csv', index=False)