In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-insincere-questions-classification/sample_submission.csv
/kaggle/input/quora-insincere-questions-classification/embeddings.zip
/kaggle/input/quora-insincere-questions-classification/train.csv
/kaggle/input/quora-insincere-questions-classification/test.csv


In [2]:
# Explore where the competition files are located
os.listdir('../input/quora-insincere-questions-classification')

['sample_submission.csv', 'embeddings.zip', 'train.csv', 'test.csv']

In [3]:
# Unzip the embeddings archive from the input directory into ./embeddings
!unzip ../input/quora-insincere-questions-classification/embeddings.zip -d embeddings

Archive:  ../input/quora-insincere-questions-classification/embeddings.zip
   creating: embeddings/GoogleNews-vectors-negative300/
   creating: embeddings/glove.840B.300d/
   creating: embeddings/paragram_300_sl999/
   creating: embeddings/wiki-news-300d-1M/
  inflating: embeddings/glove.840B.300d/glove.840B.300d.txt  
  inflating: embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin  
  inflating: embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec  
  inflating: embeddings/paragram_300_sl999/README.txt  
  inflating: embeddings/paragram_300_sl999/paragram_300_sl999.txt  


In [4]:
# List the embedding folders that were extracted by the unzip step
os.listdir('./embeddings')

['GoogleNews-vectors-negative300',
 'wiki-news-300d-1M',
 'glove.840B.300d',
 'paragram_300_sl999']

In [5]:
# Load the raw train/test CSVs with pandas.
# In this notebook these frames are only previewed with .head(); the model input
# is built separately from the same CSV files through torchtext.
train_df = pd.read_csv('../input/quora-insincere-questions-classification/train.csv')
df = pd.read_csv('../input/quora-insincere-questions-classification/test.csv')

In [6]:
# Preview the first rows of the training frame
train_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [7]:
# Preview the first rows of the test frame (loaded from test.csv)
df.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [8]:
# Ignore the warnings caused by torchtext.
# Note: filterwarnings(action='ignore') without a category or module filter
# silences every warning for the rest of the session, not only torchtext's.
import warnings
warnings.filterwarnings(action='ignore')

In [9]:
import torchtext
from nltk import word_tokenize

# Field definitions
# qid: kept as a default Field so the question ids can be mapped back to their
#      strings through qid.vocab when the submission is assembled.
qid = torchtext.data.Field()
# text: lower-cased, NLTK-tokenized, batch-first, padded/truncated to 70 tokens.
text = torchtext.data.Field(lower=True, batch_first=True, tokenize=word_tokenize, fix_length=70)
# target: already-numeric label, so no vocabulary is built for it.
target = torchtext.data.Field(sequential=False, use_vocab=False, is_target=True)

# Build the datasets: map CSV columns to (attribute name, Field) pairs
train = torchtext.data.TabularDataset(path='../input/quora-insincere-questions-classification/train.csv', format='csv',
                                      fields={'question_text': ('text',text),
                                              'target': ('target',target)})
test = torchtext.data.TabularDataset(path='../input/quora-insincere-questions-classification/test.csv', format='csv',
                                     fields={'qid': ('qid', qid),
                                             'question_text': ('text', text)})

In [10]:
# Build the vocabularies
text.build_vocab(train, test, min_freq=3) # min_freq: minimum number of occurrences required for a word to be added to the vocabulary
# The qid vocabulary is built from the test set only; it is used to turn the
# numericalized ids back into qid strings when the submission is created.
qid.build_vocab(test)

In [11]:
# Load the pre-trained word embeddings
from tqdm import tqdm, tqdm_notebook

glove = torchtext.vocab.Vectors('./embeddings/glove.840B.300d/glove.840B.300d.txt')
# NOTE(review): this instantiates an empty notebook progress bar and calls its .pandas() hook;
# no pandas progress_* call is visible in this notebook, so this looks like a leftover — confirm before removing.
tqdm_notebook().pandas() # progress bar

100%|█████████▉| 2196016/2196017 [04:54<00:00, 7452.87it/s]


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [12]:
text.vocab.set_vectors(glove.stoi, glove.vectors, dim = 300)
# Set the vectors of the vocab instance from a collection of tensors
# Arguments: string-to-index dictionary, vectors, dimension

In [13]:
import random

# Seed Python's RNG before the iterator is constructed.
random.seed(1234)
batch_size = 512

# Process the text in batches and convert the words to index numbers.
train_iter = torchtext.data.BucketIterator(
    dataset=train,
    batch_size=batch_size,
    shuffle=True,
    sort=False,
)

# Model

In [14]:
import torch
from torch import nn

class TextCNN(nn.Module):
    """Convolutional text classifier over frozen pre-trained embeddings.

    Each token index is embedded, several convolutions with different window
    heights are slid over the token axis, every feature map is max-pooled over
    time, and the pooled features are concatenated and projected to one logit.

    Args:
        lm: 2-D float tensor of pre-trained embedding weights
            (vocabulary size x embedding dimension). Loaded with
            ``nn.Embedding.from_pretrained``, whose default keeps it frozen.
        padding_idx: index of the padding token in the vocabulary.
        kernel_num: number of output channels per convolution.
        fixed_length: nominal padded sequence length. Kept for backward
            compatibility and stored on the instance; pooling is now taken
            over the whole time axis, so inputs of other lengths also work.
        kernel_size: iterable of convolution window heights (in tokens).
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, lm, padding_idx, kernel_num=128, fixed_length=100, kernel_size=(3, 4, 5), dropout=0.2):
        super(TextCNN, self).__init__()
        # Materialise once so generators work and the default is never shared/mutated.
        kernel_size = tuple(kernel_size)

        self.dropout = nn.Dropout(dropout)
        # Pass padding_idx at construction instead of patching the attribute afterwards.
        self.embedding = nn.Embedding.from_pretrained(lm, padding_idx=padding_idx)
        # One convolution per window height; each kernel spans the full embedding width.
        self.conv = nn.ModuleList(
            [nn.Conv2d(1, kernel_num, (k, self.embedding.embedding_dim)) for k in kernel_size]
        )
        self.fixed_length = fixed_length
        self.fc = nn.Linear(len(kernel_size) * kernel_num, 1)

    def forward(self, input):
        """Return one raw logit per example.

        Args:
            input: integer tensor of token indices; the notebook feeds
                batch-first tensors of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, 1) with unnormalised logits.
        """
        x = self.embedding(input).unsqueeze(1)  # B x 1 x L x D
        # conv -> B x Kn x (L - k + 1) x 1; drop the width axis, then take the
        # maximum over the whole time axis (global max-over-time pooling).
        pooled = [torch.tanh(conv(x)).squeeze(3).max(dim=2)[0] for conv in self.conv]  # each B x Kn
        x = torch.cat(pooled, dim=1)  # B x (Kn * number of kernels)
        y = self.dropout(x)
        logit = self.fc(y)
        return logit

In [15]:
def search_best_f1(true, pred):
    """Scan decision thresholds and return the best F1 with its threshold.

    Thresholds run over ``np.arange(0.1, 0.501, 0.01)``. A prediction counts
    as positive when its score is strictly greater than the threshold.

    Args:
        true: ground-truth labels, passed to ``f1_score`` as-is.
        pred: predicted scores; converted with ``np.array``.

    Returns:
        Tuple ``(best_f1, best_threshold)``. Both stay ``0`` when no threshold
        yields an F1 above zero.
    """
    scores = np.array(pred)
    best_f1 = 0
    best_threshold = 0
    for threshold in np.arange(0.1, 0.501, 0.01):
        current_f1 = f1_score(true, scores > threshold)
        # Strict comparison: on ties the earliest (lowest) threshold is kept.
        if current_f1 > best_f1:
            best_threshold = threshold
            best_f1 = current_f1
    return best_f1, best_threshold

def training(epoch, model, loss_func, optimizer, train_iter):
    """Train ``model`` and tune a decision threshold on the training predictions.

    Args:
        epoch: number of passes over ``train_iter``.
        model: module mapping a batch of token indices to logits.
        loss_func: callable ``loss_func(logits, targets)`` returning a scalar loss.
        optimizer: optimizer that updates the model parameters.
        train_iter: iterator yielding batches with ``text`` and ``target``
            attributes and providing ``init_epoch()``.

    Returns:
        The threshold that maximised F1 on the training predictions of the
        last epoch, or ``None`` when ``epoch`` is 0.
    """
    # Follow the device the model lives on instead of hard-coding CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    alpha_train = None  # stays None if no epoch is run
    for e in range(epoch):
        train_iter.init_epoch()
        model.train()
        losses, preds, true = [], [], []

        # Stream the batches rather than materialising the whole epoch in a list.
        for train_batch in tqdm(train_iter, 'epcoh {} training'.format(e)):
            x = train_batch.text.to(device)
            y = train_batch.target.float().to(device)
            true.append(train_batch.target.cpu().numpy())
            model.zero_grad()
            pred = model(x).view(-1)
            # Use the loss passed in by the caller, not a notebook-level global.
            loss = loss_func(pred, y)
            loss.backward()
            optimizer.step()
            preds.append(torch.sigmoid(pred).detach().cpu().numpy())
            losses.append(loss.item())

        # Flatten the per-batch arrays and pick the F1-maximising threshold.
        train_f1, alpha_train = search_best_f1(np.concatenate(true), np.concatenate(preds))
        print('epcoh {:02} - train_loss {:.4f} - train f1 {:.4f} - delta {:.4f}'.format(
                            e, np.mean(losses), train_f1, alpha_train))

    return alpha_train

In [16]:
# Weight initialization, default xavier
def init_network(model, method='xavier', exclude='embedding', seed=123):
    """Re-initialise the weights and biases of ``model`` in place.

    Args:
        model: module whose ``named_parameters()`` are initialised.
        method: ``'xavier'`` or ``'kaiming'`` select the matching normal
            initialiser; any other value falls back to ``nn.init.normal_``.
        exclude: parameters whose name contains this substring are left
            untouched (by default the embedding layer).
        seed: seed applied to the torch RNGs before initialising.

    Parameters whose name contains neither ``'weight'`` nor ``'bias'`` are skipped.
    """
    # Seed the default generator as well, so `seed` also takes effect on CPU.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    for name, w in model.named_parameters():
        if exclude in name:
            continue
        if 'weight' in name:
            # Compare strings by value; `is` only matched interned literals by accident.
            if method == 'xavier':
                nn.init.xavier_normal_(w)
            elif method == 'kaiming':
                nn.init.kaiming_normal_(w)
            else:
                nn.init.normal_(w)
        elif 'bias' in name:
            nn.init.constant_(w, 0.0)

def print_model(model, ignore='embedding'):
    """Print every parameter's name, shape and element count, then the total.

    Args:
        model: module whose ``named_parameters()`` are listed.
        ignore: parameters whose name contains this substring are neither
            printed nor counted; pass ``None`` (or an empty string) to list all.
    """
    counted = 0
    for pname, param in model.named_parameters():
        if ignore and ignore in pname:
            continue
        size = param.nelement()
        counted += size
        print('{} : {}  {} parameters'.format(pname, param.shape, size))
    print('-------'*4)
    print('Total {} parameters'.format(counted))

In [17]:
import torch
from torch import optim

# Sequence length of the `text` field; the same value is handed to the model as fixed_length.
text.fix_length = 70

# Build the classifier on top of the vocabulary's vectors and move it to the GPU.
model = TextCNN(
    text.vocab.vectors,
    padding_idx=text.vocab.stoi[text.pad_token],
    kernel_size=[1, 2, 3, 5],
    kernel_num=128,
    fixed_length=text.fix_length,
    dropout=0.2,
).cuda()

# Re-initialise the non-embedding parameters, then set up the optimizer and the loss.
init_network(model)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
loss_function = nn.BCEWithLogitsLoss()

# ignore=None lists every parameter, including the embedding matrix.
print_model(model, ignore=None)

embedding.weight : torch.Size([85128, 300])  25538400 parameters
conv.0.weight : torch.Size([128, 1, 1, 300])  38400 parameters
conv.0.bias : torch.Size([128])  128 parameters
conv.1.weight : torch.Size([128, 1, 2, 300])  76800 parameters
conv.1.bias : torch.Size([128])  128 parameters
conv.2.weight : torch.Size([128, 1, 3, 300])  115200 parameters
conv.2.bias : torch.Size([128])  128 parameters
conv.3.weight : torch.Size([128, 1, 5, 300])  192000 parameters
conv.3.bias : torch.Size([128])  128 parameters
fc.weight : torch.Size([1, 512])  512 parameters
fc.bias : torch.Size([1])  1 parameters
----------------------------
Total 25961825 parameters


In [18]:
# search_best_f1 looks up f1_score as a global, so it has to be imported before training() runs.
from sklearn.metrics import f1_score
# Train for 3 epochs; the returned value is used below as the decision threshold for the test predictions.
alpha = training(3, model, loss_function, optimizer, train_iter)


epcoh 0 training:   0%|          | 0/2552 [00:00<?, ?it/s][A
epcoh 0 training:   0%|          | 1/2552 [00:00<39:06,  1.09it/s][A
epcoh 0 training:   0%|          | 6/2552 [00:01<27:35,  1.54it/s][A
epcoh 0 training:   0%|          | 11/2552 [00:01<19:34,  2.16it/s][A
epcoh 0 training:   1%|          | 17/2552 [00:01<13:55,  3.04it/s][A
epcoh 0 training:   1%|          | 23/2552 [00:01<09:58,  4.23it/s][A
epcoh 0 training:   1%|          | 29/2552 [00:01<07:12,  5.83it/s][A
epcoh 0 training:   1%|▏         | 35/2552 [00:01<05:17,  7.93it/s][A
epcoh 0 training:   2%|▏         | 41/2552 [00:01<03:56, 10.62it/s][A
epcoh 0 training:   2%|▏         | 47/2552 [00:01<03:00, 13.92it/s][A
epcoh 0 training:   2%|▏         | 53/2552 [00:01<02:20, 17.78it/s][A
epcoh 0 training:   2%|▏         | 59/2552 [00:02<01:52, 22.07it/s][A
epcoh 0 training:   3%|▎         | 65/2552 [00:02<01:33, 26.55it/s][A
epcoh 0 training:   3%|▎         | 71/2552 [00:02<01:20, 30.97it/s][A
epcoh 0 training

epcoh 00 - train_loss 0.1140 - train f1 0.6265 - delta 0.2900



epcoh 1 training:   0%|          | 0/2552 [00:00<?, ?it/s][A
epcoh 1 training:   0%|          | 5/2552 [00:00<00:52, 48.13it/s][A
epcoh 1 training:   0%|          | 10/2552 [00:00<00:53, 47.41it/s][A
epcoh 1 training:   1%|          | 15/2552 [00:00<00:52, 47.90it/s][A
epcoh 1 training:   1%|          | 20/2552 [00:00<00:52, 48.30it/s][A
epcoh 1 training:   1%|          | 25/2552 [00:00<00:52, 48.57it/s][A
epcoh 1 training:   1%|          | 30/2552 [00:00<00:51, 48.88it/s][A
epcoh 1 training:   1%|▏         | 35/2552 [00:00<00:51, 48.85it/s][A
epcoh 1 training:   2%|▏         | 40/2552 [00:00<00:51, 49.03it/s][A
epcoh 1 training:   2%|▏         | 45/2552 [00:00<00:51, 49.07it/s][A
epcoh 1 training:   2%|▏         | 50/2552 [00:01<00:50, 49.20it/s][A
epcoh 1 training:   2%|▏         | 55/2552 [00:01<00:50, 49.21it/s][A
epcoh 1 training:   2%|▏         | 60/2552 [00:01<00:50, 49.30it/s][A
epcoh 1 training:   3%|▎         | 65/2552 [00:01<00:50, 48.93it/s][A
epcoh 1 trainin

epcoh 01 - train_loss 0.1035 - train f1 0.6641 - delta 0.2900



epcoh 2 training:   0%|          | 0/2552 [00:00<?, ?it/s][A
epcoh 2 training:   0%|          | 5/2552 [00:00<00:51, 49.21it/s][A
epcoh 2 training:   0%|          | 10/2552 [00:00<00:53, 47.70it/s][A
epcoh 2 training:   1%|          | 15/2552 [00:00<00:52, 48.26it/s][A
epcoh 2 training:   1%|          | 20/2552 [00:00<00:52, 48.48it/s][A
epcoh 2 training:   1%|          | 26/2552 [00:00<00:51, 49.05it/s][A
epcoh 2 training:   1%|▏         | 32/2552 [00:00<00:51, 49.35it/s][A
epcoh 2 training:   1%|▏         | 38/2552 [00:00<00:50, 49.59it/s][A
epcoh 2 training:   2%|▏         | 44/2552 [00:00<00:50, 49.76it/s][A
epcoh 2 training:   2%|▏         | 49/2552 [00:00<00:50, 49.38it/s][A
epcoh 2 training:   2%|▏         | 54/2552 [00:01<00:50, 49.09it/s][A
epcoh 2 training:   2%|▏         | 59/2552 [00:01<00:51, 48.83it/s][A
epcoh 2 training:   3%|▎         | 64/2552 [00:01<00:50, 48.95it/s][A
epcoh 2 training:   3%|▎         | 69/2552 [00:01<00:50, 48.90it/s][A
epcoh 2 trainin

epcoh 02 - train_loss 0.0982 - train f1 0.6825 - delta 0.3100


# Prediction

In [19]:
def predict(model, test_list):
    """Score every batch in ``test_list`` and return the sigmoid probabilities.

    Args:
        model: module mapping a batch of token indices to logits.
        test_list: iterable of batches exposing a ``text`` tensor attribute.

    Returns:
        Flat Python list of floats, one probability per example, in the order
        the batches (and the rows inside them) were given.

    The model is switched to evaluation mode and left in that mode.
    """
    # Follow the device the model lives on instead of hard-coding CUDA.
    first_param = next(model.parameters(), None)
    # Evaluation mode is loop-invariant, so set it once up front.
    model.eval()
    pred = []
    with torch.no_grad():
        for test_batch in test_list:
            x = test_batch.text
            if first_param is not None:
                x = x.to(first_param.device)
            pred += torch.sigmoid(model(x).view(-1)).cpu().numpy().tolist()
    return pred

In [20]:
# Materialise the test batches once so the same batches can be iterated again later.
test_list = list(
    torchtext.data.BucketIterator(
        dataset=test,
        batch_size=batch_size,
        sort=False,
        train=False,
    )
)

In [21]:
# Score the test batches and assemble the submission frame.
preds = predict(model, test_list)
sub = pd.DataFrame()
# Map the numericalized qids of every batch back to their original strings,
# walking the batches in the same order that predict() consumed them.
sub['qid'] = [qid.vocab.itos[j] for i in test_list for j in i.qid.view(-1).numpy()]
# predict() returns a plain list; convert it explicitly so the comparison is
# element-wise for any numeric `alpha` (a list compared with a plain Python
# number raises TypeError).
# NOTE(review): `alpha` was tuned on training-set predictions, not on a held-out split.
sub['prediction'] = (np.asarray(preds) > alpha).astype(int)

In [22]:
# Preview the first rows of the submission frame
sub.head()

Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,1
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0


In [23]:
# Write the submission to the working directory; index=False keeps the row index out of the CSV
sub.to_csv("submission.csv", index=False)