In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.utils import to_categorical

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use matplotlib's 'ggplot' style sheet for every plot drawn after this point.
plt.style.use('ggplot')


In [None]:
# Read the competition's train.csv from the Kaggle input directory into a DataFrame.
df_train = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/train.csv')

In [None]:
# Show the first rows of the training frame as a quick look at its columns.
df_train.head()

In [None]:
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup
import nltk

# Tokenizer that extracts runs of word characters, which drops punctuation.
regex = RegexpTokenizer(r'\w+')

# nltk.download is a Python function. Prefixing it with "!" would send the line
# to the system shell, where it is not a valid command and nothing is fetched.
nltk.download('stopwords')

# English stopword list used by preprocess() to filter tokens.
stopword = nltk.corpus.stopwords.words('english')



In [None]:
def preprocess(text):
    """Normalise one question string for tokenisation.

    The text is lower-cased, split into word tokens with the module-level
    ``regex`` tokenizer, and every token found in ``stopword`` is discarded.

    Args:
        text: Raw question string.

    Returns:
        The surviving tokens joined by single spaces, or the placeholder
        ``"_nan_"`` when no token survives the filtering.
    """
    kept = [token for token in regex.tokenize(text.lower()) if token not in stopword]
    return ' '.join(kept) if kept else "_nan_"

In [None]:
# Run preprocess() over every entry of the 'question_text' column to get the cleaned strings.
df_text = df_train['question_text'].map(preprocess)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
vocab_size = 10000  # passed to Tokenizer as num_words
embedding_dim = 16  # NOTE(review): not referenced by the cells shown here; both models hard-code 100
max_length = 120  # passed to pad_sequences as maxlen
trunc_type='post'  # passed to pad_sequences as truncating
oov_tok = "<OOV>"  # passed to Tokenizer as oov_token


In [None]:
# Fit a Keras Tokenizer on the cleaned questions, using the vocabulary cap and
# out-of-vocabulary token configured earlier in the notebook.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df_text)
# Vocabulary mapping exposed by the fitted tokenizer; its length is used later
# to size the embedding layers.
word_index = tokenizer.word_index
# Turn each cleaned question into a sequence of integer ids, then pad/truncate
# every sequence to max_length (truncation side given by trunc_type).
sequences = tokenizer.texts_to_sequences(df_text)
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)



In [None]:
# Number of rows the embedding table needs.
# - The tokenizer is capped at `vocab_size`, so texts_to_sequences never emits
#   an id >= vocab_size; a larger table would only hold unused rows.
# - With a smaller corpus, ids run 1..len(word_index) (0 is the padding value),
#   so the table needs len(word_index) + 1 rows to keep the largest id in range.
word_counts = min(vocab_size, len(word_index) + 1)

In [None]:
from tensorflow.keras.layers import Bidirectional, Dense, LSTM,Embedding, Dropout
from tensorflow.keras.models import Sequential
def model():
    """Assemble the (uncompiled) Keras classifier.

    The stack is: embedding (``word_counts`` x 100, inputs of length
    ``max_length``) -> bidirectional LSTM(150) returning sequences ->
    dropout(0.2) -> bidirectional LSTM(100) -> dense(100, relu) ->
    dense(1, sigmoid).

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    layers = [
        Embedding(word_counts, 100, input_length=max_length),
        Bidirectional(LSTM(150, return_sequences=True)),
        Dropout(0.2),
        Bidirectional(LSTM(100)),
        Dense(100, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

In [None]:
# Build the Keras network.
# NOTE: this rebinds the name `model` from the builder function to the built
# network, so re-running this cell without first re-running the cell that
# defines model() would call the network object instead of the builder.
model = model()

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy as the reported metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Values of the 'target' column, taken from the same frame (and row order) that `padded` was built from.
labels = df_train['target'].values

In [None]:
# Train the Keras model for 6 epochs. Model.evaluate() only scores a model and
# does not accept `epochs`; Model.fit() is the call that actually trains.
model.fit(padded, labels, batch_size=32, verbose=1, epochs=6)

In [None]:
# PyTorch version of the model, intended to train faster.
# Number of rows the embedding table needs.
# - Token ids produced by the tokenizer are capped below `vocab_size`.
# - With a smaller corpus, ids run 1..len(word_index) (0 is the padding value),
#   so len(word_index) + 1 rows are required to keep the largest id in range.
word_counts = min(vocab_size, len(word_index) + 1)
max_length = 120


In [None]:
from torch import optim, nn
import torchvision
import torch.nn.functional as F
class deep4layers(nn.Module):
    """Text classifier: embedding -> BiLSTM -> dropout -> LSTM -> flatten -> dense -> sigmoid.

    Args:
        num_embeddings: Number of rows in the embedding table. Defaults to the
            notebook-level ``word_counts``.
        seq_len: Length of every padded input sequence; it fixes the width of
            the first dense layer. Defaults to the notebook-level ``max_length``.

    ``forward`` takes an integer tensor of token ids shaped ``(batch, seq_len)``
    and returns a ``(batch, 1)`` tensor of probabilities in ``[0, 1]``.
    """

    def __init__(self, num_embeddings=None, seq_len=None):
        super().__init__()
        if num_embeddings is None:
            num_embeddings = word_counts
        if seq_len is None:
            seq_len = max_length

        # The third positional argument of nn.Embedding is `padding_idx`, not a
        # sequence length. Passing max_length there froze the embedding of
        # token id 120. Id 0 is the value pad_sequences pads with.
        self.embedding = nn.Embedding(num_embeddings, 100, padding_idx=0)
        # The second LSTM emits 100 features per time step and the output is
        # flattened, so the dense layer sees seq_len * 100 inputs.
        self.dense1 = nn.Linear(seq_len * 100, 100)
        self.dense2 = nn.Linear(100, 1)
        self.drop = nn.Dropout(0.2)
        # batch_first=True: inputs are (batch, seq, feature). With the default
        # (seq, batch, feature) layout the LSTMs would recur across the samples
        # of a batch instead of across the tokens of each sample.
        self.bidirectional1 = nn.LSTM(100, 150, 1, bidirectional=True, batch_first=True)
        # NOTE: despite its name this layer is unidirectional; the attribute
        # name is kept so existing references and checkpoints stay valid.
        self.bidirectional2 = nn.LSTM(300, 100, 1, bidirectional=False, batch_first=True)
        self.flat = nn.Flatten()

    def forward(self, x):
        x = self.embedding(x)              # (batch, seq_len, 100)
        x, _ = self.bidirectional1(x)      # (batch, seq_len, 300): 150 per direction
        x = self.drop(x)
        x, _ = self.bidirectional2(x)      # (batch, seq_len, 100)
        x = self.flat(x)                   # (batch, seq_len * 100)
        x = F.relu(self.dense1(x))
        x = F.sigmoid(self.dense2(x))      # (batch, 1)
        return x



In [None]:
import torch

# Use the GPU when one is available and fall back to the CPU otherwise; an
# unconditional .cuda() raises on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# NOTE: this rebinds `deep4layers` from the class to the instantiated network,
# so the class definition cell must be re-run before this cell is run again.
deep4layers = deep4layers().to(device)
# Binary cross-entropy on probabilities (the network ends in a sigmoid).
loss_f = nn.BCELoss()
optimer = optim.Adam(params=deep4layers.parameters(), lr=0.01)

In [None]:
class Dataset():
    """Map-style dataset pairing each padded token sequence with its label.

    Args:
        x: 2-D array of samples, indexed as ``x[index, :]``. Defaults to the
            notebook-level ``padded``.
        y: Sequence of targets aligned with ``x``. Defaults to the
            notebook-level ``labels``.

    Raises:
        ValueError: If ``x`` and ``y`` do not contain the same number of rows.
    """
    def __init__(self, x=None, y=None):
        # Resolve the defaults at construction time so the instance keeps its
        # own references even if the notebook globals are rebound later.
        self.x = padded if x is None else x
        self.y = labels if y is None else y
        if len(self.x) != len(self.y):
            raise ValueError(
                "x and y must have the same length, got %d and %d"
                % (len(self.x), len(self.y))
            )
        self.len = len(self.y)
    def __len__(self):
        # Report the size captured by this instance, not the global `labels`.
        return self.len
    def __getitem__(self, index):
        return self.x[index, :], self.y[index]


In [None]:
from torch.utils.data import DataLoader
# Serve the Dataset in mini-batches of 32 samples.
# NOTE(review): no `shuffle` argument is passed, so batch order follows the
# DataLoader default — confirm that is what is wanted for training.
dataloader = DataLoader(Dataset(), batch_size=32)

In [None]:
import torch
# Check whether CUDA is available and choose the first GPU or the CPU accordingly.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')
# Show which of the two was selected (True means GPU).
print(use_cuda)

In [None]:
from tqdm import tqdm
from torch.autograd import Variable

def train(epochs):
    """Train the notebook-level ``deep4layers`` network.

    Uses the module-level ``dataloader``, ``loss_f``, ``optimer`` and
    ``device`` objects.

    Args:
        epochs: Number of full passes over ``dataloader``.

    Returns:
        list[float]: The summed batch loss of each epoch, in epoch order.
    """
    deep4layers.train()  # make sure dropout is active while fitting
    loss_acc = []
    for _ in range(epochs):
        loss_e = 0.0
        for x, y in tqdm(dataloader):
            # .to(device) works for CPU and CUDA devices alike, whereas
            # .cuda(device) raises when `device` is the CPU. The deprecated
            # Variable wrapper is not needed: tensors track gradients directly.
            x = x.to(device).long()
            y = y.to(device).float().reshape(-1)

            # The network emits (batch, 1) but the targets are (batch,);
            # BCELoss requires both to have the same shape.
            ypred = deep4layers(x).reshape(-1)
            loss = loss_f(ypred, y)
            optimer.zero_grad()
            loss.backward()
            optimer.step()
            # .item() yields a plain float, so the running total does not keep
            # every batch's loss tensor (and its autograd history) alive.
            loss_e += loss.item()
        loss_acc.append(loss_e)
    return loss_acc


In [None]:
# Run 10 training epochs; the cell output is the list returned by train().
train(10)

In [None]:
# Reset the recorded peak of CUDA memory allocated by tensors.
# NOTE(review): this resets a statistic only and is not expected to free any
# memory. Recent PyTorch documentation marks this call as deprecated in favour
# of torch.cuda.reset_peak_memory_stats() — confirm against the installed version.
torch.cuda.reset_max_memory_allocated()