In [1]:
%matplotlib inline

In [2]:
# !pip uninstall transformers
!pip install transformers==3.5



In [3]:
import logging
import time
from platform import python_version
import random
from tqdm import tqdm
import re

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from sklearn.metrics import roc_auc_score
from torch.autograd import Variable

In [4]:
# Seed every random number generator used in this notebook with one value.
random_seed = 42

random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

# cuDNN auto-tuning is switched off.
torch.backends.cudnn.benchmark = False
# Left disabled, as before:
#   torch.cuda.manual_seed_all(random_seed)   # only needed with multiple GPUs
#   torch.backends.cudnn.deterministic = True

In [5]:
# Report the interpreter and library versions used for this run.
version_lines = (
    ("python version==%s", python_version()),
    ("pandas==%s", pd.__version__),
    ("numpy==%s", np.__version__),
    ("torch==%s", torch.__version__),
    ("sklearn==%s", sklearn.__version__),
    ("transformers==%s", transformers.__version__),
    ("matplotlib==%s", matplotlib.__version__),
)
for template, version in version_lines:
    print(template % version)

python version==3.6.9
pandas==1.1.5
numpy==1.19.5
torch==1.7.0+cu101
sklearn==0.22.2.post1
transformers==3.5.0
matplotlib==3.2.2


In [6]:
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# df = pd.read_csv('drive/MyDrive/train.csv')
df = pd.read_csv('drive/MyDrive/reviews.csv')
df.shape

(358957, 6)

In [9]:
df.head()

Unnamed: 0,user_id,prod_id,rating,label,date,review
0,923,0,3.0,-1,2014-12-08,The food at snack is a selection of popular Gr...
1,924,0,3.0,-1,2013-05-16,This little place in Soho is wonderful. I had ...
2,925,0,4.0,-1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,926,0,4.0,-1,2011-07-28,This is a beautiful quaint little restaurant o...
4,927,0,4.0,-1,2010-11-01,Snack is great place for a casual sit down lu...


In [10]:
# Remap the raw labels to {0, 1}: -1 becomes 0 and every other value becomes 1.
# Vectorised instead of looping over every row in Python.
# NOTE: run this on the raw labels only; once the column holds 0/1, this
# mapping would turn every value into 1.
new_label_lst = np.where(df.label == -1, 0, 1).tolist()

In [11]:
# Overwrite the label column with the remapped values.
# The previous `df.drop(['label'], axis=1)` discarded its result, so it had no
# effect; assigning to the existing column is all that is needed.
df['label'] = new_label_lst

In [12]:
df.head()

Unnamed: 0,user_id,prod_id,rating,label,date,review
0,923,0,3.0,0,2014-12-08,The food at snack is a selection of popular Gr...
1,924,0,3.0,0,2013-05-16,This little place in Soho is wonderful. I had ...
2,925,0,4.0,0,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,926,0,4.0,0,2011-07-28,This is a beautiful quaint little restaurant o...
4,927,0,4.0,0,2010-11-01,Snack is great place for a casual sit down lu...


In [13]:
# Shuffle all rows and renumber them 0..n-1.
# Passing random_state makes the shuffle independent of how much of the global
# NumPy random state was consumed before this cell ran.
df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

In [14]:
# Consecutive, non-overlapping slices of the shuffled frame:
# rows 0-9999 for training, 10000-10999 for validation, 11000-12999 for testing.
df_train = df.iloc[:10000].reset_index(drop=True)
df_val = df.iloc[10000:11000].reset_index(drop=True)
df_test = df.iloc[11000:13000].reset_index(drop=True)

In [15]:
df_train.head()

Unnamed: 0,user_id,prod_id,rating,label,date,review
0,58479,251,4.0,1,2010-05-01,"Went on a Friday night at about 5:30 PM, it wa..."
1,39714,202,4.0,1,2014-10-26,"Nice ambiance, very nice staff and good food. ..."
2,101723,523,5.0,1,2013-11-17,we went to the city for a long weekend and rea...
3,33349,100,5.0,1,2010-07-12,Foods tasted: Walnut french toast Blueberry pa...
4,23432,305,5.0,1,2014-12-16,LOVE the new space. It wasn't too overly packe...


In [16]:
# Walk through the text normalisation on a single review.
sentence = df_train.review[0].lower()
print(sentence)
# Raw strings keep `\d` / `\s` from being parsed as (invalid) string escapes.
sentence = re.sub(r'[\d]+', 'NUMBER', sentence)          # digit runs -> placeholder
sentence = re.sub(r'[^\dA-Za-z\s]+', '', sentence)       # strip punctuation
# Collapse placeholders that became adjacent once punctuation was removed
# (e.g. "5:30" -> "NUMBERNUMBER"). The text is lower-cased, so the only
# upper-case letters left belong to whole placeholders.
sentence = re.sub(r'(?:NUMBER)+', 'NUMBER', sentence)
print(sentence)

went on a friday night at about 5:30 pm, it was still early and plenty of seats were available.  its a very cool nondescript vibe in here, i liked it though.  for drinks i had the penicillin and the pomegranate sour, the former was tart and sweet. it was the better of the two.  husband had the pickle juice martini and that was really good. you really get like a nice mild pickle flavor as the drink finishes in your mouth. for food we just had a couple of appetizers: the hamachi with wasabi cream and soybeans and the pork buns.  both were delicious especially the pork buns.  i doused them with a liberal amount of sircacha and it did not make it incredibly spicy. it just added to the richness of the flavors.  very enjoyable cool place and i am looking forward to coming back here.
went on a friday night at about NUMBER pm it was still early and plenty of seats were available  its a very cool nondescript vibe in here i liked it though  for drinks i had the penicillin and the pomegranate sou

In [17]:
def make_word_dict(df):
    """Count word occurrences separately for label-1 and non-label-1 reviews.

    Each review is lower-cased, every run of digits is replaced by the
    placeholder ``NUMBER``, all remaining non-letter / non-whitespace
    characters are removed, and the text is split on whitespace.

    Args:
        df: DataFrame with a ``review`` column (strings) and a ``label`` column.

    Returns:
        Tuple ``(real_review_dict, fake_review_dict)``: word -> count for rows
        whose label equals 1, and word -> count for all other rows.
    """
    real_review_dict = dict()
    fake_review_dict = dict()

    # Pair the columns positionally so the function also works when `df` does
    # not carry a 0..n-1 index (label-based `df.review[i]` would fail there).
    rows = zip(df.review, df.label)
    for review, label in tqdm(rows, total=df.shape[0]):
        sentence = review.lower()
        sentence = re.sub(r'\d+', 'NUMBER', sentence)
        sentence = re.sub(r'[^A-Za-z\s]', '', sentence)
        # Stripping punctuation can glue placeholders together
        # ("5:30" -> "NUMBERNUMBER"); collapse them back into one. After
        # lower-casing, upper-case letters only occur inside placeholders.
        sentence = re.sub(r'(?:NUMBER)+', 'NUMBER', sentence)

        counts = real_review_dict if label == 1 else fake_review_dict
        for word in sentence.split():
            counts[word] = counts.get(word, 0) + 1

    return real_review_dict, fake_review_dict

In [18]:
real_review_dict, fake_review_dict = make_word_dict(df_train)

100%|██████████| 10000/10000 [00:01<00:00, 8957.09it/s]


In [19]:
def make_del_word_lst(threshold, fake_review_dict, real_review_dict):
  """Pick the words whose counts are too similar between the two dictionaries.

  Only words present in both dictionaries are considered. For each of them the
  smaller count is divided by the larger one; the word is selected when that
  ratio is at least `threshold`.

  Args:
    threshold: minimum smaller/larger count ratio for a word to be selected.
    fake_review_dict: word -> count mapping.
    real_review_dict: word -> count mapping.

  Returns:
    List of selected words, in the iteration order of `fake_review_dict`.
  """
  def _count_ratio(word):
    pair = (real_review_dict[word], fake_review_dict[word])
    return min(pair) / max(pair)

  return [
      word
      for word in fake_review_dict
      if word in real_review_dict and _count_ratio(word) >= threshold
  ]

In [20]:
del_word_lst = make_del_word_lst(0.01, fake_review_dict, real_review_dict)

In [21]:
def make_review_lst(df, del_word_lst):
  """Normalise every review and drop the words listed in `del_word_lst`.

  Each review is lower-cased, digit runs are replaced by the placeholder
  `<NUM>`, other non-letter / non-whitespace characters are removed, and the
  remaining words not found in `del_word_lst` are written back, each followed
  by a single space (so a non-empty result ends with a space).

  NOTE(review): the word dictionaries built by make_word_dict use the
  placeholder `NUMBER`, so `<NUM>` can never match an entry derived from
  them and is always kept -- confirm this is intended.

  Args:
    df: DataFrame with a `review` column of strings.
    del_word_lst: iterable of words to remove.

  Returns:
    List with one cleaned string per review, in row order.
  """
  # Set membership is O(1); testing against the list rescanned it per word.
  del_words = set(del_word_lst)

  new_review_lst = []
  for sentence in tqdm(df.review):
    sentence = sentence.lower()
    sentence = re.sub(r'\d+', '<NUM>', sentence)
    # This also strips the angle brackets, leaving bare "NUM" markers ...
    sentence = re.sub(r'[^A-Za-z\s]', '', sentence)
    # ... which are restored here; adjacent markers collapse into one.
    sentence = re.sub(r'(?:NUM)+', '<NUM>', sentence)
    kept = [word for word in sentence.split() if word not in del_words]
    new_review_lst.append(''.join(word + ' ' for word in kept))
  return new_review_lst

In [22]:
# Clean the reviews of each split with the same deletion list.
train_review_lst, val_review_lst, test_review_lst = (
    make_review_lst(split_df, del_word_lst)
    for split_df in (df_train, df_val, df_test)
)

100%|██████████| 10000/10000 [00:17<00:00, 558.73it/s]
100%|██████████| 1000/1000 [00:01<00:00, 567.73it/s]
100%|██████████| 2000/2000 [00:03<00:00, 561.17it/s]


In [23]:
# 필요 없다고 생각되는 col들 삭제
df_train = df_train.drop(['user_id', 'prod_id', 'date', 'review'], axis=1)
df_val = df_val.drop(['user_id', 'prod_id', 'date', 'review'], axis=1)
df_test = df_test.drop(['user_id', 'prod_id', 'date', 'review'], axis=1)
df_train.head()

Unnamed: 0,rating,label
0,4.0,1
1,4.0,1
2,5.0,1
3,5.0,1
4,5.0,1


In [24]:
# Put the cleaned review text back onto each split as the `review` column.
df_train = df_train.assign(review=train_review_lst)
df_val = df_val.assign(review=val_review_lst)
df_test = df_test.assign(review=test_review_lst)

In [25]:
df_train.head()

Unnamed: 0,rating,label,review
0,4.0,1,<NUM> nondescript penicillin martini finishes ...
1,4.0,1,
2,5.0,1,croquets
3,5.0,1,walnut switched
4,5.0,1,grit <NUM>


In [26]:
# df.comment_text[0]
df_train.review[0]

'<NUM> nondescript penicillin martini finishes hamachi soybeans liberal sircacha richness '

In [None]:
# The only prediction target in this notebook is the binary `label` column.
target_columns = ["label"]
df[target_columns].iloc[[103]]

In [None]:
df_train.shape

In [None]:
df_val.shape

In [None]:
df_test.shape

In [None]:
model_class = transformers.BertModel
tokenizer_class = transformers.BertTokenizer
pretrained_weights='bert-base-uncased'

In [None]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
bert_model = model_class.from_pretrained(pretrained_weights)

In [None]:
max_seq = 30

In [None]:
def tokenize_text(df, max_seq):
    """Encode every review into a list of at most `max_seq` token ids.

    Uses the module-level `tokenizer`. Truncation is delegated to the
    tokenizer (`max_length` / `truncation=True`) so that a shortened sequence
    still ends with its closing special token; slicing the encoded list
    afterwards would cut that token off.

    Args:
        df: DataFrame with a `review` column of strings.
        max_seq: maximum number of ids per review, special tokens included.

    Returns:
        List of token-id lists, one per row of `df`.
    """
    return [
        tokenizer.encode(text, add_special_tokens=True, max_length=max_seq, truncation=True)
        for text in df.review.values
    ]


def pad_text(tokenized_text, max_seq, pad_id=0):
    """Right-pad token-id lists into a rectangular integer array.

    Args:
        tokenized_text: iterable of token-id sequences.
        max_seq: width of the returned array.
        pad_id: id used to fill the unused positions (defaults to 0).

    Returns:
        `np.ndarray` of dtype int64 and shape `(len(tokenized_text), max_seq)`.
        Sequences longer than `max_seq` are cut to `max_seq` so the result is
        never ragged; an empty input yields shape `(0, max_seq)`.
    """
    rows = []
    for ids in tokenized_text:
        clipped = list(ids)[:max_seq]
        rows.append(clipped + [pad_id] * (max_seq - len(clipped)))
    # The explicit reshape keeps the array 2-D even when `rows` is empty.
    return np.array(rows, dtype=np.int64).reshape(len(rows), max_seq)


def tokenize_and_pad_text(df, max_seq):
    """Tokenize the reviews in `df`, pad them to `max_seq` ids and return a tensor.

    Chains `tokenize_text` and `pad_text`, then wraps the padded array in a
    `torch.tensor`.
    """
    return torch.tensor(pad_text(tokenize_text(df, max_seq), max_seq))


def targets_to_tensor(df, target_columns):
    """Return the selected target columns of `df` as a float32 tensor.

    Args:
        df: DataFrame holding the target columns.
        target_columns: list of column names to extract.

    Returns:
        `torch.Tensor` of dtype float32 with one row per row of `df` and one
        column per entry of `target_columns`.
    """
    target_values = df[target_columns].to_numpy()
    return torch.tensor(target_values, dtype=torch.float32)

In [None]:
lis = tokenize_text(df_train, max_seq)

In [None]:
# Show only the first few encoded reviews; printing the whole list would dump
# one id list per training row into the notebook output.
lis[:5]

In [None]:
# Token-id tensors for each split, all padded to the same `max_seq` width.
train_indices, val_indices, test_indices = (
    tokenize_and_pad_text(split_df, max_seq)
    for split_df in (df_train, df_val, df_test)
)

In [None]:
print(train_indices)

In [None]:
def _bert_embed(indices, chunk_size=100):
    """Run `bert_model` over `indices` in chunks and return the first model output.

    Args:
        indices: padded token-id tensor, one row per review.
        chunk_size: number of rows sent through the model per forward pass, so
            a whole split is never processed as one giant batch.

    Returns:
        The per-chunk outputs concatenated along the first dimension.
    """
    outputs = []
    with torch.no_grad():
        for start in range(0, len(indices), chunk_size):
            chunk = indices[start:start + chunk_size]
            # pad_text fills unused positions with id 0; mask those positions
            # so the model does not attend to padding.
            attention_mask = (chunk != 0).long()
            # Model outputs are tuples; element 0 is what the original cell used.
            outputs.append(bert_model(chunk, attention_mask=attention_mask)[0])
    return torch.cat(outputs)


x_train = _bert_embed(train_indices)
x_val = _bert_embed(val_indices)
x_test = _bert_embed(test_indices)

In [None]:
# Float32 target tensors for each split.
y_train, y_val, y_test = (
    targets_to_tensor(split_df, target_columns)
    for split_df in (df_train, df_val, df_test)
)

In [None]:
x_train[0]

In [None]:
x_train[0].shape

In [None]:
y_train[0]

In [None]:
class KimCNN(nn.Module):
    """Convolutional classifier over precomputed token embeddings.

    One 2-D convolution per kernel size slides over the token axis, each
    feature map is ReLU-activated and max-pooled over its full length, the
    pooled features are concatenated and passed through dropout, a linear
    layer and a sigmoid.

    Args:
        embed_num: number of rows of the (unused) embedding table.
        embed_dim: size D of each token embedding; also the kernel width.
        class_num: number of outputs C.
        kernel_num: number of feature maps Co per kernel size.
        kernel_sizes: iterable of kernel heights K (tokens per window).
        dropout: dropout probability applied before the linear layer.
        static: when true, the input is detached so no gradient flows back
            into whatever produced the embeddings.
    """

    def __init__(self, embed_num, embed_dim, class_num, kernel_num, kernel_sizes, dropout, static):
        super(KimCNN, self).__init__()

        self.static = static
        # NOTE(review): forward() never uses this layer. It is kept so the
        # parameter list, state_dict keys and the order of random
        # initialisation stay exactly as before.
        self.embed = nn.Embedding(embed_num, embed_dim)
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(1, kernel_num, (size, embed_dim)) for size in kernel_sizes]
        )
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(kernel_sizes) * kernel_num, class_num)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map embeddings of shape (N, W, D) to sigmoid outputs of shape (N, C)."""
        if self.static:
            # Replaces the deprecated, no-op `Variable(x)` wrapper: detach()
            # is what stops gradients from reaching the embeddings.
            x = x.detach()

        x = x.unsqueeze(1)  # (N, Ci, W, D)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)

        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)

        x = torch.cat(x, 1)
        x = self.dropout(x)  # (N, len(Ks)*Co)
        logit = self.fc1(x)  # (N, C)
        return self.sigmoid(logit)

In [None]:
# Sizes read from the precomputed tensors: dimensions 1 and 2 of x_train and
# the number of target columns in y_train.
embed_num, embed_dim = x_train.shape[1:]
class_num = y_train.shape[1]

# Convolution / regularisation settings.
kernel_num = 3
kernel_sizes = [2, 3, 4]
dropout = 0.5
static = True

In [None]:
# Arguments are passed positionally, in the order of KimCNN.__init__.
model = KimCNN(embed_num, embed_dim, class_num, kernel_num, kernel_sizes, dropout, static)

In [None]:
# Training schedule.
n_epochs, batch_size, lr = 10, 10, 0.001

# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
def generate_batch_data(x, y, batch_size):
    """Yield consecutive mini-batches of `x` and `y` with a 1-based batch number.

    Args:
        x: sliceable sequence of inputs (anything supporting `len` and slicing).
        y: sliceable sequence of targets, sliced with the same bounds as `x`.
        batch_size: maximum number of items per batch; must be positive.

    Yields:
        Tuples `(x_batch, y_batch, batch)` where `batch` counts from 1. The
        last batch holds the remaining items and may be shorter. For empty
        input a single `(x, y, 1)` tuple is yielded, as before, so callers
        that divide by the last batch number keep working.

    Raises:
        ValueError: if `batch_size` is not positive (raised on first iteration).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got %r" % (batch_size,))

    total = len(x)
    if total == 0:
        yield x, y, 1
        return

    for batch, start in enumerate(range(0, total, batch_size), 1):
        stop = start + batch_size
        yield x[start:stop], y[start:stop], batch

In [None]:
train_losses, val_losses = [], []

for epoch in range(n_epochs):
    start_time = time.time()

    # Training pass: one optimizer step per mini-batch.
    model.train(True)
    epoch_batch_losses = []
    for x_batch, y_batch, batch in generate_batch_data(x_train, y_train, batch_size):
        optimizer.zero_grad()
        y_pred = model(x_batch)
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        epoch_batch_losses.append(loss.item())

    # `batch` is the number of the last batch yielded, i.e. the batch count.
    train_loss = sum(epoch_batch_losses) / batch
    train_losses.append(train_loss)
    elapsed = time.time() - start_time  # measured before the validation pass

    # Validation pass: dropout disabled, no gradient tracking.
    model.eval()
    with torch.no_grad():
        batch = 1
        epoch_batch_losses = []
        for x_batch, y_batch, batch in generate_batch_data(x_val, y_val, batch_size):
            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)
            epoch_batch_losses.append(loss.item())
        val_loss = sum(epoch_batch_losses) / batch
        val_losses.append(val_loss)

    print(
        "Epoch %d Train loss: %.2f. Validation loss: %.2f. Elapsed time: %.2fs."
        % (epoch + 1, train_losses[-1], val_losses[-1], elapsed)
    )

In [None]:
# Per-epoch training and validation loss curves on one set of axes.
fig, ax = plt.subplots()
ax.plot(train_losses, label="Training loss")
ax.plot(val_losses, label="Validation loss")
ax.legend()
ax.set_title("Losses")

In [None]:
# Score the test split batch by batch with dropout off and autograd disabled.
model.eval()
y_preds = []
with torch.no_grad():
    for x_batch, y_batch, batch in generate_batch_data(x_test, y_test, batch_size):
        y_pred = model(x_batch)
        y_preds += y_pred.cpu().numpy().tolist()
y_preds_np = np.array(y_preds)

In [None]:
y_preds_np

In [None]:
y_test_np = df_test[target_columns].values

In [None]:
y_test_np[1000:]

In [None]:
# ROC AUC of the predicted probabilities against the true test labels.
auc_scores = roc_auc_score(y_test_np, y_preds_np, average=None)
auc_table = {"label": target_columns, "auc": auc_scores}
df_accuracy = pd.DataFrame(auc_table)
df_accuracy.sort_values('auc').iloc[::-1]

In [None]:
positive_labels = df_train[target_columns].sum().sum()
positive_labels

In [None]:
all_labels = df_train[target_columns].count().sum()
all_labels

In [None]:
positive_labels/all_labels

In [None]:
# Side-by-side table of true labels and rounded predictions, joined on the row index.
df_test_targets = df_test[target_columns]
rounded_preds = y_preds_np.round()
df_pred_targets = pd.DataFrame(rounded_preds, columns=target_columns, dtype=int)
df_sanity = df_test_targets.join(df_pred_targets, rsuffix='_pred', how='inner')

In [None]:
df_sanity

In [None]:
df_test_targets.sum()

In [None]:
df_pred_targets.sum()

In [None]:
# Rows whose true label is positive, next to what the model predicted for them.
df_sanity.loc[df_sanity.label > 0, ['label', 'label_pred']]