In [1]:
%matplotlib inline

In [2]:
# !pip uninstall transformers
!pip install transformers==3.5



In [3]:
import logging
import time
from platform import python_version
import random
from tqdm import tqdm
import re

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from sklearn.metrics import roc_auc_score
from torch.autograd import Variable

In [4]:
random_seed = 42

torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
# torch.cuda.manual_seed_all(random_seed) # if use multi-GPU
# torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(random_seed)
random.seed(random_seed)

In [5]:
print("python version==%s" % python_version())
print("pandas==%s" % pd.__version__)
print("numpy==%s" % np.__version__)
print("torch==%s" % torch.__version__)
print("sklearn==%s" % sklearn.__version__)
print("transformers==%s" % transformers.__version__)
print("matplotlib==%s" % matplotlib.__version__)

python version==3.6.9
pandas==1.1.5
numpy==1.19.5
torch==1.7.0+cu101
sklearn==0.22.2.post1
transformers==3.5.0
matplotlib==3.2.2


In [6]:
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# df = pd.read_csv('drive/MyDrive/train.csv')
df = pd.read_csv('drive/MyDrive/reviews.csv')
df.shape

(358957, 6)

In [9]:
df.head()

Unnamed: 0,user_id,prod_id,rating,label,date,review
0,923,0,3.0,-1,2014-12-08,The food at snack is a selection of popular Gr...
1,924,0,3.0,-1,2013-05-16,This little place in Soho is wonderful. I had ...
2,925,0,4.0,-1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,926,0,4.0,-1,2011-07-28,This is a beautiful quaint little restaurant o...
4,927,0,4.0,-1,2010-11-01,Snack is great place for a casual sit down lu...


In [10]:
def make_id_dict(df):
    user_id_dict = {}
    prod_id_dict = {}

    for idx in tqdm(range(df.shape[0])):
        if df.user_id[idx] in user_id_dict:
            user_id_dict[df.user_id[idx]] += 1
        else:
            user_id_dict[df.user_id[idx]] = 1

        if df.prod_id[idx] in prod_id_dict:
            prod_id_dict[df.prod_id[idx]] += 1        
        else:
            prod_id_dict[df.prod_id[idx]] = 1
        
    return user_id_dict, prod_id_dict

In [11]:
user_id_dict, prod_id_dict = make_id_dict(df)

100%|██████████| 358957/358957 [00:14<00:00, 25157.19it/s]


In [12]:
user_id_count = []
prod_id_count = []

for idx in tqdm(range(df.shape[0])):
    user_id_count.append(user_id_dict[df.user_id[idx]])
    prod_id_count.append(prod_id_dict[df.prod_id[idx]])

100%|██████████| 358957/358957 [00:07<00:00, 48552.43it/s]


In [13]:
df['user_id_count'] = user_id_count
df['prod_id_count'] = prod_id_count

In [14]:
df.head()

Unnamed: 0,user_id,prod_id,rating,label,date,review,user_id_count,prod_id_count
0,923,0,3.0,-1,2014-12-08,The food at snack is a selection of popular Gr...,39,210
1,924,0,3.0,-1,2013-05-16,This little place in Soho is wonderful. I had ...,1,210
2,925,0,4.0,-1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...,2,210
3,926,0,4.0,-1,2011-07-28,This is a beautiful quaint little restaurant o...,1,210
4,927,0,4.0,-1,2010-11-01,Snack is great place for a casual sit down lu...,5,210


In [15]:
new_label_lst = []
for ele in df.label:
    if ele == -1:
        new_label_lst.append(0)
    else:
        new_label_lst.append(1)

In [16]:
df.drop(['label'], axis=1)
df['label'] = new_label_lst

In [17]:
df.head()

Unnamed: 0,user_id,prod_id,rating,label,date,review,user_id_count,prod_id_count
0,923,0,3.0,0,2014-12-08,The food at snack is a selection of popular Gr...,39,210
1,924,0,3.0,0,2013-05-16,This little place in Soho is wonderful. I had ...,1,210
2,925,0,4.0,0,2013-07-01,ordered lunch for 15 from Snack last Friday. ...,2,210
3,926,0,4.0,0,2011-07-28,This is a beautiful quaint little restaurant o...,1,210
4,927,0,4.0,0,2010-11-01,Snack is great place for a casual sit down lu...,5,210


In [18]:
df = df.sample(frac=1, random_state=random_seed)
df = df.reset_index(drop=True)

In [19]:
is_real = df['label'] == 1
df_real = df[is_real]

is_fake = df['label'] == 0
df_fake = df[is_fake]

assert df.shape[0] == df_real.shape[0] + df_fake.shape[0]

In [20]:
df_train_real = df_real[:10000].reset_index(drop=True)
df_val_real = df_real[10000:11000].reset_index(drop=True)
df_test_real = df_real[11000:13000].reset_index(drop=True)

df_train_fake = df_fake[:10000].reset_index(drop=True)
df_val_fake = df_fake[10000:11000].reset_index(drop=True)
df_test_fake = df_fake[11000:13000].reset_index(drop=True)

In [21]:
df_train = pd.concat([df_train_real, df_train_fake])
df_val = pd.concat([df_val_real, df_val_fake])
df_test = pd.concat([df_test_real, df_test_fake])

In [22]:
df_train = df_train.sample(frac=1, random_state=random_seed).reset_index(drop=True)
df_val = df_val.sample(frac=1, random_state=random_seed).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=random_seed).reset_index(drop=True)

In [23]:
df_train.head()

Unnamed: 0,user_id,prod_id,rating,label,date,review,user_id_count,prod_id_count
0,85283,417,5.0,0,2008-10-20,i moved to greenpoint about a year ago and wa...,1,998
1,2612,7,2.0,1,2014-12-29,Usually I love this place. My parents ask to c...,1,677
2,12271,247,3.0,1,2010-09-18,"The ramen tastes like the ones in Japan, which...",26,7378
3,4872,613,3.0,1,2011-04-05,I like this place. I liked the bartender. I li...,21,369
4,138308,752,4.0,0,2010-12-29,"This place has great chicken and waffles, so g...",1,1217


In [24]:
sentence = df_train.review[0].lower()
print(sentence)
sentence = re.sub('[\d]+', 'NUMBER', sentence)
sentence = re.sub('[^\dA-Za-z\s]+', '', sentence)
sentence = re.sub('[NUMBER]+', 'NUMBER ', sentence)
print(sentence)

# print(sentence.split())

i moved to greenpoint  about a year ago and was a lil sad about the cafe situation in the area... glad this place has opened!!! i can get a kick ass coffee on the way to work and i admit i have been stopping by on my way home for oysters/cheese and their addictive sidecars. the staff is nice (hot) and it's a good addition to the hood.
i moved to greenpoint  about a year ago and was a lil sad about the cafe situation in the area glad this place has opened i can get a kick ass coffee on the way to work and i admit i have been stopping by on my way home for oysterscheese and their addictive sidecars the staff is nice hot and its a good addition to the hood


In [25]:
def make_word_dict(df):
    real_review_dict = dict()
    fake_review_dict = dict()

    for i in tqdm(range(df.shape[0])):
        # word_lst = df.review[i].lower().split()
        
        sentence = df.review[i].lower()
        sentence = re.sub('[\d]+', 'NUMBER', sentence)
        sentence = re.sub('[^A-Za-z\s]', '', sentence)
        sentence = re.sub('[NUMBER]+', 'NUMBER', sentence)
        word_lst = sentence.split()

        if df.label[i] == 1:
            for ele in word_lst:
                if ele in real_review_dict:
                    real_review_dict[ele] += 1
                else:
                    real_review_dict[ele] = 1

        else:
            for ele in word_lst:
                if ele in fake_review_dict:
                    fake_review_dict[ele] += 1
                else:
                    fake_review_dict[ele] = 1

    return real_review_dict, fake_review_dict

In [26]:
real_review_dict, fake_review_dict = make_word_dict(df_train)

100%|██████████| 20000/20000 [00:02<00:00, 9514.08it/s]


In [27]:
def make_del_word_lst(fake_review_dict, real_review_dict, threshold=0.8):
    del_word_lst = []
    sum_real_review = sum(real_review_dict.values())
    sum_fake_review = sum(fake_review_dict.values())

    for ele in fake_review_dict:
        if ele in real_review_dict:
            real_cnt = real_review_dict[ele] / sum_real_review
            fake_cnt = fake_review_dict[ele] / sum_fake_review

            if real_cnt < fake_cnt:
                real_cnt, fake_cnt = (fake_cnt, real_cnt)
      
            if (fake_cnt / real_cnt) >= threshold:
                del_word_lst.append(ele)

    return del_word_lst

In [28]:
del_word_lst = make_del_word_lst(fake_review_dict, real_review_dict, threshold=0.9)

In [29]:
len(del_word_lst)

1724

In [30]:
def make_review_lst(df, del_word_lst):
  new_review_lst = []
  for sentence in tqdm(df.review):
    sentence = sentence.lower()
    
    sentence = re.sub('[\d]+', '<NUM>', sentence)
    sentence = re.sub('[^A-Za-z\s]', '', sentence)
    # sentence = re.sub('[<NUM>]+', '<NUM>', sentence) # f you don't want to delete the number
    sentence = re.sub('[<NUM>]+', '', sentence) # f you want to delete the number
    new_review_lst.append(''.join(map(lambda x: x + ' ' if x not in del_word_lst else '', sentence.split())))
    
  return new_review_lst

In [31]:
train_review_lst = make_review_lst(df_train, del_word_lst)
val_review_lst = make_review_lst(df_val, del_word_lst)
test_review_lst = make_review_lst(df_test, del_word_lst)

100%|██████████| 20000/20000 [00:27<00:00, 731.31it/s]
100%|██████████| 2000/2000 [00:02<00:00, 748.94it/s]
100%|██████████| 4000/4000 [00:05<00:00, 755.78it/s]


In [32]:
# 필요 없다고 생각되는 col들 삭제
df_train = df_train.drop(['user_id', 'prod_id', 'date', 'review'], axis=1)
df_val = df_val.drop(['user_id', 'prod_id', 'date', 'review'], axis=1)
df_test = df_test.drop(['user_id', 'prod_id', 'date', 'review'], axis=1)

In [33]:
df_train['review'] = train_review_lst
df_val['review'] = val_review_lst
df_test['review'] = test_review_lst

In [34]:
df_train.head()

Unnamed: 0,rating,label,user_id_count,prod_id_count,review
0,5.0,0,1,998,moved greenpoint year ago lil cafe situation t...
1,2.0,1,1,677,love this place ask come every time are nyc ca...
2,3.0,1,26,7378,ramen tastes ones japan which is soup is basic...
3,3.0,1,21,369,this place liked bartender liked of food its j...
4,4.0,0,1,1217,this place has great chicken waffles great say...


In [35]:
# df.comment_text[0]
df_train.review[0]

'moved greenpoint year ago lil cafe situation this place has opened kick ass work admit have been home oysterscheese addictive sidecars staff is hot its addition hood '

In [None]:
# target_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# df.iloc[[103]][target_columns]

target_columns = ["label"]
df.iloc[[103]][target_columns]

In [None]:
df_train.shape

In [None]:
df_val.shape

In [None]:
df_test.shape

In [None]:
model_class = transformers.BertModel
tokenizer_class = transformers.BertTokenizer
pretrained_weights='bert-base-uncased'

In [None]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
bert_model = model_class.from_pretrained(pretrained_weights)

In [None]:
max_seq = 30

In [None]:
def tokenize_text(df, max_seq):
    return [
        # tokenizer.encode(text, add_special_tokens=True)[:max_seq] for text in df.comment_text.values
        tokenizer.encode(text, add_special_tokens=True)[:max_seq] for text in df.review.values
    ]


def pad_text(tokenized_text, max_seq):
    return np.array([el + [0] * (max_seq - len(el)) for el in tokenized_text])


def tokenize_and_pad_text(df, max_seq):
    tokenized_text = tokenize_text(df, max_seq)
    padded_text = pad_text(tokenized_text, max_seq)
    return torch.tensor(padded_text)


def targets_to_tensor(df, target_columns):
    return torch.tensor(df[target_columns].values, dtype=torch.float32)

In [None]:
lis = tokenize_text(df_train, max_seq)

In [None]:
print(lis)

In [None]:
train_indices = tokenize_and_pad_text(df_train, max_seq)
val_indices = tokenize_and_pad_text(df_val, max_seq)
test_indices = tokenize_and_pad_text(df_test, max_seq)

In [None]:
print(train_indices)

In [None]:
with torch.no_grad():
    x_train = bert_model(train_indices)[0]  # Models outputs are tuples
    x_val = bert_model(val_indices)[0]
    x_test = bert_model(test_indices)[0]

In [None]:
y_train = targets_to_tensor(df_train, target_columns)
y_val = targets_to_tensor(df_val, target_columns)
y_test = targets_to_tensor(df_test, target_columns)

In [None]:
x_train[0]

In [None]:
x_train[0].shape

In [None]:
y_train[0]

In [None]:
class KimCNN(nn.Module):
    def __init__(self, embed_num, embed_dim, class_num, kernel_num, kernel_sizes, dropout, static):
        super(KimCNN, self).__init__()

        V = embed_num
        D = embed_dim
        C = class_num
        Co = kernel_num
        Ks = kernel_sizes
        
        self.static = static
        self.embed = nn.Embedding(V, D)
        self.convs1 = nn.ModuleList([nn.Conv2d(1, Co, (K, D)) for K in Ks])
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(Ks) * Co, C)
        self.sigmoid = nn.Sigmoid()
        

    def forward(self, x):
        if self.static:
            x = Variable(x)

        x = x.unsqueeze(1)  # (N, Ci, W, D)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)

        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)

        x = torch.cat(x, 1)
        x = self.dropout(x)  # (N, len(Ks)*Co)
        logit = self.fc1(x)  # (N, C)
        output = self.sigmoid(logit)
        return output

In [None]:
embed_num = x_train.shape[1]
embed_dim = x_train.shape[2]
class_num = y_train.shape[1]
kernel_num = 3
kernel_sizes = [2, 3, 4]
dropout = 0.5
static = True

In [None]:
model = KimCNN(
    embed_num=embed_num,
    embed_dim=embed_dim,
    class_num=class_num,
    kernel_num=kernel_num,
    kernel_sizes=kernel_sizes,
    dropout=dropout,
    static=static,
)

In [None]:
n_epochs = 10
batch_size = 10
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.BCELoss()

In [None]:
def generate_batch_data(x, y, batch_size):
    i, batch = 0, 0
    for batch, i in enumerate(range(0, len(x) - batch_size, batch_size), 1):
        x_batch = x[i : i + batch_size]
        y_batch = y[i : i + batch_size]
        yield x_batch, y_batch, batch
    if i + batch_size < len(x):
        yield x[i + batch_size :], y[i + batch_size :], batch + 1
    if batch == 0:
        yield x, y, 1

In [None]:
train_losses, val_losses = [], []

for epoch in range(n_epochs):
    start_time = time.time()
    train_loss = 0

    model.train(True)
    for x_batch, y_batch, batch in generate_batch_data(x_train, y_train, batch_size):
        y_pred = model(x_batch)
        optimizer.zero_grad()
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= batch
    train_losses.append(train_loss)
    elapsed = time.time() - start_time

    model.eval() # disable dropout for deterministic output
    with torch.no_grad(): # deactivate autograd engine to reduce memory usage and speed up computations
        val_loss, batch = 0, 1
        for x_batch, y_batch, batch in generate_batch_data(x_val, y_val, batch_size):
            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)
            val_loss += loss.item()
        val_loss /= batch
        val_losses.append(val_loss)

    print(
        "Epoch %d Train loss: %.2f. Validation loss: %.2f. Elapsed time: %.2fs."
        % (epoch + 1, train_losses[-1], val_losses[-1], elapsed)
    )

In [None]:
plt.plot(train_losses, label="Training loss")
plt.plot(val_losses, label="Validation loss")
plt.legend()
plt.title("Losses")

In [None]:
model.eval() # disable dropout for deterministic output
with torch.no_grad(): # deactivate autograd engine to reduce memory usage and speed up computations
    y_preds = []
    batch = 0
    for x_batch, y_batch, batch in generate_batch_data(x_test, y_test, batch_size):
        y_pred = model(x_batch)
        y_preds.extend(y_pred.cpu().numpy().tolist())
    y_preds_np = np.array(y_preds)

In [None]:
y_preds_np

In [None]:
y_test_np = df_test[target_columns].values

In [None]:
y_test_np[1000:]

In [None]:
auc_scores = roc_auc_score(y_test_np, y_preds_np, average=None)
df_accuracy = pd.DataFrame({"label": target_columns, "auc": auc_scores})
df_accuracy.sort_values('auc')[::-1]

In [None]:
positive_labels = df_train[target_columns].sum().sum()
positive_labels

In [None]:
all_labels = df_train[target_columns].count().sum()
all_labels

In [None]:
positive_labels/all_labels

In [None]:
df_test_targets = df_test[target_columns]
df_pred_targets = pd.DataFrame(y_preds_np.round(), columns=target_columns, dtype=int)
df_sanity = df_test_targets.join(df_pred_targets, how='inner', rsuffix='_pred')

In [None]:
df_sanity

In [None]:
df_test_targets.sum()

In [None]:
df_pred_targets.sum()

In [None]:
df_sanity[df_sanity.label > 0][['label', 'label_pred']]