In [None]:
from typing import Union, Tuple, List
from transformers import BertConfig, BertForPreTraining, BertTokenizerFast
import numpy as np
import random
import pandas as pd
from datetime import datetime, date
from tqdm.notebook import tqdm
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score
from tqdm import tqdm
# from IPython.display import Image
from joblib import Parallel, delayed
import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch
from transformers import BertTokenizer, BertModel
import re
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)
from torch.utils.data.dataset import Dataset
import matplotlib.pyplot as plt
from transformers import DataCollatorForLanguageModeling
import os
from transformers import Trainer, TrainingArguments
from transformers.utils import logging
logger = logging.get_logger(__name__)
from filelock import FileLock
import time
import pickle

In [None]:
# Load the raw reviews, keeping only the four columns used by this notebook.
review_df = pd.read_csv(
    '/opt/ml/wine/data/review_df_total.csv',
    encoding='utf-8-sig',
)[['user_url', 'rating', 'text', 'wine_url']]

In [None]:
import json

# item2idx is later used to map review_df['wine_url'] values to item indices.
with open('/opt/ml/wine/code/data/feature_map/item2idx.json', 'r') as item_map_file:
    item2idx = json.loads(item_map_file.read())

In [None]:
# Discard reviews that have no text at all.
review_df = review_df[review_df['text'].notna()]

In [None]:
# Make sure every review ends with a period.
# str.endswith() replaces x[-1] so an empty string becomes '.' instead of raising IndexError,
# and assign() builds a new frame rather than writing into the filtered slice produced above.
review_df = review_df.assign(
    text=review_df['text'].apply(lambda x: x if x.endswith('.') else x + '.')
)

In [None]:
def keep_english_and_digits(text):
    """Strip every character that is not an ASCII letter, digit, whitespace, period or comma.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; periods and commas are kept wherever they occur.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s.,]')
    return disallowed.sub('', text)

In [None]:
def merge_text(data):
    """Join an iterable of strings into one string, one item per line."""
    line_break = '\n'
    return line_break.join(data)


In [None]:
# Run the character filter over every review text.
review_df['text'] = review_df['text'].map(keep_english_and_digits)

In [None]:
# Attach the item index of each review's wine and drop reviews whose wine_url is
# missing from item2idx (map() yields NaN for those).
# Written as one chain that returns a new frame, so no column is assigned onto a
# filtered slice (which is what raises pandas' chained-assignment warning).
review_df = (
    review_df
    .assign(wine_id=review_df['wine_url'].map(item2idx))
    .dropna(subset=['wine_id'])
    .astype({'wine_id': 'int'})
    .astype({'wine_id': 'category'})
)

In [None]:
# Load the wine table; its '*_child' columns are selected into note_df in a later cell.
wine_df = pd.read_csv('/opt/ml/wine/data/wine_df.csv')

In [None]:
# Keep only the columns whose name contains '_child'.
note_df = wine_df.filter(like='_child')

In [None]:
# Maps a note category (column name without the '_child' suffix) to its set of keywords.
notes = {}
import ast
def str2dict(x):
    """Turn a stringified dict literal into a dict.

    Args:
        x: A cell value; expected to be the string form of a dict literal.

    Returns:
        The parsed dict. A value that is already a dict is returned unchanged, so
        re-running the parsing cell does not wipe previously parsed columns.
        Anything that cannot be parsed, or that parses to a non-dict, yields {}.
    """
    if isinstance(x, dict):
        return x
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the errors literal_eval documents for malformed input
        # (this also covers NaN and other non-string cells).
        return {}
    # A valid literal that is not a dict (e.g. a list) has no .keys() for get_keys().
    return parsed if isinstance(parsed, dict) else {}
def get_keys(x):
    """Return the keys of mapping ``x`` as a set."""
    return {key for key in x.keys()}

# Work on an independent copy: note_df comes from wine_df.filter(...), and writing
# parsed columns into that result can trigger pandas' chained-assignment warning.
note_df = note_df.copy()

for col in note_df.columns:
    # Parse the stringified dicts of this column.
    note_df[col] = note_df[col].apply(str2dict)
    sub_note = set()
    # Iterate the values directly instead of note_df[col][i]: this avoids a chained
    # label lookup per row and does not depend on the index being 0..n-1.
    for parsed in tqdm(note_df[col]):
        sub_note |= get_keys(parsed)
    # Category name = column name without the '_child' suffix.
    notes[col.replace('_child','')] = sub_note

In [None]:
# Everything except the rating and wine id columns.
text_with_note = review_df.drop(columns=['rating', 'wine_id'])

In [64]:
def check_note_in_review(text, notes_data):
    """Flag which note categories are mentioned in a review.

    Args:
        text: The review text; it is lower-cased before matching.
        notes_data: Mapping of note category -> iterable of keywords.

    Returns:
        A list with one 0/1 entry per category, in the iteration order of
        ``notes_data``. An entry is 1 when any keyword of that category occurs
        in the lower-cased text as a substring (not a whole-word match).
    """
    text = text.lower()
    # Use the notes_data argument rather than the notebook-level `notes` global, so the
    # function works with whatever mapping the caller (or a worker process) supplies.
    return [
        1 if any(word in text for word in keywords) else 0
        for keywords in notes_data.values()
    ]

def marking_data(df, notes_data):
    """Append one 0/1 column per note category to a frame of reviews.

    Args:
        df: DataFrame with a 'text' column.
        notes_data: Mapping of note category -> keywords, passed through to
            check_note_in_review.

    Returns:
        A new DataFrame: ``df`` with its index moved into a column by
        reset_index(), followed by one indicator column per key of ``notes_data``.
    """
    # Non in-place reset: the caller's frame is left untouched, so calling this twice on
    # the same frame no longer fails when reset_index tries to insert the column again.
    indexed = df.reset_index()
    flags = [
        check_note_in_review(review_text, notes_data)
        for review_text in tqdm(indexed['text'])
    ]
    note_df = pd.DataFrame(flags, columns=list(notes_data.keys()))
    # Both frames carry a fresh 0..n-1 index, so the column-wise concat aligns row by row.
    return pd.concat([indexed, note_df], axis=1)


def parallel_dataframe_2input(func, df, notes_data, num_cpu):
    """Apply ``func(chunk, notes_data)`` to row-chunks of ``df`` in parallel.

    Args:
        func: Callable taking (DataFrame chunk, notes_data) and returning a DataFrame.
        df: The frame to process.
        notes_data: Second argument forwarded unchanged to every ``func`` call.
        num_cpu: Number of chunks and of worker processes.

    Returns:
        The per-chunk results stacked row-wise, in chunk order, with a fresh
        0..n-1 index.
    """
    # Split row positions rather than the frame itself: np.array_split on a DataFrame
    # goes through the deprecated DataFrame.swapaxes.
    position_groups = np.array_split(np.arange(len(df)), num_cpu)
    chunks = [df.iloc[positions] for positions in position_groups]

    print('Parallelizing with ' +str(num_cpu)+'cores')
    with Parallel(n_jobs = num_cpu, backend="multiprocessing") as parallel:
        results = parallel(delayed(func)(chunk, notes_data) for chunk in chunks)

    # The original accumulated with `output += pd.concat([output, data])`, which adds the
    # frames element-wise instead of appending rows; one concat gives the intended stack.
    return pd.concat(results, axis=0, ignore_index=True)

In [65]:
# Single-column frame holding only the review text.
text = text_with_note.loc[:, 'text'].to_frame()

In [66]:
# Mark every review with the note categories it mentions, using 8 worker processes.
note_marked_text = parallel_dataframe_2input(
    func=marking_data,
    df=text,
    notes_data=notes,
    num_cpu=8,
)

Parallelizing with 8cores


 35%|███▍      | 320766/928505 [01:22<03:35, 2821.02it/s]

In [None]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one text per row and pairs it with a target vector.

    Args:
        dataframe: Frame with a ``comment_text`` column (the text) and a ``list``
            column (the per-row target values).
        tokenizer: Object exposing ``encode_plus``; its result must contain
            'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the encoded text and float target tensor of the row at position ``index``."""
        # Positional lookup: `index` is a position in 0..len-1, so the dataset also works
        # when the frame's index has not been reset.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse any run of whitespace into a single space.
        comment_text = " ".join(comment_text.split())

        # padding/truncation replace the deprecated pad_to_max_length flag; explicit
        # truncation makes every item exactly max_len long so items can be batched.
        inputs = self.tokenizer.encode_plus(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Creating the dataset and dataloader for the neural network

# 80/20 split of new_df with a fixed seed; the test part is whatever was not sampled.
# NOTE(review): new_df, tokenizer and MAX_LEN are not defined in the cells shown here.
train_size = 0.8
train_dataset = new_df.sample(frac=train_size, random_state=200)
# Drop by the sampled index labels *before* the training index is reset.
test_dataset = new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print(f"FULL Dataset: {new_df.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")

training_set = CustomDataset(dataframe=train_dataset, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = CustomDataset(dataframe=test_dataset, tokenizer=tokenizer, max_len=MAX_LEN)

In [None]:
# DataLoader settings for each split; both loaders shuffle and load in the main process.
# NOTE(review): TRAIN_BATCH_SIZE and VALID_BATCH_SIZE are not defined in the cells shown here.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Creating the customized model by adding a dropout and a dense layer on top of BERT (bert-base-uncased) to produce the final output of the model.

class BERTClass(torch.nn.Module):
    """bert-base-uncased encoder followed by dropout and a linear output layer.

    Args:
        num_labels: Size of the output vector produced for each input sequence.
    """

    def __init__(self, num_labels):
        super().__init__()
        # The notebook imports BertModel by name; the bare `transformers` module is never
        # imported, so `transformers.BertModel` would raise NameError.
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        # Take the width from the loaded config instead of hard-coding 768.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the linear layer's output for a batch of encoded inputs.

        Args:
            ids: Token ids.
            mask: Attention mask.
            token_type_ids: Segment ids.
        """
        encoder_out = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Index 1 is the pooled output for both tuple and ModelOutput returns. Tuple-unpacking
        # (`_, pooled = ...`) a ModelOutput would yield its string keys instead of tensors.
        pooled = encoder_out[1]
        return self.l3(self.l2(pooled))

# BERTClass.__init__ requires num_labels; calling it with no argument raises TypeError.
# NOTE(review): assumes one output per note category (one per key of `notes`) — confirm
# that this matches the length of the target vectors in new_df.
# NOTE(review): `device` is not defined in the cells shown here.
model = BERTClass(num_labels=len(notes))
model.to(device)