In [7]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import log_softmax
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig
from ast import literal_eval
import re

In [3]:
from torch import cuda

# Run on the GPU when PyTorch reports one as available; otherwise use the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Colab-only: mount Google Drive at /content/drive, the root of the CSV paths read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# Read the train/test splits, keeping only the label column and the review text.
review_df = pd.read_csv(
    '/content/drive/MyDrive/advanced_ml_proj/data/split/train.csv',
    usecols=['Overall Compliance', 'reviews'],
)
test_df = pd.read_csv(
    '/content/drive/MyDrive/advanced_ml_proj/data/split/test.csv',
    usecols=['Overall Compliance', 'reviews'],
)

# Each 'reviews' cell is read as a string; literal_eval parses it back into a Python object.
review_df['reviews'] = review_df['reviews'].map(literal_eval)
test_df['reviews'] = test_df['reviews'].map(literal_eval)

# Alternative experiment: classify at review level, then aggregate to restaurant level.
# review_df = review_df.explode('reviews')
# review_df = review_df.reset_index().drop(columns=['index'])

# test_df = test_df.explode('reviews')
# test_df = test_df.reset_index().drop(columns=['index'])
review_df

Unnamed: 0,Overall Compliance,reviews
0,Yes,[water ice and pretzels were a staple evening ...
1,Yes,[We came here as I have heard that the food is...
2,No,[(3.5) ~ good overall food service.\n\nMENU:\n...
3,Yes,[Thank you cedar hollow for donating to the Ep...
4,No,[Don't go here. Period. There's not going to b...
...,...,...
1749,Yes,"['Round back there is a bar so small, so delic..."
1750,No,[Let me tell you!!!!! This place is amazing!\...
1751,Yes,[If you look up dive bar in the dictionary thi...
1752,Yes,"[Very good food, terrible service. Waited for..."


In [None]:
# model info: checkpoint name handed to `from_pretrained` by the Hugging Face cells further down
model_checkpoint = "distilbert/distilbert-base-uncased"

In [1]:
# Training hyperparameters shared by the custom PyTorch loop and the Hugging Face Trainer.
MAX_TOKENS = 512  # max_length passed to the tokenizers (longer inputs are truncated)
TRAIN_BATCH_SIZE = 16  # batch size of the training loader / per-device train batch
VALID_BATCH_SIZE = 8  # batch size of the test loader / per-device eval batch
EPOCHS = 1  # epochs for the custom training loop only; the Trainer cell sets its own count
LEARNING_RATE = 1e-05  # learning rate for both Adam and the Trainer

In [21]:
class ReviewData(Dataset):
    """Torch dataset that turns review text into fixed-length tokenizer inputs.

    Args:
        df: Frame with an 'Overall Compliance' column ('No' means fail, anything
            else is treated as pass) and a 'reviews' column.
        tokenizer: Callable Hugging Face tokenizer; its output must expose
            'input_ids' and 'attention_mask'.
        max_tokens: Sequence length every example is padded/truncated to.
        expanded: False (default) when each 'reviews' cell is a list of review
            strings that get joined into one text per row; True when each cell
            is already a single review string (e.g. after ``df.explode``).

    Items are dicts with 'ids' and 'mask' (long tensors of length ``max_tokens``)
    and 'targets' (float tensor: [1, 0] = fail, [0, 1] = pass).
    """

    # Anything that is not an ASCII letter or digit (newlines included) becomes a space.
    _NON_ALNUM = re.compile(r"[^a-zA-Z0-9]")

    # The tokenizer annotation is quoted so that defining the class does not
    # require the transformers name to be importable; any tokenizer with the
    # standard Hugging Face call interface works.
    def __init__(self, df: pd.DataFrame, tokenizer: "DistilBertTokenizer", max_tokens: int, expanded: bool = False):
        self.tokenizer = tokenizer
        self.df = df
        self.max_tokens = max_tokens
        self.expanded = expanded
        self.review_text = self.clean_text(self.df)
        self.target_cat = self.df['Overall Compliance']

    def clean_text(self, df: pd.DataFrame) -> pd.Series:
        """Return the cleaned 'reviews' column without modifying ``df``.

        Every non-alphanumeric character is replaced by a space and the result
        is stripped. In expanded mode the Series holds strings; otherwise it
        holds lists of cleaned strings.
        """
        if self.expanded:
            # Build a new Series instead of assigning back into the caller's frame.
            # Replace first, then strip, so both modes trim the spaces left behind
            # by trailing punctuation.
            return (
                df['reviews']
                .str.replace(self._NON_ALNUM.pattern, ' ', regex=True)
                .str.strip()
            )

        def clean_reviews(reviews):
            # may need to find a better way to do so
            return [self._NON_ALNUM.sub(' ', review).strip() for review in reviews]

        return df['reviews'].apply(clean_reviews)

    def __len__(self):
        return len(self.review_text)

    def __getitem__(self, index):
        """Tokenize the row at *position* ``index`` and build its one-hot target."""
        # Positional lookup: DataLoader samplers yield 0..len-1, which need not
        # match the frame's index labels (e.g. after explode or filtering).
        raw_text = self.review_text.iloc[index]
        target_cat = self.target_cat.iloc[index]

        if self.expanded:
            review_text = str(raw_text)
        else:
            # combine all reviews into one string
            review_text = " ".join(raw_text)

        # Calling the tokenizer directly replaces the deprecated encode_plus().
        inputs = self.tokenizer(
            review_text,
            add_special_tokens=True,
            max_length=self.max_tokens,
            padding='max_length',
            truncation=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        # [0, 1] = pass, [1, 0] = fail
        target = [1, 0] if target_cat == 'No' else [0, 1]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.float)
        }

In [22]:
# Tokenizer loaded from the same checkpoint name as the encoder inside the model.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    truncation=True,
    do_lower_case=True,
)

# expanded=False: every row's list of reviews is joined into a single text.
train_data, test_data = (
    ReviewData(split_df, tokenizer, max_tokens=MAX_TOKENS, expanded=False)
    for split_df in (review_df, test_df)
)

In [24]:
# Build the data loaders.

# Training batches are reshuffled on every pass over the data.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
}

# Accuracy does not depend on batch order, so the evaluation split is read in
# its stored order; this keeps per-example predictions reproducible.
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
}

training_loader = DataLoader(train_data, **train_params)
testing_loader = DataLoader(test_data, **test_params)

In [25]:
# BERT-based model (only text)

class BERTAndErnie(torch.nn.Module):
    """Pretrained DistilBERT encoder followed by one linear layer emitting two logits."""

    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")

        ## TODO: add other linear classifer layer
        # Maps a 768-wide encoder vector to the two class logits.
        self.l2 = torch.nn.Linear(768, 2)

    ## TODO: args for cat features
    def forward(self, ids, mask):
        """Return raw (un-normalised) logits for a batch of token ids and attention masks."""
        encoder_outputs = self.l1(ids, attention_mask=mask, return_dict=False)
        # With return_dict=False the encoder returns a tuple; take element 0 and
        # keep only position 0 along dim 1 (the first token of each sequence).
        first_token_state = encoder_outputs[0][:, 0]
        return self.l2(first_token_state)

In [26]:
# check with Claire to standardize
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits ``outputs`` and ``targets``."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [27]:
# Build the classifier, move it to the selected device, and create its optimizer.
# Module.to() returns the module itself, so construction and placement can be chained.
model = BERTAndErnie().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [28]:
def validate():
    """
    Evaluate the model during training.

    Runs the global ``model`` in eval mode, with gradients disabled, over every
    batch of ``testing_loader`` and returns ``(predictions, labels)`` as two
    flat lists of class indices.
    """
    predicted, actual = [], []

    model.eval()
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            logits = model(ids, mask)

            # Index of the largest entry along dim 1, for the logits and the
            # one-hot targets alike.
            pred_idx = torch.max(logits, 1)[1]
            label_idx = torch.max(targets, 1)[1]

            actual.extend(label_idx.cpu().tolist())
            predicted.extend(pred_idx.cpu().tolist())

    return predicted, actual

In [29]:
# Train the model, measuring accuracy on the test loader after every epoch.
saved_outputs = []
saved_accuracy = []

for epoch in range(EPOCHS):

    # validate() switches the model to eval mode, so re-enable training mode each epoch.
    model.train()
    for idx, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        # Clear the previous step's gradients once, before the new forward/backward pass.
        optimizer.zero_grad()

        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)

        if idx % 100 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

    # From here on `targets` holds the validation labels (a list of class
    # indices), not the last training batch's tensor.
    preds, targets = validate()

    # play with a softmax activation function in the classifier
    accuracy = metrics.accuracy_score(targets, preds)
    saved_accuracy.append(accuracy)
    print(f"Accuracy Score = {accuracy}")

Epoch: 0, Loss:  0.7377334833145142
Epoch: 0, Loss:  0.3870224952697754
Accuracy Score = 0.8205128205128205


In [13]:
# with hugging face wrapper for sanity check
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from ast import literal_eval
import re

In [14]:
# Checkpoint name used by AutoTokenizer / AutoModelForSequenceClassification below.
model_checkpoint = "distilbert/distilbert-base-uncased"

In [15]:
# Read the same two columns from the repo-relative copies of the splits.
review_df = pd.read_csv(
    '../data/split/train.csv',
    usecols=['Overall Compliance', 'reviews'],
)
test_df = pd.read_csv(
    '../data/split/test.csv',
    usecols=['Overall Compliance', 'reviews'],
)

# Each 'reviews' cell is read as a string; literal_eval parses it back into a Python object.
review_df['reviews'] = review_df['reviews'].map(literal_eval)
test_df['reviews'] = test_df['reviews'].map(literal_eval)

# Wrap both frames as Hugging Face datasets.
# Note: `test_df` is rebound from a DataFrame to a Dataset here.
train_ds = Dataset.from_pandas(review_df)
test_df = Dataset.from_pandas(test_df)

review_dataset = DatasetDict({'train': train_ds, 'test': test_df})

In [11]:
def clean_reviews(reviews):
    """Return each review with non-alphanumeric characters replaced by spaces and trimmed.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A list of cleaned strings, one per input review (empty list for empty input).
    """
    # The character class already matches '\n', so no separate newline pass is needed.
    # may need to find a better way to do so
    return [re.sub(r"[^a-zA-Z0-9]", ' ', review).strip() for review in reviews]


def extract_text_features(data):
    """Build the model inputs for one example.

    Args:
        data: Mapping with 'reviews' (list of review strings) and
            'Overall Compliance' ('Yes' or anything else).

    Returns:
        Dict with 'text' (all cleaned reviews joined by single spaces) and
        'label' (0 when 'Overall Compliance' is 'Yes', otherwise 1).
    """
    reviews = clean_reviews(data['reviews'])
    return {
        'text': " ".join(reviews),
        # 0 = "Pass", 1 = "Fail". The previous expression returned 0 on both
        # branches, which labelled every example as class 0.
        'label': 0 if data['Overall Compliance'] == 'Yes' else 1,
    }

In [16]:
# Apply extract_text_features to every example of both splits, adding 'text' and 'label'.
review_dataset = review_dataset.map(extract_text_features)

Map: 100%|██████████| 1754/1754 [00:00<00:00, 7276.02 examples/s]
Map: 100%|██████████| 195/195 [00:00<00:00, 6425.45 examples/s]


In [29]:
# Sanity check: show which fields the first training example carries after the map.
review_dataset['train'][0].keys()

dict_keys(['Overall Compliance', 'reviews', 'text', 'label'])

In [23]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize(examples):
    """Tokenize the 'text' field, truncating to at most MAX_TOKENS tokens.

    No padding is requested here.
    """
    text = examples["text"]
    return tokenizer(text, truncation=True, max_length=MAX_TOKENS)

In [24]:
# Tokenize both splits and drop the raw label/review columns; 'text' and 'label' are kept.
token_dataset = review_dataset.map(tokenize, remove_columns=['Overall Compliance', 'reviews'])

Map: 100%|██████████| 1754/1754 [00:03<00:00, 570.24 examples/s]
Map: 100%|██████████| 195/195 [00:00<00:00, 541.34 examples/s]


In [25]:
# Collator that pads examples when batches are formed (tokenize() itself applies no padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [26]:
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn an ``(outputs, labels)`` pair into the accuracy metric.

    The predicted class for each row is the argmax of the outputs along axis 1.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [27]:
# Class index <-> readable name; label2id is derived as the inverse of id2label.
id2label = {0: "Pass", 1: "Fail"}
label2id = {name: idx for idx, name in id2label.items()}

In [28]:
# Sequence-classification model with two labels. The warning printed by this cell
# reports that the classifier / pre_classifier weights are newly initialized.
# Note: this rebinds `model`, replacing the custom BERTAndErnie instance.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

# Same learning rate and batch sizes as the custom loop. The epoch count is set to 4
# here rather than taken from EPOCHS.
# Evaluation and checkpointing both happen once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="models",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): newer transformers releases prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=token_dataset["train"],
    eval_dataset=token_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
