In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stsgold-dataset/sts_gold_tweet.csv
/kaggle/input/amazon-fine-food-reviews/hashes.txt
/kaggle/input/amazon-fine-food-reviews/Reviews.csv
/kaggle/input/amazon-fine-food-reviews/database.sqlite


In [2]:
!pip install transformers datasets scikit-learn torch



In [4]:
import torch
from transformers import BertModel, BertPreTrainedModel
from torch import nn

class BERTCNN(BertPreTrainedModel):
    """BERT encoder followed by a 1-D convolution, global max pooling and a linear classifier.

    The encoder is always loaded from the ``bert-base-uncased`` checkpoint; ``config``
    supplies ``hidden_size``, ``num_labels`` and ``initializer_range``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel.from_pretrained("bert-base-uncased", config=config)
        self.conv = nn.Conv1d(in_channels=config.hidden_size, out_channels=128, kernel_size=5, padding=2)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.5)
        self.classifier = nn.Linear(128, config.num_labels)
        # Initialise only the new classifier. A blanket ``self.init_weights()`` walks every
        # sub-module and, depending on the installed transformers version, may re-initialise
        # the encoder weights that ``from_pretrained`` just loaded. The convolution keeps
        # PyTorch's default initialisation.
        nn.init.normal_(self.classifier.weight, mean=0.0, std=config.initializer_range)
        nn.init.zeros_(self.classifier.bias)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        """Classify a batch of token sequences.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits`` alone.
            The loss is the cross-entropy between ``logits`` and ``labels``.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds)
        sequence_output = outputs[0]
        # Conv1d convolves over the last axis, so move the hidden dimension to the channel axis.
        sequence_output = sequence_output.permute(0, 2, 1)
        x = self.conv(sequence_output)
        # Global max over the sequence axis leaves one 128-d feature vector per example.
        x = self.pool(x).squeeze(-1)
        x = self.dropout(x)
        logits = self.classifier(x)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        return (loss, logits) if loss is not None else logits

In [142]:
import torch
import torch.nn as nn
from transformers import RobertaModel

class RoBERTa(nn.Module):
    """Thin ``nn.Module`` wrapper around a pretrained ``RobertaModel``.

    ``forward`` hands back whatever the wrapped encoder returns, unchanged.
    NOTE(review): ``forward`` takes no ``labels`` and computes no loss or class logits;
    presumably a classification head is needed before this can be used with ``Trainer`` — confirm.
    """

    def __init__(self, model_type):
        super().__init__()
        # ``model_type`` is passed straight to ``RobertaModel.from_pretrained``.
        encoder = RobertaModel.from_pretrained(model_type)
        self.roberta = encoder

    def forward(self, input_ids, attention_mask=None):
        """Run the wrapped encoder on ``input_ids`` with the optional ``attention_mask``."""
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        return self.roberta(**encoder_inputs)

In [8]:
!pip install transformers[torch] accelerate -U

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.30.1
    Uninstalling accelerate-0.30.1:
      Successfully uninstalled accelerate-0.30.1
Successfully installed accelerate-0.31.0


In [174]:
import pandas as pd
from transformers import AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, BertConfig
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch

def map_score_to_sentiment(score):
    """Collapse a review score into a sentiment class id.

    Scores below 3 map to 0 (negative), exactly 3 maps to 1 (neutral),
    and everything else maps to 2 (positive).
    """
    if score < 3:
        return 0  # Negative
    return 1 if score == 3 else 2  # Neutral / Positive
            
def load_data(path='/kaggle/input/amazon-fine-food-reviews/Reviews.csv', nrows=100000,
              test_size=0.4, random_state=42):
    """Read the reviews CSV, derive sentiment labels and split into train/test frames.

    Args:
        path: CSV file to read; it must provide ``Score`` and ``Text`` columns.
        nrows: Maximum number of rows to read from the file.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(train_df, test_df)``, each with a ``text`` and a ``label`` column.
    """
    # ``nrows`` already caps the frame, so no additional ``head()`` is required.
    df = pd.read_csv(path, on_bad_lines='warn', nrows=nrows)
    print(df.shape[0], "Data")

    df['Sentiment'] = df['Score'].apply(map_score_to_sentiment)

    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df['Text'], df['Sentiment'], test_size=test_size, random_state=random_state)

    train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
    test_df = pd.DataFrame({'text': test_texts, 'label': test_labels})

    return train_df, test_df

def tokenize_data(model_type, train_df, test_df):
    """Tokenize the train and test frames with the tokenizer for ``model_type``.

    The ``text`` column of each frame is truncated and padded to 128 tokens.

    Returns:
        ``(train_dataset, test_dataset, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_type)

    def _encode(batch):
        return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=128)

    # Same pipeline for both splits: pandas frame -> Dataset -> batched tokenization.
    train_dataset, test_dataset = (
        Dataset.from_pandas(frame).map(_encode, batched=True)
        for frame in (train_df, test_df)
    )

    return train_dataset, test_dataset, tokenizer

def train_model(model, train_dataset, test_dataset, tokenizer, output_dir):
    """Fine-tune ``model`` on ``train_dataset`` with the Hugging Face ``Trainer``.

    Runs 3 epochs, evaluating on ``test_dataset`` and saving a checkpoint to
    ``output_dir`` every 500 steps (at most 2 checkpoints are kept). Nothing is returned.
    """
    # Optimisation schedule.
    schedule = dict(
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
    )
    # Logging, checkpointing and periodic evaluation.
    bookkeeping = dict(
        logging_dir='./logs',
        logging_steps=10,
        save_total_limit=2,
        save_steps=500,
        eval_strategy="steps",
        eval_steps=500,
    )
    training_args = TrainingArguments(
        output_dir=output_dir,
        report_to="none",
        **schedule,
        **bookkeeping,
    )

    Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
    ).train()

In [182]:
# Build the train/test frames once and report how many rows landed in each split.
train_df, test_df = load_data()
for split_df in (train_df, test_df):
    print(split_df.shape[0])

100000 Data
60000
40000


In [178]:
# Tokenize with the BERT vocabulary, then fine-tune the BERT-CNN model on the 3 sentiment classes.
bert_model_type = 'bert-base-uncased'
train_dataset, test_dataset, tokenizer = tokenize_data(bert_model_type, train_df, test_df)
bert_cnn_config = BertConfig.from_pretrained(bert_model_type, num_labels=3)
bert_cnn_model = BERTCNN(config=bert_cnn_config)
train_model(
    model=bert_cnn_model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    tokenizer=tokenizer,
    output_dir='./bert_cnn_results',
)



Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
500,0.4796,0.522757
1000,0.3636,0.362736
1500,0.4572,0.35229
2000,0.4551,0.310538
2500,0.2673,0.324507
3000,0.278,0.35392
3500,0.3864,0.314407
4000,0.1493,0.386761
4500,0.1913,0.346772
5000,0.2533,0.312312


In [2]:
from transformers import AutoModelForSequenceClassification

roberta_model_type = 'cardiffnlp/twitter-roberta-base-sentiment'
train_dataset, test_dataset, tokenizer = tokenize_data(roberta_model_type, train_df, test_df)
# Load the checkpoint with a sequence-classification head so the model accepts `labels`,
# returns a loss for training and produces per-class logits for prediction.
# The previous `roberta.get_model()` call referenced an undefined name and has been removed.
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta_model_type, num_labels=3)
if torch.cuda.is_available():
    roberta_model.cuda()
train_model(roberta_model, train_dataset, test_dataset, tokenizer, './roberta_results')

NameError: name 'tokenize_data' is not defined

In [179]:
import numpy as np

In [180]:
def test_model(model, test_dataset):
    """Run ``model`` over ``test_dataset`` and return the raw result plus predicted class ids.

    Returns:
        ``(result, prediction)`` where ``result`` is the object returned by
        ``Trainer.predict`` and ``prediction`` is the argmax of ``result.predictions``
        along axis 1.
    """
    # Mirror the settings used in train_model: no experiment reporting, and the same
    # evaluation batch size, instead of relying on the Trainer defaults.
    eval_args = TrainingArguments(
        output_dir='./eval_tmp',
        report_to="none",
        per_device_eval_batch_size=64,
    )
    trainer = Trainer(model=model, args=eval_args)
    result = trainer.predict(test_dataset)
    prediction = np.argmax(result.predictions, axis=1)
    return result, prediction

In [68]:
def tokenize_test_data(model_type, test_df):
    """Tokenize only the test frame with the tokenizer for ``model_type``.

    The ``text`` column is truncated and padded to 128 tokens.

    Returns:
        ``(test_dataset, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_type)

    def _encode(batch):
        return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=128)

    raw_dataset = Dataset.from_pandas(test_df)
    test_dataset = raw_dataset.map(_encode, batched=True)

    return test_dataset, tokenizer

In [184]:
# Re-tokenize the held-out frame with the BERT vocabulary and score it with the BERT-CNN model.
bert_model_type = 'bert-base-uncased'
test_dataset, tokenizer = tokenize_test_data(model_type=bert_model_type, test_df=test_df)
bert_cnn_result, bert_cnn_preds = test_model(model=bert_cnn_model, test_dataset=test_dataset)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [146]:
# Re-tokenize the held-out frame with the RoBERTa vocabulary and score it with the RoBERTa model.
roberta_model_type = 'cardiffnlp/twitter-roberta-base-sentiment'
test_dataset, tokenizer = tokenize_test_data(model_type=roberta_model_type, test_df=test_df)
roberta_result, roberta_preds = test_model(model=roberta_model, test_dataset=test_dataset)



Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [185]:
def compare(model_result, model_preds, model_type, labels=None):
    """Print per-class prediction counts, truth counts and false-positive counts.

    A sample counts as a false positive for class ``c`` when it is predicted as ``c``
    but its true label is a different class. The percentage printed next to each
    false-positive count is relative to the number of samples whose true label is ``c``;
    it is ``nan`` when that class has no true samples.

    Args:
        model_result: Per-sample model outputs; only used to bound the number of samples.
        model_preds: Predicted class ids (0 negative, 1 neutral, 2 positive).
        model_type: Title printed above the report.
        labels: True class ids in the same order as ``model_preds``. Defaults to the
            ``label`` column of the notebook-level ``test_df``.
    """
    if labels is None:
        labels = test_df['label']
    # Materialise once so lookups are positional, matching the order of the predictions.
    labels = list(labels)

    print(model_type)
    cases = ['negative', 'neutral', 'positive']

    predictions_map = {case: [] for case in cases}
    truth_map = {case: [] for case in cases}
    falsy_map = {case: [] for case in cases}

    for i, (_, preds) in enumerate(zip(model_result, model_preds)):
        score = labels[i]
        truth_map[cases[score]].append(i)
        predictions_map[cases[preds]].append(i)
        # Any mismatch is a false positive for the class that was predicted.
        if preds != score:
            falsy_map[cases[preds]].append(i)

    def _false_positive_pct(case):
        # Guard against classes that never occur in the ground truth.
        n_truth = len(truth_map[case])
        if n_truth == 0:
            return float('nan')
        return len(falsy_map[case]) / n_truth * 100

    pred_neg, pred_neu, pred_pos = (len(predictions_map[case]) for case in cases)
    true_neg, true_neu, true_pos = (len(truth_map[case]) for case in cases)
    fp_neg, fp_neu, fp_pos = (len(falsy_map[case]) for case in cases)
    pct_neg, pct_neu, pct_pos = (_false_positive_pct(case) for case in cases)

    print("Predictions")
    print(f'Negative:{pred_neg} | Neutral: {pred_neu} | Positive: {pred_pos}')
    print("============\n")
    print("Truth")
    print(f'Negative:{true_neg} | Neutral: {true_neu} | Positive: {true_pos}')
    print("============\n")
    print("False Positives")
    print(f'Negative:{fp_neg} ({pct_neg})| Neutral: {fp_neu} ({pct_neu})| Positive: {fp_pos} ({pct_pos})')

    print("============\n\n\n")

In [4]:
# Report the class breakdown for the BERT-CNN predictions.
compare(
    model_result=bert_cnn_result.predictions,
    model_preds=bert_cnn_preds,
    model_type='bert-cnn',
)
# compare(roberta_result.predictions, roberta_preds, roberta_model_type)

NameError: name 'compare' is not defined

NameError: name 'compare' is not defined