In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# project folder on Drive can be reached through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the project folder on Drive the working directory: the relative
# 'data/train' and 'data/test' paths used when loading the corpus resolve against it.
%cd '/content/drive/My Drive/prep/'

In [3]:

from sklearn.datasets import load_files

def prepare_data(train_dir='data/train', test_dir='data/test', encoding='latin-1'):
  """Load the training and test corpora from two directory trees.

  Both directories are read with ``sklearn.datasets.load_files``; the texts come
  from its ``data`` attribute and the integer class ids from ``target``.

  Args:
    train_dir: Root folder of the training split. Defaults to ``'data/train'``.
    test_dir: Root folder of the test split. Defaults to ``'data/test'``.
    encoding: Encoding used to decode the files. Defaults to ``'latin-1'``.

  Returns:
    A 4-tuple ``(train_text, train_labels, test_text, test_labels)``.
  """
  train_ds = load_files(train_dir, encoding=encoding)
  test_ds = load_files(test_dir, encoding=encoding)

  return train_ds.data, train_ds.target, test_ds.data, test_ds.target

  
# Load both splits once; the tokenisation and dataset cells further down use these four names.
train_text, train_labels, test_text, test_labels = prepare_data()

In [4]:
# Sanity check: texts and labels of each split must have matching sizes.
# Printed one per line in the order train texts, train labels, test texts, test labels.
for split in (train_text, train_labels, test_text, test_labels):
    print(len(split))

1402
1402
602
602


In [5]:
# Inspect the container type of the labels (the recorded output shows numpy.ndarray).
type(train_labels)

numpy.ndarray

In [6]:
#@markdown install required libs
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 8.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 38.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYA

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import f1_score, classification_report

In [8]:
# Tokenizer and classifier are both created from the same pretrained checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# num_labels=2 requests a two-class sequence-classification model.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
import torch
import numpy as np

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def classes(self):
        """Return the label sequence exactly as it was passed in."""
        return self.labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Build one example: every encoding field as a tensor, plus ``labels``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is wrapped in a one-element list, so the tensor has shape (1,).
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Tokenize both splits with identical settings: truncation on, padding on,
# and a cap of 300 tokens per example.
tokenizer_options = dict(truncation=True, padding=True, max_length=300)
train_encodings = tokenizer(train_text, **tokenizer_options)
val_encodings = tokenizer(test_text, **tokenizer_options)

# Pair the encodings with their labels; the test split is used as the validation set.
train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, test_labels)

In [11]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
 
def compute_metrics(eval_pred):
   """Compute and print classification metrics for one evaluation pass.

   Args:
      eval_pred: A pair ``(logits, labels)``. The predicted class is the argmax
         of ``logits`` over the last axis.

   Returns:
      Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
      ``"f1"``; precision, recall and F1 are macro-averaged.
   """
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   # The dataset wraps every label in a one-element list, so labels can arrive
   # with shape (N, 1); flatten them so they line up with the 1-D predictions.
   labels = np.asarray(labels).reshape(-1)

   # zero_division=0 keeps the same value sklearn would return for a class that
   # is never predicted, but without emitting a warning per call.
   accuracy = accuracy_score(labels, predictions)
   precision = precision_score(labels, predictions, average='macro', zero_division=0)
   recall = recall_score(labels, predictions, average='macro', zero_division=0)
   f1 = f1_score(labels, predictions, average='macro', zero_division=0)
   print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

   # NOTE(review): assumes class id 0 is "negative" and 1 is "positive" -- confirm
   # against the target names reported by the data loader.
   target_class = ['negative', 'positive']
   # Passing labels=[0, 1] fixes the report to both classes; without it the call
   # raises when only one class occurs in labels and predictions, because the
   # number of target names no longer matches.
   print(classification_report(labels, predictions, labels=[0, 1],
                               target_names=target_class, zero_division=0))

   return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [12]:
from transformers import TrainingArguments, Trainer
  
# Hyper-parameters of the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
   # where checkpoints and logs are written
   output_dir='./sentiment_result',
   logging_dir='./logs',
   # optimisation schedule
   num_train_epochs=2,
   learning_rate=2e-5,
   weight_decay=0.01,
   # batch sizes
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
)

# The Trainer ties together the model, both datasets and the metric function.
trainer = Trainer(
   model=model,
   args=training_args,
   compute_metrics=compute_metrics,
   train_dataset=train_dataset,
   eval_dataset=val_dataset,
)

In [13]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()


***** Running training *****
  Num examples = 1402
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 176


Step,Training Loss


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




NameError: ignored

In [14]:
# Persist the fine-tuned model in two forms:
#   1. the whole model object pickled to Drive (read back later with torch.load);
#   2. the Trainer's own checkpoint, written to its output directory.
drive_model_path = '/content/drive/My Drive/prep/model-sentim.pth'
torch.save(model, drive_model_path)
trainer.save_model()

Saving model checkpoint to ./sentiment_result
Configuration saved in ./sentiment_result/config.json
Model weights saved in ./sentiment_result/pytorch_model.bin


In [15]:
# Evaluate on the Trainer's eval dataset; compute_metrics prints the report and
# the returned metrics dict is shown as the cell output.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 602
  Batch size = 16


accuracy = 0.809, precision = 0.820, recall = 0.809, f1 = 0.807
              precision    recall  f1-score   support

    negative       0.88      0.71      0.79       301
    positive       0.76      0.90      0.83       301

    accuracy                           0.81       602
   macro avg       0.82      0.81      0.81       602
weighted avg       0.82      0.81      0.81       602



{'epoch': 2.0,
 'eval_accuracy': 0.8089700996677741,
 'eval_f1': 0.8072419985800003,
 'eval_loss': 0.4342592656612396,
 'eval_precision': 0.8204620386482279,
 'eval_recall': 0.8089700996677741,
 'eval_runtime': 606.4451,
 'eval_samples_per_second': 0.993,
 'eval_steps_per_second': 0.063}

In [6]:
import torch
path_to_model = '/content/drive/My Drive/prep/model-sentim.pth'
# NOTE: this file is a pickle of the complete model object. torch.load unpickles
# it, which can execute arbitrary code -- only load files you created yourself.
# map_location='cpu' lets a model that was saved from a GPU runtime load on a
# CPU-only one, and keeps the weights on the same device as the tensors the
# tokenizer returns at inference time.
# .eval() switches off dropout; the model was pickled right after training, so it
# is presumably still in training mode when unpickled.
model_m = torch.load(path_to_model, map_location='cpu').eval()

In [None]:
!pip install lime

In [7]:
import torch.nn.functional as F

def predictor(texts, batch_size=32):
  """Return class probabilities for raw texts, in the form LIME expects.

  Args:
    texts: A single string or an iterable of strings.
    batch_size: Number of texts sent through the model per forward pass. The
      explainer calls this function with a large list of perturbed texts, so
      the work is chunked to keep memory use bounded.

  Returns:
    A NumPy array with one row per text and one column per class; every row is
    the softmax of the model's logits.
  """
  import torch  # local import: this cell itself only imports torch.nn.functional

  if isinstance(texts, str):
    texts = [texts]
  else:
    texts = list(texts)
  if not texts:
    # Nothing to classify: return an empty (0, num_classes) array.
    return torch.empty((0, model_m.config.num_labels)).numpy()

  # Inference mode: disable dropout so repeated calls give the same result.
  model_m.eval()
  device = next(model_m.parameters()).device

  chunks = []
  # Without no_grad the outputs require grad and .numpy() raises a RuntimeError.
  with torch.no_grad():
    for start in range(0, len(texts), batch_size):
      encoded = tokenizer(texts[start:start + batch_size], max_length=300,
                          truncation=True, return_tensors="pt", padding=True)
      # Move the inputs to wherever the model's weights live.
      encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
      logits = model_m(**encoded).logits
      # Explicit dim: softmax over the class axis (implicit dim is deprecated).
      chunks.append(F.softmax(logits, dim=-1).cpu())

  return torch.cat(chunks).numpy()

In [8]:
# importing the libraries
import lime
import sklearn.ensemble
from __future__ import print_function
from lime import lime_text
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

# saving a list of strings version of the X_test object
ls_X_test = list(test_text)

# Class names must be an ordered sequence: position i has to name class id i, in
# the same order the classifier's probability columns use. A set has no defined
# order, so the two names could end up attached to the wrong class.
# The order below matches target_class in compute_metrics.
class_names = ['negative', 'positive']

# create the LIME explainer
# add the class names for interpretability
LIME_explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# Explain one example sentence: predictor is handed to the explainer as the
# classifier function, and the explanation is limited to 5 features.
STR = "The movie was awesome"
exp = LIME_explainer.explain_instance(STR, predictor, num_features=5)

In [None]:
# Display the explanation in the notebook output.
exp.show_in_notebook(text=True)