## Classification of tweets with personal content


In [1]:
pip install spacy

Error processing line 1 of /usr/local/lib/python3.6/dist-packages/vision-1.0.0-nspkg.pth:

  Traceback (most recent call last):
    File "/usr/lib/python3.6/site.py", line 174, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
    File "<frozen importlib._bootstrap>", line 568, in module_from_spec
  AttributeError: 'NoneType' object has no attribute 'loader'

Remainder of file ignored
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import spacy

In [2]:
import pandas as pd
import torch
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
personal_tweets = pd.read_csv("../input/data/personal_tweets_labeled.csv", sep=";").sample(n=500, random_state=0)
personal_tweets.head()

Unnamed: 0,id,text,"personal(0=no,1=yes)"
3442,2044,High salt intake linked to diabetes risk : Stu...,0
3472,2074,Noodle the schnauzer was my beloved pal and se...,0
1789,391,The latest The Podiatry Foot health care Daily...,0
3735,2337,ED Recovery News is out ! HTTPURL #bulimia #he...,0
214,214,"USER Type 1 diabetes , depression , and abnorm...",1


### Preprocess tweets 
according to the preprocessing of the model we want to use -> BERTweet : https://huggingface.co/vinai/bertweet-base

In [4]:
######### Normalization function of BERTweet ###########

tokenizer = TweetTokenizer()

# https://huggingface.co/vinai/bertweet-base
def normalizeToken(token):
    lowercased_token = token.lower()
    if token.startswith("@"):
        return "@USER"
    elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
        return "HTTPURL"
    elif len(token) == 1:
        return demojize(token)
    else:
        if token == "’":
            return "'"
        elif token == "…":
            return "..."
        else:
            return token

        
def normalizeTweet(tweet):
    
    tokens = tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
    normTweet = " ".join([normalizeToken(token) for token in tokens])

    normTweet = normTweet.replace("cannot ", "can not ").replace("n't ", " n't ").replace("n 't ", " n't ").replace("ca n't", "can't").replace("ai n't", "ain't")
    normTweet = normTweet.replace("'m ", " 'm ").replace("'re ", " 're ").replace("'s ", " 's ").replace("'ll ", " 'll ").replace("'d ", " 'd ").replace("'ve ", " 've ")
    normTweet = normTweet.replace(" p . m .", "  p.m.") .replace(" p . m ", " p.m ").replace(" a . m .", " a.m.").replace(" a . m ", " a.m ")

    normTweet = re.sub(r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2", normTweet)
    normTweet = re.sub(r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2", normTweet)
    normTweet = re.sub(r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2", normTweet)
    
    return " ".join(normTweet.split())

In [5]:
############# Split data in Train / Test / Validation ##########

from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

text = personal_tweets["text"].map(normalizeTweet).values.tolist()
labels = personal_tweets["personal(0=no,1=yes)"].values.tolist()

# split data into training, test and validation set
train_texts, test_texts, train_labels, test_labels = train_test_split(text, labels, test_size=0.33, random_state=22)
train_texts, val_texts, train_labels, val_labels = train_test_split(text, labels, test_size=0.2, random_state=22)
print("Train: {}".format(len(train_texts)))
print("Val: {}".format(len(val_texts)))
print("Test: {}".format(len(test_texts)))


Train: 400
Val: 100
Test: 165


In [6]:
################ load tokenizer ################

#TODO: Search this model on Huggingface, and read about the specs
bert_model_name = "vinai/bertweet-base" 

tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/824k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
############## encode tweets #####################

# truncation, padding : ensures that all sentences are padded to the same length and are truncated to be no longer model's max input lengts
# => allows to feed batches of sequences 
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

#print(val_encodings.keys())
#train_encodings

In [12]:
# PyTorch neural networks require the input via the "Dataset" class
# Transform class labels + encodings into Pytorch DataSet object (including __len__, __getitem__)

class TweetDataSet(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels != None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        if self.labels != None:
            return len(self.labels)
        else:
            return len((self.encodings["input_ids"]))

        
train_dataset = TweetDataSet(train_encodings, train_labels)
val_dataset = TweetDataSet(val_encodings, val_labels)
test_dataset = TweetDataSet(test_encodings, test_labels)
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))


400
100
165


### Training

In [16]:
##################### TRAINING #########################

# adjust the Training parameters as you like:
# https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training parameters
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.1,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy = "epoch"     

)

# AutoModelForSequenceClassification adds a fully connected layer after BERT
# We want to use a pre-trained transformer model (BERTweet) and 
# fine-tune it for sentence (tweet) classification
model = AutoModelForSequenceClassification.from_pretrained(bert_model_name)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # validation dataset
)

trainer.train()     # starts training

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/vinai/bertweet-base/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/356366feedcea0917e30f7f235e1e062ffc2d28138445d5672a184be756c8686.a2b6026e688d1b19cebc0981d8f3a5b1668eabfda55b2c42049d5eac0bc8cb2d
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
 

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6871,0.677903,0.72,0.222222,0.571429,0.137931


***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=13, training_loss=0.6856365754054143, metrics={'train_runtime': 2.0283, 'train_samples_per_second': 197.206, 'train_steps_per_second': 6.409, 'total_flos': 15005552376000.0, 'train_loss': 0.6856365754054143, 'epoch': 1.0})

In [17]:
########### Evaluate on test set ##################


eval_output = trainer.evaluate(test_dataset) 
eval_output

***** Running Evaluation *****
  Num examples = 165
  Batch size = 16


{'eval_loss': 0.6817895174026489,
 'eval_accuracy': 0.6727272727272727,
 'eval_f1': 0.15625,
 'eval_precision': 0.5555555555555556,
 'eval_recall': 0.09090909090909091,
 'eval_runtime': 0.2497,
 'eval_samples_per_second': 660.742,
 'eval_steps_per_second': 44.049,
 'epoch': 1.0}

In [None]:
############# Save your model ###############
#trainer.save_model("output/myPersonalClassifier.model")

## Apply your trained model on the cancer tweets

Alternatively, you can use the model I fine-tuned on more epochs:

In [22]:
# OPTIONAL: Either continue with your trained model or load this fine-tuned model (personal content binary classifier)
 
model_trained = AutoModelForSequenceClassification.from_pretrained("../input/models/Bert_Vinai_personal_08022021")

loading configuration file ../input/models/Bert_Vinai_personal_08022021/config.json
Model config RobertaConfig {
  "_name_or_path": "./results/checkpoint-216",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertweetTokenizer",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 64001
}

loading weights file ../input/models/Bert_Vinai_personal_08022021/pytorch_model.bin
All model checkpoint weights were used when initializing 

### Test trained model on cancer tweets

For you to play! 

The cancer tweets are in following file:
"data/Kap_Code_Bladder_Cancer_data_light_2021_11_17_2021_11_17.xlsx"

In [27]:
pip install openpyxl

Error processing line 1 of /usr/local/lib/python3.6/dist-packages/vision-1.0.0-nspkg.pth:

  Traceback (most recent call last):
    File "/usr/lib/python3.6/site.py", line 174, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
    File "<frozen importlib._bootstrap>", line 568, in module_from_spec
  AttributeError: 'NoneType' object has no attribute 'loader'

Remainder of file ignored
Defaulting to user installation because normal site-packages is not writeable
Collecting openpyxl
  Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)
     |################################| 242 kB 4.9 MB/s            
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.9
Note: you may need to restart the kernel to use updated packages.


In [28]:
############ load cancer tweets #################
# Load the presence corpus : 
presence_corpus = pd.read_excel("../input/data/Kap_Code_Bladder_Cancer_data_light_2021_11_17_2021_11_17.xlsx", engine="openpyxl")
presence_corpus.head()

Unnamed: 0,post_id,post_date,message
0,1,2021-03-04 23:40:10.0,"@DrLAckerman @BundrickStewart What happens is,..."
1,2,2021-03-04 23:38:19.0,God isn't this too much for me to carry?! I've...
2,3,2021-03-04 22:25:22.0,@ItIsLostAdrift i mean technically when u have...
3,4,2021-03-04 22:24:21.0,Congratulations to Dr. Justin Sausville and te...
4,5,2021-03-04 22:15:16.0,Don't miss @EngIPM Clinical Director Dr. Cora ...


In [29]:
################ Normalize cancer tweets #####################
############# Split data in Train / Test / Validation ##########

from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

text = presence_corpus["message"].map(normalizeTweet).values.tolist()
# => allows to feed batches of sequences 
cancer_encodings = tokenizer(text, truncation=True, padding=True)


In [None]:
################# Tokenize and DataSet ##################

In [None]:
################# Predict (apply classifier) ##############

In [None]:
########## Apply softmax on predictions to get probabilites #####################

softmax = torch.nn.Softmax(-1)



In [None]:
############ Save only personal (label: 1) tweets to file #######################