# Predict the sentiment of tweet text with TimeLM

This notebook contains the code to:
*   Predict the sentiment of each tweet's text with the [TimeLM model](https://github.com/cardiffnlp/timelms).
*   Compare the label statistics of the original B-T4SA 1.0 with those of the updated B-T4SA 1.0. This comparison is used to build the confusion matrix shown in the documentation.





In [None]:
# dataset import
from google.colab import drive
drive.mount('/content/drive')

!pip -q install transformers

import os
import sys
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.special import softmax
from IPython.display import display
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification

Mounted at /content/drive
[K     |████████████████████████████████| 4.7 MB 8.0 MB/s 
[K     |████████████████████████████████| 101 kB 8.1 MB/s 
[K     |████████████████████████████████| 596 kB 42.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 36.0 MB/s 
[?25h

In [None]:
# Hugging Face checkpoint name handed to the from_pretrained calls below
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Folder layout on the mounted Drive. Every directory keeps its trailing slash
# because file names are appended to it by plain string concatenation.
BASE_DIR = "/content/drive/MyDrive/Thesis/"
DATASET_DIR = f"{BASE_DIR}dataset/t4sa_2.0/"
NEW_PREDICTION_DIR = f"{DATASET_DIR}labeling/"
PREDICTION_DIR = f"{BASE_DIR}predictions/"

# Sentiment class name -> integer label
class_mapper = dict(Negative=0, Neutral=1, Positive=2)

# Predict text emotion

In [None]:
def preprocess(text):
  ''' Replace user mentions and links in a tweet with fixed placeholders.

  The text is split on single spaces. A token that starts with '@' and has at
  least one more character becomes '@user'; a token that starts with 'http'
  becomes 'http'; every other token is kept unchanged. The tokens are then
  joined back together with single spaces.
  '''
  def _placeholder(token):
    # A bare '@' is left alone: only real mentions are anonymised
    if token.startswith('@') and len(token) > 1:
      return '@user'
    if token.startswith('http'):
      return 'http'
    return token

  return " ".join(_placeholder(token) for token in text.split(" "))


In [None]:
# Pick the first CUDA GPU when one is visible, otherwise fall back to the CPU,
# and report both the availability flag and the chosen device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device.type == "cuda")
print(device)

False
cpu


## Predict B-T4SA 1.0

In [None]:
# Tweet texts: keep the ID and the text columns as two parallel lists
tweets_annot = pd.read_csv(DATASET_DIR + "raw_tweets_text.csv", delimiter=",")
id_all = tweets_annot["id"].to_list()
text_all = tweets_annot["text"].to_list()

# Image paths of B-T4SA with the label assigned by the previous text classifier
bt4sa = pd.read_csv(
    DATASET_DIR + "original_text_predictions/b-t4sa_all.txt",
    names=["path", "label"],
    delimiter=" ",
    header=None,
)

# The tweet ID is read from characters [11, 29) of each image path
id_bt4sa = [int(image_path[11:29]) for image_path in bt4sa['path'].to_list()]

# Several images can belong to the same tweet, so collapse repeated IDs
id_bt4sa_filtered = set(id_bt4sa)

# IDs from the text file that also appear in B-T4SA (order of the text file is kept)
intersection = [tweet_id for tweet_id in id_all if tweet_id in id_bt4sa_filtered]

In [None]:
# Keep, as two parallel lists, the ID and the text of every tweet whose ID
# belongs to B-T4SA (same order as in id_all / text_all)
id_bt4sa = [tweet_id for tweet_id in id_all if tweet_id in id_bt4sa_filtered]
text_bt4sa = [
    tweet_text
    for tweet_id, tweet_text in zip(id_all, text_all)
    if tweet_id in id_bt4sa_filtered
]

In [None]:
# Every distinct B-T4SA tweet ID must have been found among the tweet texts
assert len(id_bt4sa_filtered) == len(intersection)

print(len(intersection))

print(len(id_bt4sa))

# ID -> text lookups, for all tweets and for the B-T4SA subset
dict_all_tweet = dict(zip(id_all, text_all))
dict_tweet = dict(zip(id_bt4sa, text_bt4sa))

# (ID, text) pairs present in both lookups; the subset must survive unchanged
intersect_dict = dict(dict_tweet.items() & dict_all_tweet.items())

assert dict_tweet == intersect_dict
print(len(intersect_dict), len(dict_tweet), len(dict_all_tweet))

In [None]:
# Load the tokenizer, the configuration and the sequence-classification model
# of the checkpoint named by MODEL
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Place the model on the device selected earlier (GPU or CPU)
model = model.to(device)

In [None]:
# Accumulators for the predictions: one entry per processed tweet
id_preds = []
neg_preds = []
neu_preds = []
pos_preds = []

# Running index used to name the backup files
counter = 0

# Inference only: torch.no_grad() keeps autograd from recording a graph for
# every forward pass, which saves memory and time.
with torch.no_grad():
  for i in tqdm(range(len(text_bt4sa))):
    tweet_id = id_bt4sa[i]  # not called `id`, which would shadow the builtin
    text = text_bt4sa[i]

    if dict_tweet[tweet_id] != text:  # Check that everything is correct (can be avoided)
      print("ERORR!")
      break

    # Preprocess the tweet
    text = preprocess(text)

    # Tokenize the text. Truncation keeps an unusually long text from
    # exceeding the model input size and aborting the whole run.
    # NOTE(review): 512 is the usual limit for RoBERTa-base checkpoints -
    # confirm against config.max_position_embeddings.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512).to(device)

    # Get the logits produced by the model
    output = model(**encoded_input)

    # Turn the logits of the single input into a probability distribution
    scores = softmax(output[0][0].cpu().numpy())

    # Scores are stored by position: 0 -> NEG, 1 -> NEU, 2 -> POS
    id_preds.append(tweet_id)
    neg_preds.append(scores[0])
    neu_preds.append(scores[1])
    pos_preds.append(scores[2])

    # Every 50 tweets save a backup holding all predictions collected so far
    if i%50==0 and i != 0:
      new_predictions = pd.DataFrame({'TWID': id_preds, 'NEG': neg_preds, 'NEU': neu_preds, 'POS': pos_preds})
      new_predictions.to_csv("bt4sa_predictions_backup_" + str(counter) + ".tsv", sep="\t", index = False)
      counter+=1

# Save the dataframe with all the predictions
new_predictions = pd.DataFrame({'TWID': id_preds, 'NEG': neg_preds, 'NEU': neu_preds, 'POS': pos_preds})
new_predictions.to_csv(DATASET_DIR + "bt4sa_predictions_con_overlap.tsv", sep="\t", index = False)

## Predict T4SA 2.0

In [None]:
# Tweet texts of T4SA 2.0; lines pandas cannot parse are skipped
tweets_annot = pd.read_csv(DATASET_DIR + "raw_tweets_text_final.csv", on_bad_lines='skip')
display(tweets_annot)

# Parallel lists of IDs and texts, plus an ID -> text lookup
id_all, text_all = (tweets_annot[column].to_list() for column in ("id", "text"))

dict_tweet = dict(zip(id_all, text_all))

In [None]:
# Load the tokenizer, the configuration and the sequence-classification model
# of the checkpoint named by MODEL
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Place the model on the selected device (a GPU only when one is available)
model = model.to(device)

In [None]:
# Accumulators for the predictions: one entry per processed tweet
id_preds = []
neg_preds = []
neu_preds = []
pos_preds = []

# Running index used to name the backup files
counter = 0

# Position in text_all where this run starts, and how often a backup is written
START_INDEX = 2500000
BACKUP_EVERY = 50000

# Inference only: torch.no_grad() keeps autograd from recording a graph for
# every forward pass, which saves memory and time.
with torch.no_grad():
  for i in tqdm(range(START_INDEX, len(text_all))):
    tweet_id = id_all[i]  # not called `id`, which would shadow the builtin
    text = text_all[i]
    try:
      if dict_tweet[tweet_id] != text:  # Check that everything is correct (can be avoided)
        print("ERORR! WIth text", dict_tweet[tweet_id])
        raise Exception

      # Preprocess the tweet
      text = preprocess(text)

      # Tokenize the text. Truncation lets an unusually long text be scored
      # on its first tokens instead of failing in the model.
      # NOTE(review): 512 is the usual limit for RoBERTa-base checkpoints -
      # confirm against config.max_position_embeddings.
      encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512).to(device)

      # Get the logits produced by the model
      output = model(**encoded_input)

      # Turn the logits of the single input into a probability distribution
      scores = softmax(output[0][0].cpu().numpy())

      # Scores are stored by position: 0 -> NEG, 1 -> NEU, 2 -> POS
      id_preds.append(tweet_id)
      neg_preds.append(scores[0])
      neu_preds.append(scores[1])
      pos_preds.append(scores[2])

    except Exception:
      # Best effort: report the failure and record -1 in every score column
      # so the tweet still gets a row and the run continues
      print(sys.exc_info())
      id_preds.append(tweet_id)
      neg_preds.append(-1)
      neu_preds.append(-1)
      pos_preds.append(-1)

    # Every BACKUP_EVERY tweets save a backup holding all predictions collected so far
    if i%BACKUP_EVERY==0 and i != START_INDEX:
      print(f"Saving backup after {i} images...")
      new_predictions = pd.DataFrame({'TWID': id_preds, 'NEG': neg_preds, 'NEU': neu_preds, 'POS': pos_preds})
      new_predictions.to_csv(NEW_PREDICTION_DIR + "t4sa2.0_predictions_latest_backup_test_" + str(counter) + ".tsv", sep="\t", index = False)
      counter+=1

# Save the dataframe with all the predictions
new_predictions = pd.DataFrame({'TWID': id_preds, 'NEG': neg_preds, 'NEU': neu_preds, 'POS': pos_preds})
new_predictions.to_csv(NEW_PREDICTION_DIR + "bt4sa_predictions_latest_test_con_overlap.tsv", sep="\t", index = False)

 10%|█         | 49992/479630 [10:25<1:28:31, 80.88it/s]

Saving backup after 2550000 images...


 21%|██        | 100000/479630 [20:44<1:14:59, 84.38it/s]

Saving backup after 2600000 images...


 31%|███▏      | 149996/479630 [30:50<1:05:37, 83.72it/s]

Saving backup after 2650000 images...


 42%|████▏     | 199999/479630 [41:17<57:12, 81.47it/s]

Saving backup after 2700000 images...


 52%|█████▏    | 249997/479630 [51:50<52:06, 73.46it/s]

Saving backup after 2750000 images...


 63%|██████▎   | 299997/479630 [1:02:22<37:09, 80.59it/s]

Saving backup after 2800000 images...


 73%|███████▎  | 349993/479630 [1:12:48<26:24, 81.84it/s]

Saving backup after 2850000 images...


 83%|████████▎ | 399996/479630 [1:23:15<15:50, 83.80it/s]

Saving backup after 2900000 images...


 94%|█████████▍| 449992/479630 [1:33:20<05:49, 84.77it/s]

Saving backup after 2950000 images...


100%|██████████| 479630/479630 [1:39:19<00:00, 80.48it/s]


# Create files with predictions

## Merge multiple files

In [None]:
# Merge different backup files.
# os.listdir returns names in arbitrary order, so the list is sorted: the
# concatenation order decides which row a later drop_duplicates(keep="first")
# retains, and it must not depend on the filesystem.
file_list = sorted(
    f for f in os.listdir(NEW_PREDICTION_DIR)
    if os.path.isfile(os.path.join(NEW_PREDICTION_DIR, f))
)
print(f"Merging {len(file_list)} files...")

# One dataframe per tab-separated prediction file
df_list = [
    pd.read_csv(os.path.join(NEW_PREDICTION_DIR, file_name), delimiter="\t")
    for file_name in file_list
]

# Concat all on the 0 axis
tweets_preds_all = pd.concat(df_list, axis=0)

Merging 35 files...


  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Rows collected from all the merged files
print("Merged size:", len(tweets_preds_all))

# Tweet IDs are handled as text from here on
tweets_preds_all["TWID"] = tweets_preds_all["TWID"].apply(str)

# The files overlap, so the same TWID can appear several times (possibly with
# different precision in the scores): keep only its first occurrence
tweets_preds_all = tweets_preds_all.drop_duplicates(subset="TWID", keep="first")
print("Size after duplicate removal:", len(tweets_preds_all))

# Cleaning pipeline, applied in this order:
#   1. drop rows containing NaN values
#   2. drop tweets whose ID starts with 15427 (collected after the 30th of June)
#   3. keep only IDs starting with "1", discarding malformed ones
#   4. sort by TWID (coincides with chronological order) and renumber the rows
tweets_preds_all = (
    tweets_preds_all
    .dropna()
    .loc[lambda preds: ~preds['TWID'].str.lower().str.startswith("15427")]
    .loc[lambda preds: preds['TWID'].str.lower().str.startswith("1")]
    .sort_values("TWID")
    .reset_index(drop=True)
)

already_inferred_tweets = tweets_preds_all["TWID"].tolist()
print("Size after NaN removal:", len(tweets_preds_all))
print("Number of already labeled tweets:", len(already_inferred_tweets))

# Reload the tweet texts and express their IDs as text as well
tweets_annot = pd.read_csv(DATASET_DIR + "raw_tweets_text_final.csv", lineterminator='\n')
tweets_annot["id"] = tweets_annot["id"].apply(str)

total_tweets = tweets_annot["id"].tolist()
print("Size of total tweets:", len(total_tweets))

# Tweets that have a text but no prediction yet
remaining_tweets = list(set(total_tweets) - set(already_inferred_tweets))
print("Remaining tweets to label:", len(remaining_tweets))

Merged size: 11886424
Size after duplicate removal: 2979626
Size after NaN removal: 2970114
Number of already labeled tweets: 2970114
Size of total tweets: 2970114
Remaining tweets to label: 0


In [None]:
# Show each prediction next to the tweet text it belongs to (joined on the tweet ID)
display(
    tweets_preds_all
    .merge(tweets_annot, how='inner', left_on='TWID', right_on='id')
    .drop(columns='id')
)

Unnamed: 0,TWID,NEG,NEU,POS,text
0,1510943913678450696,0.003139,0.065183,0.931678,never forget when Kanye and Jay Z performed N*...
1,1510943913686691848,0.003431,0.055094,0.941475,"When it comes to style in warm day, less reall..."
2,1510943922100781057,0.005960,0.062065,0.931975,Grey Suit is so beautiful ☺💕\n\nhttps://t.co/n...
3,1510943947245371393,0.017227,0.494703,0.488071,A little D heavy..\n\nZach Tom will be an athl...
4,1510943951443820547,0.045747,0.912404,0.041849,.@SarahLGates1\n\nIslam &amp; Christianity use...
...,...,...,...,...,...
2970109,1542699945584922624,0.005857,0.314748,0.679395,Watch the sunset with me. \n\nLine art by @gab...
2970110,1542699962345086977,0.008627,0.076977,0.914396,"D'instinct j'ai lu ""RM is the best leader"" 😂 h..."
2970111,1542699970750337024,0.005689,0.038025,0.956286,Imam Ali and his wife Fatima are the most loya...
2970112,1542699974944645121,0.016026,0.920362,0.063612,༺ཻ.RECOMMENDED ! \n────➤\n ✦ @DaniellaPutri_...


In [None]:
# Write the cleaned predictions to disk, then read the file back so the cell
# displays what was actually stored
tweets_preds_all.to_csv(f"{DATASET_DIR}t4sa2.0_text_prediction_final.csv", index=False)
pd.read_csv(f"{DATASET_DIR}t4sa2.0_text_prediction_final.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,TWID,NEG,NEU,POS
0,1510943913678450696,0.003139,0.065183,0.931678
1,1510943913686691848,0.003431,0.055094,0.941475
2,1510943922100781057,0.005960,0.062065,0.931975
3,1510943947245371393,0.017227,0.494703,0.488071
4,1510943951443820547,0.045747,0.912404,0.041849
...,...,...,...,...
2970109,1542699945584922624,0.005857,0.314748,0.679395
2970110,1542699962345086977,0.008627,0.076977,0.914396
2970111,1542699970750337024,0.005689,0.038025,0.956286
2970112,1542699974944645121,0.016026,0.920362,0.063612


# Analysis of two predictions

In [None]:
def print_length_statistics(annot, mapper=None):
  ''' Print how many rows of `annot` carry each sentiment label.

  Args:
    annot: DataFrame with a 'label' column that is compared against the
      integer labels of the mapping.
    mapper: optional dict from class name to integer label. When omitted,
      the notebook-level `class_mapper` is used.
  '''
  # The mapping is looked up under its real name, `class_mapper`;
  # `classes_mapper` is not defined anywhere and raised a NameError
  if mapper is None:
    mapper = class_mapper
  for class_name, class_label in mapper.items():
    print(f"Length of {class_name}: {len(annot[annot['label'] == class_label]):,}")

In [None]:
# Updated labels (tab-separated file) and original labels (space-separated
# "path label" rows without a header line)
new_predictions = pd.read_csv(f"{DATASET_DIR}new_text_predictions/bt4sa/b-t4sa_all.tsv", delimiter="\t")
original_predictions = pd.read_csv(
    f"{DATASET_DIR}original_text_predictions/b-t4sa_all.txt",
    names=["path", "label"],
    delimiter=" ",
    header=None,
)

# Plain lists of the new and of the old labels
new_label = new_predictions['label'].tolist()
original_label = original_predictions['label'].tolist()

In [None]:
# Class sizes of the original and of the updated labeling, each under its heading
for heading, predictions in (
    ("Print statistics of original predictions:", original_predictions),
    ("\n\nPrint statistics of updated predictions:", new_predictions),
):
  print(heading)
  print_length_statistics(predictions)

Print statistics of original predictions:
Length of Negative: 156,862
Length of Neutral: 156,862
Length of Positive: 156,862


Print statistics of updated predictions:
Length of Negative: 95,272
Length of Neutral: 182,295
Length of Positive: 193,019


In [None]:
# the base polarity considered is the original one

# Confusion matrix of the relabeling: the row is the original label, the
# column is the new label (0 for NEG, 1 for NEU, 2 for POS)
count_diff_class = [[0, 0, 0] for _ in range(3)]

# Number of positions whose label changed
count_diff = 0

# Compare all the new labels with the old ones, position by position
for i in range(len(new_label)):
  old, new = original_label[i], new_label[i]
  # Every pair is counted in the matrix; only real changes count as differences
  count_diff_class[old][new] += 1
  if old != new:
    count_diff += 1

# Guard the percentage against an empty label list
total = len(original_label)
percentage_changed = int(count_diff / total * 100) if total else 0

# Print results obtained
print("Total differences:", count_diff)
print("Total equal:", total - count_diff)
print(f"Percentage changed: {percentage_changed}%\n")

print("#Negative --> Positive:", count_diff_class[0][2])
print("#Negative --> Neutral:", count_diff_class[0][1], '\n')

print("#Neutral --> Positive:", count_diff_class[1][2])
print("#Neutral --> Negative:", count_diff_class[1][0], '\n')

print("#Positive --> Negative:", count_diff_class[2][0])
print("#Positive --> Neutral:", count_diff_class[2][1])

Total differences: 105044
Total equal: 365542
Percentage changed: 22%

#Negative --> Positive: 20673
#Negative --> Neutral: 46713 

#Neutral --> Positive: 24832
#Neutral --> Negative: 3478 

#Positive --> Negative: 2318
#Positive --> Neutral: 7030
[[89476, 46713, 20673], [3478, 128552, 24832], [2318, 7030, 147514]]
