Preprocessing

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np

In [None]:
# Make the Google Drive folder holding the competition files visible to this runtime
drive.mount('/content/gdrive/')

# Load the train/test membership table and the emotion annotations
data_identification = pd.read_csv("/content/gdrive/MyDrive/gideon/data_identification.csv")
data_emotion = pd.read_csv("/content/gdrive/MyDrive/gideon/emotion.csv")

# Left-join on tweet_id so every identified tweet is kept (rows without an emotion get NaN),
# then expose the emotion column under the name 'label'
merged_data = (
    data_identification
    .merge(data_emotion, on=['tweet_id'], how='left')
    .rename(columns={'emotion': 'label'})
)
merged_data.head()

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


Unnamed: 0,tweet_id,identification,label
0,0x28cc61,test,
1,0x29e452,train,joy
2,0x2b3819,train,joy
3,0x2db41f,test,
4,0x2a2acc,train,trust


In [None]:
import json
import pandas as pd

# Path to the raw tweet dump; the loop below expects one JSON document per line
file_path = '/content/gdrive/MyDrive/gideon/tweets_DM.json'


def _empty_lists_to_none(obj):
    """JSON object hook: replace every empty-list value with None, keep all other values."""
    return {k: (None if isinstance(v, list) and len(v) == 0 else v) for k, v in obj.items()}


# Collected fields, in file order
tweet_ids = []
texts = []

# Stream the file line by line instead of materialising it with readlines();
# the encoding is explicit so decoding does not depend on the platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Blank lines carry no record
            continue
        try:
            data = json.loads(line, object_hook=_empty_lists_to_none)
        except json.JSONDecodeError:
            # Handle invalid JSON entries
            print(f"Skipping invalid JSON entry: {line.strip()}")
            continue

        # `or {}` also covers keys that are present but hold None
        tweet = (data.get('_source') or {}).get('tweet') or {}
        tweet_ids.append(tweet.get('tweet_id'))
        texts.append(tweet.get('text'))

# Create a Pandas DataFrame
df = pd.DataFrame({
    'tweet_id': tweet_ids,
    'text': texts
})

# Drop a 'text' column left by a previous run of this cell so that re-executing it
# does not produce duplicated text_x / text_y columns
merged_data = pd.merge(
    merged_data.drop(columns=['text'], errors='ignore'),
    df,
    on=['tweet_id'],
    how='left',
)


In [None]:
# Preview the merged table to check that the 'text' column was attached
merged_data.head()

Unnamed: 0,tweet_id,identification,label,text
0,0x28cc61,test,,@Habbo I've seen two separate colours of the e...
1,0x29e452,train,joy,Huge Respect🖒 @JohnnyVegasReal talking about l...
2,0x2b3819,train,joy,Yoooo we hit all our monthly goals with the ne...
3,0x2db41f,test,,@FoxNews @KellyannePolls No serious self respe...
4,0x2a2acc,train,trust,@KIDSNTS @PICU_BCH @uhbcomms @BWCHBoss Well do...


In [None]:
# Cache the merged table on Drive; the next cell reloads it from this file
merged_data.to_pickle("/content/gdrive/MyDrive/gideon/merged_data_distilBERT.pkl")

In [None]:
# Mount Drive again so the notebook can be resumed from this cell (presumably in a fresh runtime)
drive.mount('/content/gdrive/')
# Read the data
# NOTE: unpickling can execute arbitrary code; only load pickle files written by this notebook
merged_data = pd.read_pickle("/content/gdrive/MyDrive/gideon/merged_data_distilBERT.pkl")

Mounted at /content/gdrive/


In [None]:
# Separate merged_data into training and test data.
# The 'identification' column is removed with a non-inplace drop, which returns a new,
# independent frame; the previous inplace drop on the filtered slice triggered pandas'
# SettingWithCopyWarning.
train_data = merged_data[merged_data['identification'] == 'train'].drop(columns=['identification'])
test_data = merged_data[merged_data['identification'] == 'test'].drop(columns=['identification'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.drop(['identification'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.drop(['identification'], axis=1, inplace=True)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the emotion strings as integer class ids
label_encoder = LabelEncoder()
numerical_labels = label_encoder.fit_transform(train_data['label'])

# assign() builds a new frame rather than writing into an object pandas may regard as a
# slice of merged_data, which is what raised SettingWithCopyWarning before
train_data = train_data.assign(label=numerical_labels)

# Mapping between original labels and numerical values: the id of a class is its position
# in label_encoder.classes_
label_mapping = {str(name): code for code, name in enumerate(label_encoder.classes_)}
print("Label Mapping:", label_mapping)

Label Mapping: {'anger': 0, 'anticipation': 1, 'disgust': 2, 'fear': 3, 'joy': 4, 'sadness': 5, 'surprise': 6, 'trust': 7}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['label'] = numerical_labels


In [None]:
# Preview the training rows; 'label' now holds the integer class ids
train_data.head()

Unnamed: 0,tweet_id,label,text
1,0x29e452,4,Huge Respect🖒 @JohnnyVegasReal talking about l...
2,0x2b3819,4,Yoooo we hit all our monthly goals with the ne...
4,0x2a2acc,7,@KIDSNTS @PICU_BCH @uhbcomms @BWCHBoss Well do...
5,0x2a8830,4,Come join @ambushman27 on #PUBG while he striv...
6,0x20b21d,1,@fanshixieen2014 Blessings!My #strength little...


Splitting dataset

In [None]:
from sklearn.model_selection import train_test_split
# Splitting train_data to train_data and val_data: 20% of the labelled rows are held out,
# and the fixed random_state makes the split repeatable.
# NOTE(review): train_data is rebound here, so re-running this cell splits the already
# reduced training set again; the split is also not stratified by label — confirm intended.
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

In [None]:
# # Random undersampling
# from imblearn.under_sampling import RandomUnderSampler

# X_train = train_data.drop('label', axis=1)
# y_train = train_data['label']

# rus = RandomUnderSampler(random_state=0)
# X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# # Creating a new DataFrame with the resampled data
# resampled_data = pd.concat([pd.DataFrame(X_resampled, columns=X_train.columns), pd.DataFrame(y_resampled, columns=['label'])], axis=1)

# # Display the resampled DataFrame
# print(resampled_data)

        tweet_id                                               text  label
0       0x242d34  Cant believe an older client claimed #MYPHOTOS...      0
1       0x1d7155  @foxfriendsfirst I am becoming racist!! sick o...      0
2       0x35dc4d  Literally just wasted 9$ on a car wash bc all ...      0
3       0x2ce684                        Took an L on that test <LH>      0
4       0x27ff00  @SamsungMobile @O2  free wireless charger offe...      0
...          ...                                                ...    ...
255219  0x374f26  #ItsAnUglyRealityBut we must accept what we ca...      7
255220  0x1d7743                  @DonnieWahlberg Always <LH> ❤❤❤❤❤      7
255221  0x293f1a        ##grateful for family life and friends.....      7
255222  0x29b9cd  @worldskillsuk @iansnedden @philDixon12 very w...      7
255223  0x2d41a7  We’ve got digital hugs, and plenty of thanks c...      7

[255224 rows x 3 columns]


In [None]:
# Keep the tweet ids of each split aside (the test ids are needed for the submission file)
train_tweet_id = train_data['tweet_id']
val_tweet_id = val_data['tweet_id']
test_tweet_id = test_data['tweet_id']

# Remove the id column with non-inplace drops; rebinding to the returned frame avoids the
# SettingWithCopyWarning that the inplace drop raised on test_data
train_data = train_data.drop(columns=['tweet_id'])
val_data = val_data.drop(columns=['tweet_id'])
test_data = test_data.drop(columns=['tweet_id'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.drop(['tweet_id'], axis=1, inplace=True)


In [None]:
import panel as pn
import warnings; warnings.filterwarnings('ignore')

def show_panel(df):
    """Return a Tabulator widget for the first 20 rows of ``df``.

    The table hides the index and paginates locally with 10 rows per page.
    """
    preview_rows = df.head(20)
    table_options = dict(show_index=False, pagination='local', page_size=10)
    return pn.widgets.Tabulator(preview_rows, **table_options)

# Load the Tabulator extension and set the theme used by all Tabulator widgets
pn.extension('tabulator')
pn.widgets.Tabulator.theme = 'bootstrap'

# Report the (rows, columns) shape of each split
print('Dataset information:')
print(f'Training data: {train_data.shape}')
print(f'Validation data: {val_data.shape}')
print(f'Test data: {test_data.shape}')

Dataset information:
Training data: (1164450, 2)
Validation data: (291113, 2)
Test data: (411972, 2)


In [None]:
# Check the column dtypes before converting the frames to a Hugging Face Dataset
train_data.dtypes

label     int64
text     object
dtype: object

Combine DataFrames into HuggingFace's Dataset

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [None]:
from datasets import Dataset,DatasetDict,Features,Value,ClassLabel

# Reset the row index of every split (drop=True discards the old index instead of
# keeping it as a column)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
# The former `val_data['label'].fillna('-1', inplace=True)` was removed: validation rows are
# split off from the already-encoded training rows, so there was nothing for it to fill.

# Class names must be listed in integer-id order. The ids were produced by LabelEncoder,
# whose printed mapping is anger=0, anticipation=1, disgust=2, fear=3, joy=4, sadness=5,
# surprise=6, trust=7. The previous list put 'joy' last, so int2str mislabelled ids 4-7.
class_names = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']
ft = Features({'text': Value('string'), 'label': ClassLabel(names=class_names)})

# Combine Multiple Datasets
emotions = DatasetDict({
    "train": Dataset.from_pandas(train_data, features=ft),
    "validation": Dataset.from_pandas(val_data, features=ft),
    "test": Dataset.from_pandas(test_data, features=ft)
    })

# Convert a single DataFrame to a Dataset
# emotions = Dataset.from_pandas(train,features=ft)
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1164450
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 291113
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 411972
    })
})

In [None]:
# Training Data: pull the train split out of the DatasetDict and display its summary
train_ds = emotions["train"]
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 1164450
})

In [None]:
# Convert Dataset to DataFrame (don't forget to reset)
# set_format changes the output format of every split in `emotions`; a later cell calls
# emotions.reset_format() to undo it before tokenisation.
emotions.set_format(type="pandas")
# Slicing with [:] returns the whole train split in the format selected above
df = emotions["train"][:]
show_panel(df)

In [None]:
# Attach a readable emotion name to every row of the dataframe
def label_int2str(row):
    """Translate an integer class id into its name using the train split's 'label' feature."""
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)

df["label_name"] = df["label"].map(label_int2str)
show_panel(df)

Tokenization

In [None]:
from transformers import AutoTokenizer

# Sample sentence (not used further in this cell)
text = 'Tokenisation of text is a core task of NLP.'

# Load parameters of the tokeniser
# The tokenizer is loaded from the checkpoint named in model_ckpt
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Show tokeniser information
tokenizer

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# Print the tokenizer's vocabulary size, maximum sequence length, and the names of the
# inputs it produces for the model
print('')
print(f'Vocab size: {tokenizer.vocab_size}')
print(f'Max length: {tokenizer.model_max_length}')
print(f'Tokeniser model input names: {tokenizer.model_input_names}')


Vocab size: 30522
Max length: 512
Tokeniser model input names: ['input_ids', 'attention_mask']


In [None]:
# Undo the earlier set_format(type="pandas") so the splits are back in their default format
emotions.reset_format()

In [None]:
# Tokenisation helper applied batch-wise by Dataset.map
def tokenise(batch):
    """Tokenise the 'text' entries of ``batch`` with padding and truncation enabled."""
    encoded_batch = tokenizer(batch["text"], padding=True, truncation=True)
    return encoded_batch

# Run the tokeniser over every split (train, validation and test), 100000 rows per call
emotions_encoded = emotions.map(tokenise, batch_size=100000, batched=True)
print(emotions_encoded["train"].column_names)

Map:   0%|          | 0/1164450 [00:00<?, ? examples/s]

Map:   0%|          | 0/291113 [00:00<?, ? examples/s]

Map:   0%|          | 0/411972 [00:00<?, ? examples/s]

['text', 'label', 'input_ids', 'attention_mask']


Training a text classifier

1. Transformers as feature extractors

In [None]:
from transformers import AutoModel
import torch

# Load the pretrained checkpoint and move it to the GPU when one is available, else the CPU
model_ckpt = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [None]:
def extract_hidden_states(batch):
    """Return the first-position last hidden state of every item in ``batch``.

    Only the entries of ``batch`` named in ``tokenizer.model_input_names`` are moved to
    ``device`` and passed to ``model``. The result is a dict with one key,
    ``"hidden_state"``, holding a NumPy array.
    """
    expected_inputs = tokenizer.model_input_names

    # Move just the model inputs onto the compute device
    inputs = {}
    for name, tensor in batch.items():
        if name in expected_inputs:
            inputs[name] = tensor.to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of the sequence axis is the [CLS] token; copy it to host memory
    cls_hidden_state = outputs.last_hidden_state[:, 0].cpu().numpy()

    # Release the device-side outputs and cached blocks before the next batch
    del outputs
    torch.cuda.empty_cache()

    return {"hidden_state": cls_hidden_state}

# Return the model inputs and the label as torch tensors, as extract_hidden_states calls
# .to(device) on its inputs
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Extract last hidden states
# NOTE(review): no batch_size is passed, so map uses its default batch size — confirm it
# fits in GPU memory
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)
emotions_hidden["train"].column_names

Map:   0%|          | 0/1164450 [00:00<?, ? examples/s]

In [None]:
# Creating the feature matrix
# X_* hold the extracted hidden states, y_* the labels, for the train and validation splits
X_train = np.array(emotions_hidden["train"]["hidden_state"])
X_valid = np.array(emotions_hidden["validation"]["hidden_state"])
y_train = np.array(emotions_hidden["train"]["label"])
y_valid = np.array(emotions_hidden["validation"]["label"])
print(f'Training Dataset: {X_train.shape}')
print(f'Validation Dataset {X_valid.shape}')

In [None]:
# Let's check our dataset: display the training feature matrix
X_train

In [None]:
# Visualising the training data
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE


# Scale the data
X_scaled = MinMaxScaler().fit_transform(X_train)

# lower dimension transformation
# The fitted object is stored as `tsne`; it used to be assigned to `model`, which silently
# replaced the DistilBERT model used by extract_hidden_states. random_state makes the
# embedding repeatable.
# NOTE(review): t-SNE over the full training matrix may take very long — consider fitting
# on a random sample.
tsne = TSNE(n_components=2, random_state=42).fit(X_scaled)

# Create a df of 2D embeddings
df_embedding = pd.DataFrame(tsne.embedding_, columns=["X", "Y"])
df_embedding["label"] = y_train

# Plot TSNE
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style='whitegrid')

labels = emotions["train"].features["label"].names

# One panel per class: the grid is sized from the number of labels (the previous fixed
# 2x3 grid had 6 axes for 8 classes and failed with IndexError on the seventh)
n_cols = 4
n_rows = -(-len(labels) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False)
axes = axes.flatten()

# Group once, outside the loop, instead of re-grouping for every class
embedding_by_label = {code: frame for code, frame in df_embedding.groupby('label')}

for i, label in enumerate(labels):

    axes[i].set_title(f'{label}')

    df_embedding_sub = embedding_by_label.get(i)
    if df_embedding_sub is None:
        # No training point carries this class id; leave the panel empty
        continue

    axes[i].scatter(df_embedding_sub["X"],
                    df_embedding_sub["Y"],
                    lw=1,ec='k',alpha=0.2)

# Hide panels of the grid that have no class assigned
for unused_ax in axes[len(labels):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Training baseline model: logistic regression on the extracted hidden states
from sklearn.linear_model import LogisticRegression as LR

# `max_iter` is raised to 2000 to give the solver more iterations to converge
# (this makes convergence more likely; it does not guarantee it)
lr_clf = LR(max_iter = 2000)
lr_clf.fit(X_train, y_train)
# Validation predictions are kept for the confusion matrix plotted in the next cell
y_preds = lr_clf.predict(X_valid)
print(f'accuracy: {lr_clf.score(X_valid, y_valid)}')

In [None]:
# Plot confusion matrix

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_confusion_matrix(y_model, y_true, labels):
    """Draw a confusion matrix of predictions against true labels.

    Args:
        y_model: predicted class ids.
        y_true: true class ids.
        labels: display names for the matrix axes.

    The matrix is computed with normalize='true' and drawn on a 7x7 inch figure without
    a colour bar or grid.
    """
    normalised_cm = confusion_matrix(y_true, y_model, normalize='true')
    fig, ax = plt.subplots(figsize=(7,7))
    ConfusionMatrixDisplay(
        confusion_matrix=normalised_cm,
        display_labels=labels,
    ).plot(ax=ax, colorbar=False)
    ax.set_title("Confusion matrix")
    ax.grid(False)
    plt.show()

plot_confusion_matrix(y_preds, y_valid, labels)

Test-set prediction and submission

In [None]:
# Show the summary (columns and row count) of the tokenised test split
emotions_encoded["test"]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 411972
})

In [None]:
# Number of test texts that will be sent to the classifier
len(emotions_encoded["test"]["text"])

411972

In [None]:
from transformers import pipeline

# load from previously saved model
# NOTE(review): "distilbert-base-uncased-finetuned-emotion" is not created by any cell
# visible in this notebook — confirm the checkpoint exists before running.
# device=0 selects the first GPU when available, -1 falls back to the CPU.
classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-emotion",
    device=0 if torch.cuda.is_available() else -1,
)

# Predict test data
# Batched inference replaces the one-text-at-a-time pass that had to be interrupted;
# truncation keeps over-long texts within the model's maximum input length.
test_texts = emotions_encoded["test"]["text"]
preds = classifier(test_texts, batch_size=256, truncation=True)

# Show a few predictions only; echoing the full list would dump one dict per test row
preds[:5]

KeyboardInterrupt: ignored

In [None]:
# Extract only the label numbers (the part after the underscore in each predicted label)
y_preds = [int(prediction['label'].split('_')[1]) for prediction in preds]

# Class ids follow the LabelEncoder mapping printed during preprocessing
# (anger=0, anticipation=1, disgust=2, fear=3, joy=4, sadness=5, surprise=6, trust=7).
# The previous mapping listed 'joy' last, which mis-decoded ids 4-7.
# NOTE(review): assumes the classifier was fine-tuned on those integer labels — confirm.
class_mapping = {0: 'anger', 1: 'anticipation', 2: 'disgust', 3: 'fear',
                 4: 'joy', 5: 'sadness', 6: 'surprise', 7: 'trust'}
# Convert list of numbers to list of class names
y_preds = [class_mapping[pred] for pred in y_preds]

# Create submission dataframe
sub_df = pd.DataFrame({'tweet_id': test_tweet_id, 'prediction': y_preds})

In [None]:
# Give the columns their submission names and replace the inherited index with 0..n-1
sub_df = (
    sub_df
    .rename(columns={'tweet_id':'id', 'prediction':'emotion'})
    .reset_index(drop=True)
)

In [None]:
# Convert submission dataframe to csv (index=False leaves the row index out of the file)
sub_df.to_csv("/content/gdrive/MyDrive/gideon/submission_3.csv", index=False)

NameError: ignored