In [1]:
!pip install datasets
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets

In [36]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
import regex as re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from google.colab import files
from google.colab import drive



In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
def remove_emoji(comment):
    """Function to remove emojis.
        comment : data input ; str
        Taken from :
        https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b

    """

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', comment)

In [5]:
def P_data_cleaning(data, language, labelling):
    """Function to clean our data.
       data : data input ; pd.Series
       language : what language the comments are in (input in lowercase) : str
       labelling : if we want to label, we keep punctuation & stopwords
    """

    # REMOVE NAN ENTRIES
    data = data.dropna()

    # REMOVE COMMENTS THAT EXCEED CERTAIN LENGTH (350 for now)
    data = data[data.str.len() <= 350]


    # FOR GERMAN DATA : Change ö , ä , ü to oe, ae, ue
    data = data.str.replace("ö", "oe").str.replace("ä", "ae").str.replace("ü", "ue")

    # REMOVE NAMES FROM ANSWERS (in youtube comments scraper answers stored by @@)
    data = data.str.replace('@@\w+', '', regex=True)

    # REMOVING PUNCTUATION
    if labelling == False:
      data = data.str.replace('[^a-zA-Z0-9]',' ')

    # REMOVING EMOJIS
    data = data.apply(lambda x: remove_emoji(x))

    # LOWERCASE
    data = data.str.lower()

    # REMOVING STOPWORDS
    if labelling == False:
      data = data.apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords.words(language))]))


    return data

In [6]:
# Instantiate the tokenizer for multilingual data
tokenizer = DistilBertTokenizer.from_pretrained('philschmid/distilbert-base-multilingual-cased-sentiment')


def tokenize_function(examples, tokenizer=tokenizer):
    """
    Function to tokenize the data.
    examples : data to tokenize ; dict
    tokenizer : tokenizer to use ; DistilBertTokenizer
    """
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [29]:
def DistilBertModel(train_comments, train_labels,
                    val_comments, val_labels,
                    batch_size_train, batch_size_val,
                    epochs, num_labels, tokenizer=tokenizer):
    """
    Function to train a DistilBert model on the data.
    train_comments : comments for training ; lst of str
    train_labels : labels for training ; lst of int
    val_comments : comments for validation ; lst of str
    val_labels : labels for validation ; lst of int
    batch_size_train : batch size for training ; int
    batch_size_val : batch size for validation ; int
    epochs : number of epochs ; int
    num_labels : number of labels (for denoiser 2, for classification 3) ; int
    tokenizer : tokenizer to use ; DistilBertTokenizer
    """

    model = DistilBertForSequenceClassification.from_pretrained('philschmid/distilbert-base-multilingual-cased-sentiment', num_labels=num_labels)


    # Setup the Hugging Face Dataset Class
    train_dataset_dict = {"text": train_comments, "label": train_labels}
    val_dataset_dict = {"text": val_comments, "label": val_labels}

    train_dataset = Dataset.from_dict(train_dataset_dict)
    val_dataset = Dataset.from_dict(val_dataset_dict)

    # Apply the tokenizer to the datasets
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # Remove columns we do not need for training
    train_dataset = train_dataset.remove_columns(["text"])
    val_dataset = val_dataset.remove_columns(["text"])

    # Set the format of the datasets to PyTorch tensors
    train_dataset.set_format("torch")
    val_dataset.set_format("torch")


    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=epochs,              # total number of training epochs
        per_device_train_batch_size=batch_size_train,  # batch size for training
        per_device_eval_batch_size=batch_size_val,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.001,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        evaluation_strategy="steps",     # Evaluate every `eval_steps`
        eval_steps=10,                   # Number of steps between evaluations
        save_steps=10,                   # Save the model every `save_steps`
        load_best_model_at_end=True,     # Load the best model at the end of training#
        learning_rate= 1e-4,              # Set the learning rate
        metric_for_best_model="eval_loss", # Use evaluation loss to check how good our model is performing
        greater_is_better=False,
    )

    # Trainer
    trainer = Trainer(
        model=model,                         # model
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset,            # evaluation dataset
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # Early Stopping for Overfitting
    )

    # Train the model
    trainer.train()



    return model, tokenizer

In [8]:
def save_model(model, tokenizer, path):
    """
    Function to save the model
    model : model to save ; DistilBertForSequenceClassification
    tokenizer : tokenizer to save ; DistilBertTokenizer
    path : path to save the model ; str
    """

    model_save_path =  path
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

In [9]:
# Open a file upload dialog
# Select here all files to upload!
# If already uploaded, just press 'Cancel Upload'
uploaded = files.upload()

Saving andrea.csv to andrea.csv
Saving giovanni.csv to giovanni.csv
Saving Giuseppe.csv to Giuseppe.csv


In [10]:
# Set the path to the data
# On local machine use the relative path, for example
# path = 'NLP labelled data preview/english set/'
path = '/content/'

In [11]:
# Load the datasets
# Note that mac users seperated with , automatically, for windows users we have to specify ; as the seperator
english_test_dataset_labelled = pd.read_csv(path +'Giuseppe.csv')
english_test_dataset_labelled_2 = pd.read_csv(path + 'andrea.csv', sep= ';')
english_test_dataset_labelled_3 = pd.read_csv(path + 'giovanni.csv', sep= ';')


In [12]:
# Since the ones downloaded from windows are a bit messed up, we need to rename columns
english_test_dataset_labelled_2.columns = ["Comment", "Label"]
english_test_dataset_labelled_3.columns = ["Comment", "Label", "unnamed", "unnamed2"]

In [13]:
# Concatenate the datasets
english_test_dataset_labelled = pd.concat([english_test_dataset_labelled, english_test_dataset_labelled_2, english_test_dataset_labelled_3], ignore_index= True)


In [14]:
# Convert type of each comment to string
english_test_dataset_labelled['Comment'] = english_test_dataset_labelled['Comment'].astype(str)

In [15]:
# Remove comments that are not labelled (have NaN value in column 'Label')
english_test_dataset_labelled = english_test_dataset_labelled.dropna(subset=['Label'])
# Drop noisy comments, as they are not useful for training
english_test_dataset_labelled = english_test_dataset_labelled[english_test_dataset_labelled['Label'] != 'N']

In [16]:
# Do preprocessing steps
english_test_dataset_labelled['Comment'] = P_data_cleaning(english_test_dataset_labelled['Comment'], 'english', False)

In [17]:
# Seperate the two columns in the dataframe into 'comment' and 'label' in form of two lists
english_test_dataset_labelled_comments = english_test_dataset_labelled['Comment'].tolist()
english_test_dataset_labelled_labels = english_test_dataset_labelled['Label'].tolist()

In [18]:
# Convert the elements in the list to integers, handling non-integer values
for idx,label in enumerate(english_test_dataset_labelled_labels):
    try:
        english_test_dataset_labelled_labels[idx] = int(label)
    except ValueError:
        # Handle the case where the label is not an integer
        english_test_dataset_labelled_labels[idx] = None

In [19]:
# Assess how many labelled comments we have
print("We have ", len(english_test_dataset_labelled_comments), " labelled comments in our dataset.")

# Check how many negative (-1), neutral (0) and positive (1) comments we have
print("We have ", english_test_dataset_labelled_labels.count(-1), " negative comments.")
print("We have ", english_test_dataset_labelled_labels.count(0), " neutral comments.")
print("We have ", english_test_dataset_labelled_labels.count(1), " positive comments.")

# Assert that the number of comments and labels are the same
assert len(english_test_dataset_labelled_comments) == len(english_test_dataset_labelled_labels), "The number of comments and labels are not the same."

We have  3418  labelled comments in our dataset.
We have  1385  negative comments.
We have  891  neutral comments.
We have  1128  positive comments.


In [20]:
# Even out the datasets

# Find the minimum number of comments in a category
min_comments = min(english_test_dataset_labelled_labels.count(-1), english_test_dataset_labelled_labels.count(0), english_test_dataset_labelled_labels.count(1))

# Seperate the 3 categories
negative_comments = []
neutral_comments = []
positive_comments = []

for i in range(len(english_test_dataset_labelled_labels)):
    if english_test_dataset_labelled_labels[i] == -1:
        negative_comments.append(english_test_dataset_labelled_comments[i])
    elif english_test_dataset_labelled_labels[i] == 0:
        neutral_comments.append(english_test_dataset_labelled_comments[i])
    elif english_test_dataset_labelled_labels[i] == 1:
        positive_comments.append(english_test_dataset_labelled_comments[i])

negative_comments = negative_comments[:min_comments]
neutral_comments = neutral_comments[:min_comments]
positive_comments = positive_comments[:min_comments]

In [21]:
# Now that we have evened out the dataset, we can concatenate the lists
comments = negative_comments + neutral_comments + positive_comments

# Create the labels for the evened out dataset
labels = [0]*min_comments + [1]*min_comments + [2]*min_comments

In [22]:
# Turn all elements in comments into strings
comments = [str(comment) for comment in comments]

# Check that all values in comments are strings
for comment in comments:
    assert type(comment) == str, "All comments should be strings."

In [23]:
# Split the data into training and validation sets with stratification
train_comments, val_comments, train_labels, val_labels = train_test_split(
    comments, labels, test_size=0.2, random_state=42, stratify=labels
)

In [24]:
print("We have {} training comments".format(len(train_comments)))
print("We have {} validation comments".format(len(val_comments)))

# Check that the classes are evenly distributed across training and validation sets
print("Training set:")
print("Negative comments:", train_labels.count(0))
print("Neutral comments:", train_labels.count(1))
print("Positive comments:", train_labels.count(2))
print("Validation set:")
print("Negative comments:", val_labels.count(0))
print("Neutral comments:", val_labels.count(1))
print("Positive comments:", val_labels.count(2))

We have 2138 training comments
We have 535 validation comments
Training set:
Negative comments: 713
Neutral comments: 713
Positive comments: 712
Validation set:
Negative comments: 178
Neutral comments: 178
Positive comments: 179


In [30]:
# Using our evened out dataset, we can start applying the model
model_trained, tokenizer_trained = DistilBertModel(train_comments, train_labels, val_comments, val_labels, batch_size_train = 16, batch_size_val = 16, num_labels = 3, epochs = 5, tokenizer = tokenizer)
save_model(model_trained, tokenizer_trained, path + "sentiment_model_fine_tuned_distilbert_english")

Map:   0%|          | 0/2138 [00:00<?, ? examples/s]

Map:   0%|          | 0/535 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
10,1.888,1.978658
20,1.8901,1.845963
30,1.5401,1.665363
40,1.3959,1.486136
50,1.2361,1.34567
60,1.1905,1.256159
70,1.1691,1.199507
80,1.1439,1.16109
90,1.1002,1.149904
100,1.0815,1.134592


In [35]:
# Save the fine-tuned model to your system

# Mount to drive
drive.mount('/content/drive')

!cp -r /content/sentiment_model_fine_tuned_distilbert_english /content/drive/MyDrive/

# Now download it from your Google Drive Account !

Mounted at /content/drive
