<h1><center>How to train a Huggingface Tokenizer + TFIDF + RIDGE</center></h1>     

<center><img src = "https://i.imgur.com/iRX7hwu.png" width = "1000" height = "400"/></center>           

This notebook was inspired by the following two notebooks:
* https://www.kaggle.com/vitaleey/tfidf-ridge
* https://www.kaggle.com/pablorosa01/naive-bayes-modeling-base-line

<h3 style='background:orange; color:black'><center>Consider upvoting this notebook if you found it helpful.</center></h3>

# Imports

In [1]:
import os
import re

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

## Load Datasets

In [2]:
# Directory holding the Jigsaw CSV files. The default is the original local
# folder; set the JIGSAW_DATA_DIR environment variable to point the notebook at
# another copy of the data without editing the code.
DATA_DIR = os.environ.get("JIGSAW_DATA_DIR", "/home/albert/data/jigsaw")

TRAIN_DATA_PATH = os.path.join(DATA_DIR, "train.csv")
VALID_DATA_PATH = os.path.join(DATA_DIR, "validation_data.csv")
TEST_DATA_PATH = os.path.join(DATA_DIR, "comments_to_score.csv")

In [3]:
# Read the three CSV tables (training comments, validation pairs and the
# comments to score) in one pass over their paths.
df_train, df_valid, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_DATA_PATH, VALID_DATA_PATH, TEST_DATA_PATH)
)

# Preview the first rows of the training table.
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Scoring training data

In [4]:
# Weight applied to each label column before averaging into a single score.
cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5, 
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

# Rebuild every weighted column from its "label is set" indicator instead of
# scaling the stored value in place. On the first run this equals
# `label * weight`, but re-running the cell no longer compounds the weights
# (0.32 -> 0.1024 -> ...), which would silently corrupt `score` and `y`.
# NOTE(review): assumes the raw labels are binary 0/1 without NaNs, as the
# preview of the training table suggests - confirm before using graded labels.
for category, weight in cat_mtpl.items():
    df_train[category] = (df_train[category] != 0).astype(float) * weight

# Row-wise mean over the label columns from 'toxic' through 'identity_hate'.
df_train['score'] = df_train.loc[:, 'toxic':'identity_hate'].mean(axis=1)

# Regression target used by the model cells below.
df_train['y'] = df_train['score']

# Balance the data: keep every comment with a positive score and draw an
# equally sized, seeded random sample of the zero-score comments.
min_len = (df_train['y'] > 0).sum()  # len of toxic comments
df_y0_undersample = df_train[df_train['y'] == 0].sample(n=min_len, random_state=41)  # take non toxic comments
df_train_new = pd.concat([df_train[df_train['y'] > 0], df_y0_undersample])  # make new df
df_train_new

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,score,y
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,0.32,1.5,0.16,0.0,0.64,0.0,0.436667,0.436667
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,0.32,0.0,0.00,0.0,0.00,0.0,0.053333,0.053333
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",0.32,0.0,0.00,0.0,0.00,0.0,0.053333,0.053333
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,0.32,0.0,0.16,0.0,0.64,1.5,0.436667,0.436667
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",0.32,0.0,0.16,0.0,0.64,0.0,0.186667,0.186667
...,...,...,...,...,...,...,...,...,...,...
137462,df957e1303ef8c68,"""\n\nRead more\nYou can read more by going to ...",0.00,0.0,0.00,0.0,0.00,0.0,0.000000,0.000000
82635,dd0b9714b24c3c66,Sounds like a deal. Thanks for hearing me out.,0.00,0.0,0.00,0.0,0.00,0.0,0.000000,0.000000
149206,58e756534c67c8e7,Bone to pick \n\nWhat makes you the rulemaker ...,0.00,0.0,0.00,0.0,0.00,0.0,0.000000,0.000000
87807,eade5e302541c3a9,"""\n\n Semi-vegetarianism section \n\nI like Fl...",0.00,0.0,0.00,0.0,0.00,0.0,0.000000,0.000000


# Train the tokenizer

In [17]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

# Trainer configuration: target vocabulary size and the special tokens it is given.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(
    vocab_size=25000,
    special_tokens=special_tokens,
)

# Untrained WordPiece tokenizer with BERT normalization (lower-casing enabled)
# and BERT pre-tokenization.
raw_tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
raw_tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
raw_tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

In [26]:
from datasets import Dataset

# Wrap the comment column in a `datasets.Dataset` so it can be sliced in batches.
dataset = Dataset.from_pandas(df_train_new[['comment_text']])

def get_training_corpus(batch_size=1000):
    """Yield the training comments in consecutive batches.

    Args:
        batch_size: Number of comments per yielded batch. Defaults to 1000,
            the value that used to be hard-coded.

    Yields:
        The ``"comment_text"`` values of ``dataset[start : start + batch_size]``
        for each batch start; the last batch may be shorter.
    """
    for start in range(0, len(dataset), batch_size):
        yield dataset[start : start + batch_size]["comment_text"]

In [27]:
# Train the tokenizer on the batched comment stream using the configured trainer.
raw_tokenizer.train_from_iterator(
    get_training_corpus(),
    trainer=trainer,
)






In [28]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained `tokenizers` object in the transformers fast-tokenizer API,
# declaring which tokens fill the special-token roles.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    **{
        "unk_token": "[UNK]",
        "pad_token": "[PAD]",
        "cls_token": "[CLS]",
        "sep_token": "[SEP]",
        "mask_token": "[MASK]",
    },
)

# Train the Model

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

In [10]:
def dummy_fun(doc):
    """Return ``doc`` unchanged.

    Used as a pass-through hook: it is handed to ``TfidfVectorizer`` as both
    ``tokenizer`` and ``preprocessor`` so that documents which are already
    lists of token ids are consumed as-is.
    """
    return doc

In [35]:
labels = df_train_new['y']
comments = df_train_new['comment_text']
# Encode every comment as a list of token ids with the trained tokenizer.
tokenized_comments = tokenizer(comments.to_list())['input_ids']

# The documents are already tokenized, so the tokenizer and preprocessor hooks
# are identity functions and the regex token pattern is disabled.
vectorizer = TfidfVectorizer(
    analyzer = 'word',
    tokenizer = dummy_fun,
    preprocessor = dummy_fun,
    token_pattern = None)

comments_tr = vectorizer.fit_transform(tokenized_comments)
# Densify only the row being inspected: converting the whole sparse matrix with
# `comments_tr.toarray()` allocates n_comments x vocabulary floats just to show
# one row. The 1:2 slice keeps the result two-dimensional before `[0]`.
comments_tr[1:2].toarray()[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [12]:
# Fit a ridge regression from the TF-IDF features to the `y` targets.
regressor = Ridge(alpha=0.8, random_state=42)
regressor.fit(X=comments_tr, y=labels)

Ridge(alpha=0.8, random_state=42)

# Validation

In [13]:
# Tokenize both sides of every validation pair with the trained tokenizer.
less_toxic_comments = tokenizer(df_valid['less_toxic'].to_list())['input_ids']
more_toxic_comments = tokenizer(df_valid['more_toxic'].to_list())['input_ids']

# Project the token-id lists into the fitted TF-IDF space.
less_toxic = vectorizer.transform(less_toxic_comments)
more_toxic = vectorizer.transform(more_toxic_comments)

# Score each side of the pair with the fitted regressor.
y_pred_less = regressor.predict(less_toxic)
y_pred_more = regressor.predict(more_toxic)

# Fraction of pairs where the "more toxic" comment gets the higher score.
np.mean(y_pred_less < y_pred_more)


0.6684934236747708

* Tokenizer (deberta-v3): 0.6699880430450379
* Tokenizer (trained): 0.6674970107612594
* Tokenizer (trained + dirty): 0.6716819449980072

**Be careful: these results suggest that the 0.86 LB score is not reliable! Use at your own risk!**

# Predictions and writing submission.csv

In [14]:
# Tokenize the comments to score and map them into the fitted TF-IDF space;
# `texts` ends up holding the feature matrix passed to the regressor.
texts = vectorizer.transform(
    tokenizer(df_test['text'].to_list())['input_ids']
)

In [15]:
# Build the submission frame in one step: keep the comment ids and attach the
# model output as `score`. `assign` works on a fresh copy, so this avoids the
# SettingWithCopyWarning raised by adding a column to a column-subset slice,
# and drops the redundant intermediate `prediction` column.
df_test = df_test[['comment_id']].assign(score=regressor.predict(texts))

In [16]:
# Write the submission file without the index column, then display the frame.
df_test.to_csv(path_or_buf='./submission.csv', index=False)
df_test

Unnamed: 0,comment_id,score
0,114890,0.004819
1,732895,0.060385
2,1139051,-0.010972
3,1434512,-0.026002
4,2084821,0.056793
...,...,...
7532,504235362,0.058318
7533,504235566,0.043685
7534,504308177,0.004444
7535,504570375,0.034313
