# TF-IDF Testing

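Here we explore the TF-IDF implementation in scikit-learn (`TfidfVectorizer`): we fit it on the French side of the training corpus, rank the vocabulary by weight, and look at how the ranked words are tokenised by two translation models.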

In [1]:
from datasets import load_dataset, Dataset

def convertToDictFormat(data):
    """Build an en/fr ``Dataset`` from tab-separated sentence pairs.

    Each item of ``data`` is a string of the form "source<TAB>target".
    Surrounding whitespace (e.g. a trailing newline) is stripped before
    the line is split on tabs.

    Returns a ``Dataset`` with two parallel columns: "en" holds the first
    tab-separated field of every line and "fr" holds the second. Any
    further fields are ignored. Raises ``IndexError`` if a stripped line
    contains no tab.
    """
    pairs = [line.strip().split("\t") for line in data]
    english = [fields[0] for fields in pairs]
    french = [fields[1] for fields in pairs]
    return Dataset.from_dict({"en": english, "fr": french})

In [2]:
# Fetch the processed training corpus from the Hugging Face Hub, then turn
# its raw 'text' lines into parallel "en" / "fr" columns.
train_data = load_dataset(
    "ethansimrm/wmt_16_19_22_biomed_train_processed",
    split="train",
)
train_data_ready = convertToDictFormat(train_data["text"])

Found cached dataset text (C:/Users/ethan/.cache/huggingface/datasets/ethansimrm___text/ethansimrm--wmt_16_19_22_biomed_train_processed-8662b34233d7661e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

In [8]:
# TF-IDF is computed over the "fr" column only (the target side of each pair).
corpus = train_data_ready['fr']

In [12]:
# spaCy's French stop words are passed to the vectoriser as a plain list.
vectorizer = TfidfVectorizer(
    stop_words=list(fr_stop),
)
# Learn the vocabulary and IDF weights from the corpus and build the
# document-term matrix in one pass.
word_table = vectorizer.fit_transform(corpus)

In [17]:
# Dimensions of the fitted matrix: (number of documents, vocabulary size).
print(word_table.shape)

(649192, 174470)


In [30]:
import numpy as np
# Vocabulary terms, in the vectoriser's feature (column) order.
words = np.array(vectorizer.get_feature_names_out())
# One IDF value per vocabulary term, aligned with `words`. Note these are
# corpus-level inverse-document-frequency weights, not per-document TF-IDF scores.
weights = np.array(vectorizer.idf_)

In [33]:
import pandas as pd
# Pair every vocabulary term with its weight, one row per term.
words_and_weights = pd.DataFrame({"word":words, "weight":weights})

In [37]:
# Rank terms from highest to lowest weight and renumber the rows.
# A higher IDF weight means the term appears in fewer documents; this
# notebook treats higher-weighted terms as the better ones.
words_and_weights_sorted = (
    words_and_weights
    .sort_values(by="weight", ascending=False)
    .reset_index(drop=True)
)

In [39]:
# Write one "word<TAB>weight" line per term, with no header row and no index column.
words_and_weights_sorted.to_csv("tfidf_words_and_weights.txt", sep = "\t", index = False, header = False)

In [40]:
def extractTFIDFWords(data):
    """Return the word column from "word<TAB>weight" lines.

    Each item of ``data`` is stripped of surrounding whitespace and split
    on tabs; only the first field (the word) is kept and the weight is
    discarded. The result is a list of words in the same order as the
    input lines. A line without a tab is returned whole (after stripping).
    """
    return [line.strip().split("\t")[0] for line in data]

In [41]:
# Load the ranked "word<TAB>weight" lines from the Hugging Face Hub and keep
# only the words.
# NOTE(review): presumably this Hub dataset is an uploaded copy of the
# tfidf_words_and_weights.txt file written earlier; the upload step is not
# part of this notebook, so confirm before relying on it.
tfidf_words = load_dataset(
    "ethansimrm/train_tfidf_words_and_weights",
    split="train",
)
tfidf_words_ready = extractTFIDFWords(tfidf_words["text"])

Downloading and preparing dataset text/ethansimrm--train_tfidf_words_and_weights to C:/Users/ethan/.cache/huggingface/datasets/ethansimrm___text/ethansimrm--train_tfidf_words_and_weights-a681a8b40b589f3e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/5.20M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to C:/Users/ethan/.cache/huggingface/datasets/ethansimrm___text/ethansimrm--train_tfidf_words_and_weights-a681a8b40b589f3e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


In [44]:
from transformers import AutoTokenizer
# Tokenisers of the two Helsinki-NLP en-fr checkpoints compared below.
base_tok = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
big_tok = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-fr")
# Token ids to exclude when collecting tokens for each tokeniser.
# NOTE(review): presumably these are special-token ids (e.g. EOS/UNK/PAD).
# They are hard-coded, so verify them against each tokeniser's vocabulary
# (e.g. `all_special_ids`) if the checkpoints change.
SPECIAL_BASE_TOKENS = [0, 1, 59513]
SPECIAL_BIG_TOKENS = [43311, 50387, 43312, 53016] 

In [53]:
def getOrderedTokenList(words, model_tokeniser, special_tokens):
    """Collect the distinct token ids produced for ``words``, in first-seen order.

    Parameters
    ----------
    words : iterable of str
        Words to tokenise, visited in order.
    model_tokeniser : callable
        Called as ``model_tokeniser(text_target=word)``; the result must
        support ``["input_ids"]`` and yield an iterable of hashable token ids.
    special_tokens : iterable
        Token ids to leave out of the result. Any iterable is accepted
        (list, tuple, set, or a one-shot iterator).

    Returns
    -------
    list
        Every non-excluded token id exactly once, ordered by the position
        of its first occurrence across ``words``.
    """
    # Freeze the exclusions once: O(1) lookups inside the loop, and a
    # one-shot iterator is not exhausted by the first membership test.
    excluded = frozenset(special_tokens)
    # A dict keeps insertion order, so it serves as an ordered set here.
    first_seen = {}
    for word in words:
        for tok in model_tokeniser(text_target=word)["input_ids"]:
            if tok not in excluded:
                first_seen.setdefault(tok, None)
    return list(first_seen)

In [54]:
# Distinct base-model token ids for the ranked words, in first-seen order.
getOrderedTokenList(
    words=tfidf_words_ready,
    model_tokeniser=base_tok,
    special_tokens=SPECIAL_BASE_TOKENS,
)

[49,
 4980,
 5805,
 22843,
 1354,
 376,
 4505,
 9997,
 357,
 7278,
 12607,
 247,
 962,
 51,
 27775,
 26381,
 607,
 114,
 14710,
 3755,
 26001,
 2444,
 8407,
 24214,
 6790,
 30385,
 124,
 12346,
 23020,
 122,
 4232,
 7823,
 1064,
 35070,
 2073,
 111,
 26624,
 21769,
 23277,
 8551,
 3275,
 25464,
 15986,
 16702,
 9,
 11875,
 1163,
 2913,
 253,
 3491,
 15508,
 63,
 8310,
 6605,
 539,
 395,
 816,
 673,
 13196,
 2470,
 5517,
 190,
 7620,
 2501,
 108,
 5966,
 20813,
 7963,
 8030,
 5783,
 31742,
 26329,
 22265,
 16066,
 10059,
 16800,
 5086,
 4952,
 2049,
 4582,
 6411,
 23809,
 23340,
 17121,
 1098,
 75,
 7574,
 756,
 93,
 6161,
 23538,
 162,
 3805,
 1054,
 597,
 10814,
 26156,
 670,
 623,
 139,
 966,
 5612,
 1744,
 1629,
 3553,
 11130,
 1941,
 12729,
 222,
 10360,
 243,
 263,
 1295,
 4529,
 3498,
 32068,
 313,
 8468,
 3905,
 10819,
 3394,
 8619,
 1662,
 3103,
 18487,
 16512,
 12868,
 5919,
 11969,
 7101,
 1686,
 21469,
 20941,
 6981,
 22641,
 4189,
 4077,
 1597,
 3456,
 18297,
 7128,
 514,
 

In [55]:
# Distinct big-model token ids for the ranked words, in first-seen order.
getOrderedTokenList(
    words=tfidf_words_ready,
    model_tokeniser=big_tok,
    special_tokens=SPECIAL_BIG_TOKENS,
)

[104,
 52916,
 8081,
 50680,
 51929,
 28664,
 22474,
 27563,
 35569,
 17117,
 26265,
 34022,
 19741,
 17928,
 25312,
 16795,
 17136,
 18480,
 36348,
 48720,
 5857,
 5847,
 1893,
 42827,
 48791,
 17131,
 3298,
 22366,
 27595,
 17435,
 34646,
 16894,
 50671,
 43310,
 13770,
 16661,
 13837,
 9339,
 39544,
 17241,
 17122,
 34039,
 40028,
 25072,
 8580,
 18735,
 18435,
 9354,
 3594,
 3537,
 26595,
 36970,
 19951,
 19963,
 34713,
 3491,
 3395,
 36310,
 11041,
 18999,
 19957,
 48651,
 17692,
 48699,
 5244,
 48703,
 18883,
 5805,
 25174,
 18008,
 49611,
 18446,
 40360,
 26503,
 14983,
 19948,
 30244,
 22930,
 13865,
 8624,
 25423,
 21972,
 16224,
 16344,
 48722,
 13647,
 13642,
 19005,
 39519,
 10045,
 16562,
 40001,
 50129,
 6077,
 21792,
 26263,
 19214,
 22533,
 17143,
 5506,
 52714,
 13290,
 47630,
 17341,
 49918,
 39997,
 330,
 1646,
 47546,
 23141,
 7569,
 31417,
 25058,
 17665,
 16803,
 48710,
 19301,
 33907,
 23696,
 29030,
 49003,
 25851,
 3469,
 47542,
 42660,
 15416,
 34185,
 34212,
