<a href="https://colab.research.google.com/github/ferragina/MyInformationRetrieval/blob/main/TFIDF_vs_DNN_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

from datasets import load_dataset

# load the IMDB dataset
train_dataset = load_dataset("imdb", split="train")
test_dataset = load_dataset("imdb", split="test")

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
import torch
from transformers import AutoTokenizer, RwkvModel

# instantiate the tokenizer and model
# Both objects are module-level globals that generate_embeddings() reads.

# Hub identifier shared by the tokenizer and the model so the two always match.
model_name = "RWKV/rwkv-4-169m-pile"
# from_pretrained() fetches the files on first use (see the download output below).
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Bare RWKV model: its output exposes `last_hidden_state`, used as the embedding.
model = RwkvModel.from_pretrained(model_name)

def generate_embeddings(dataset, max_samples=65):
  """Embeds the first rows of `dataset` with the module-level tokenizer/model.

  Args:
    dataset: Sized iterable of rows; every row is a mapping with a "text"
      string and a "label" value.
    max_samples: Maximum number of rows to embed. Defaults to 65, the limit
      that used to be hard-coded. Pass None to embed every row.

  Returns:
    A list with one dict per processed row, in input order:
      "embedding": the model's `last_hidden_state` for that text, kept whole
        (callers pick the vector they need, e.g. `[0, -1, :]`).
      "label": the row's label, unchanged.
  """
  # Number of rows that will really be processed; also used as the progress
  # denominator so the log no longer claims the whole dataset is being embedded.
  total = len(dataset) if max_samples is None else max(0, min(max_samples, len(dataset)))

  embeddings = []
  for n, row in enumerate(dataset):
    if n >= total: # limit the number of created embeddings
      break
    if n % 5 == 0:
      print(f"{n}/{total}")
    inputs = tokenizer(row["text"], return_tensors="pt")
    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
      outputs = model(**inputs)
    embeddings.append({"embedding": outputs.last_hidden_state, "label": row["label"]})
  return embeddings

# generate train and test embeddings
# generate_embeddings() only embeds a prefix of what it is given, and the IMDB
# splits are stored ordered by label, so an unshuffled prefix holds a single
# class and a classifier cannot be fitted on it. Shuffle with a fixed seed
# first; .shuffle() returns a new dataset, so train_dataset / test_dataset keep
# their original order for the other cells.
train_embeddings = generate_embeddings(train_dataset.shuffle(seed=42))
test_embeddings = generate_embeddings(test_dataset.shuffle(seed=42))

tokenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/521 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/677M [00:00<?, ?B/s]

0/25000
5/25000
10/25000
15/25000
20/25000
25/25000
30/25000
35/25000
40/25000
45/25000
50/25000
55/25000
60/25000
0/25000
5/25000
10/25000
15/25000
20/25000
25/25000
30/25000
35/25000
40/25000
45/25000
50/25000
55/25000
60/25000


In [None]:
from sklearn.svm import SVC

# A linear-kernel SVM, to be trained on the computed embeddings and their labels.
classifier = SVC(kernel="linear")

# One feature vector per review: index [0, -1, :] of the stored hidden states,
# i.e. the first sequence in the batch at its last token position.
X = list(map(lambda item: item["embedding"][0, -1, :], train_embeddings))

In [None]:
# Use the labels stored next to each embedding: X only covers the rows that
# were actually embedded, so passing every label of train_dataset would give
# fit() inconsistent sample counts.
classifier.fit(X, [row["label"] for row in train_embeddings])

In [None]:
from sklearn.metrics import accuracy_score

# predict on the embedded test rows, using the same last-position vector
# that the classifier was trained on
test_features = [row["embedding"][0,-1,:] for row in test_embeddings]
test_predictions = classifier.predict(test_features)

# score the classifier by fraction correct; compare against the labels of the
# rows that were embedded, not the whole test split, so both sequences have
# the same length
embedding_test_labels = [row["label"] for row in test_embeddings]
accuracy = accuracy_score(embedding_test_labels, test_predictions)
print(f"accuracy: {accuracy}")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Baseline features: TF-IDF over a vocabulary capped at 10,000 terms.
vectorizer = TfidfVectorizer(max_features=10000)

# Vocabulary and IDF weights are learned from the training reviews only; the
# test reviews are projected with the already-fitted vectorizer.
train_vectors = vectorizer.fit_transform(train_dataset["text"])
test_vectors = vectorizer.transform(test_dataset["text"])

# Same recipe as for the embeddings: linear SVM -> predict -> accuracy.
# fit() returns the estimator itself, so construction and training are chained.
# NOTE(review): this baseline uses the full splits, while the embedding run
# above only covers a subset of rows, so the two scores are not like-for-like.
classifier = SVC(kernel="linear").fit(train_vectors, train_dataset["label"])
test_predictions = classifier.predict(test_vectors)
accuracy = accuracy_score(y_true=test_dataset["label"], y_pred=test_predictions)
print(f"accuracy: {accuracy}")