<a href="https://colab.research.google.com/github/ferragina/MyInformationRetrieval/blob/main/TFIDF_vs_DNN_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

from datasets import load_dataset

# Load the "train" and "test" splits of the IMDB dataset, one load_dataset
# call per split, in that order.
train_dataset, test_dataset = (
    load_dataset("imdb", split=split_name) for split_name in ("train", "test")
)

In [None]:
import torch
from transformers import AutoTokenizer, RwkvModel

# Instantiate the tokenizer and model from the same pretrained checkpoint.
# Both objects are module-level globals that generate_embeddings() reads.

# Checkpoint identifier handed to both from_pretrained() calls below, so the
# tokenizer and the model are always loaded from the same checkpoint.
model_name = "RWKV/rwkv-4-169m-pile"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# RwkvModel is used for its `last_hidden_state` output (see generate_embeddings).
model = RwkvModel.from_pretrained(model_name)

def generate_embeddings(dataset, limit=65):
  """Compute model hidden states for the first rows of a labelled text dataset.

  Each row's "text" field is tokenized with the module-level `tokenizer` and
  run through the module-level `model` with gradient tracking disabled.

  Args:
    dataset: Sized iterable of rows supporting row["text"] and row["label"].
    limit: Maximum number of rows to embed, counted from the start of
      `dataset`. `None` embeds every row. The default of 65 is exactly the
      number of rows the previous hard-coded `n > 64` cut-off processed.

  Returns:
    A list with one dict per processed row, in dataset order:
      "embedding": the model's `last_hidden_state` for that row (callers in
        this file index it as [0, -1, :], i.e. they assume a
        (batch=1, tokens, hidden) layout -- TODO confirm for other models),
      "label": the row's "label" value, stored next to the embedding so that
        features and labels can always be paired up row by row.
  """
  # Report progress against the number of rows that will actually be embedded,
  # not against the size of the whole dataset.
  total = len(dataset) if limit is None else min(limit, len(dataset))

  embeddings = []
  for n, row in enumerate(dataset):
    if limit is not None and n >= limit:  # cap the number of created embeddings
      break
    if n % 5 == 0:
      print(f"{n}/{total}")
    inputs = tokenizer(row["text"], return_tensors="pt")
    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
      outputs = model(**inputs)
    embeddings.append({"embedding": outputs.last_hidden_state, "label": row["label"]})
  return embeddings

# generate train and test embeddings
# generate_embeddings() only processes a prefix of the dataset it is given.
# The IMDB splits appear to be stored grouped by label (TODO confirm), so an
# unshuffled prefix would contain a single class: the SVM could not be trained
# and the test accuracy would be measured on one class only. Shuffle first,
# with a fixed seed so the sampled rows are reproducible across runs.
train_embeddings = generate_embeddings(train_dataset.shuffle(seed=42))
test_embeddings = generate_embeddings(test_dataset.shuffle(seed=42))

In [None]:
from sklearn.svm import SVC

# train a linear SVM classifier using the computed embeddings and the labelled training data
classifier = SVC(kernel="linear")
# One feature vector per embedded row: the hidden state at the last token
# position ([0, -1, :] = first batch item, last position, all features).
# Convert to NumPy explicitly, because scikit-learn works on array-likes;
# detach()/cpu() keep the conversion valid even if the model was run on
# another device or with gradients enabled.
X = [row["embedding"][0, -1, :].detach().cpu().numpy() for row in train_embeddings]

In [None]:
# X holds one vector per *embedded* row, and generate_embeddings() stops early,
# so the full train_dataset["label"] column does not have the same length.
# Take the labels stored alongside each embedding so features and targets
# line up one-to-one.
classifier.fit(X, [row["label"] for row in train_embeddings])

In [None]:
from sklearn.metrics import accuracy_score

# predict on the test set (last-token hidden state of every embedded test row)
test_features = [row["embedding"][0, -1, :].detach().cpu().numpy() for row in test_embeddings]
test_predictions = classifier.predict(test_features)

# Score the classifier as the fraction of correct predictions.
# Only the rows that were actually embedded have predictions, so compare
# against the labels stored with those embeddings rather than the full
# test_dataset["label"] column, whose length would not match.
test_labels = [row["label"] for row in test_embeddings]
accuracy = accuracy_score(test_labels, test_predictions)
print(f"accuracy: {accuracy}")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Baseline: TF-IDF features with max_features=10000.
# The vectorizer is fitted on the training texts only; the test texts are
# then projected with that same fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=10000)
train_vectors = vectorizer.fit_transform(train_dataset["text"])
test_vectors = vectorizer.transform(test_dataset["text"])

# Same recipe as for the embeddings: linear SVM -> predict -> accuracy.
# Note: this rebinds `classifier`, `test_predictions` and `accuracy`.
classifier = SVC(kernel="linear").fit(train_vectors, train_dataset["label"])
test_predictions = classifier.predict(test_vectors)
accuracy = accuracy_score(test_dataset["label"], test_predictions)
print(f"accuracy: {accuracy}")