In [1]:
import lancedb
import pyarrow as pa
import pandas as pd
from datasets import load_dataset
from transformers import GPT2Tokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Para este ejercicio descargamos una base de datos de mensajes de texto en inglés.

In [2]:
# Load the English text-message dataset used throughout this exercise.
dataset = load_dataset(path="chirunder/text_messages")

In [3]:
# Materialize the train split as a DataFrame and rename the text column
# to its Spanish label in one chained expression.
df = pd.DataFrame(dataset["train"]).rename(columns={"text": "texto"})
df.head()

Unnamed: 0,texto
0,Top right I gained a little speed with the add...
1,They are heavier wheels though as are all the ...
2,Federally registering a trademark is more than...
3,I'll have to jog my memory from rooting a few ...
4,Unless you can afford to buy all new larger cl...


Para los siguientes ejercicios, voy a crear una variable con el número de palabras de cada mensaje de texto.

In [4]:
# Whitespace-delimited word count per message, placed ahead of the text column.
conteo_palabras = [len(str(mensaje).split()) for mensaje in df["texto"]]
df = df.assign(n=conteo_palabras)[["n", "texto"]]
df.head()

Unnamed: 0,n,texto
0,13,Top right I gained a little speed with the add...
1,14,They are heavier wheels though as are all the ...
2,9,Federally registering a trademark is more than...
3,21,I'll have to jog my memory from rooting a few ...
4,10,Unless you can afford to buy all new larger cl...


## Task 1:
A partir del dataframe df, crea df_tokenized (usando el Tokenizer de GPT2) con dos columnas, pero con el texto tokenizado. Asegúrate de que todos los embeddings tengan la misma longitud y de que los tokens sean del mismo tipo numérico (todos enteros o todos doubles).

In [5]:
# Keep only the first 5000 messages.
df = df.iloc[:5000]

In [13]:
MAX_TOKENS = 300   # fixed vector length stored for every message
PAD_TOKEN_ID = 0   # padding value; note that id 0 decodes to "!" with this tokenizer


def _pad_or_truncate(ids, length=MAX_TOKENS, pad_id=PAD_TOKEN_ID):
    """Return `ids` cut to `length` items, right-padded with `pad_id` if shorter."""
    recortado = ids[:length]
    return recortado + [pad_id] * (length - len(recortado))


tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Encode each message to token ids and force every vector to the same length.
tokenized_texts = df['texto'].apply(lambda x: _pad_or_truncate(tokenizer(x)["input_ids"]))
# One row per message: fixed-length token-id vector, original text, word count.
df_tokenized = pd.DataFrame({'vector': tokenized_texts, 'name': df['texto'], 'n': df['n']})
df_tokenized.head()



Unnamed: 0,vector,name,n
0,"[9126, 826, 314, 8618, 257, 1310, 2866, 351, 2...",Top right I gained a little speed with the add...,13
1,"[2990, 389, 20140, 13666, 996, 355, 389, 477, ...",They are heavier wheels though as are all the ...,14
2,"[37, 5702, 453, 28336, 257, 16028, 318, 517, 6...",Federally registering a trademark is more than...,9
3,"[40, 1183, 423, 284, 48342, 616, 4088, 422, 40...",I'll have to jog my memory from rooting a few ...,21
4,"[28042, 345, 460, 5368, 284, 2822, 477, 649, 4...",Unless you can afford to buy all new larger cl...,10


## Task 2:
Mete el dataframe a una tabla en una base de datos de LanceDB.

In [15]:
# Connect to a local LanceDB database stored under ./.lancedb.
db = lancedb.connect("./.lancedb")
# The database lives on disk, so "tabla1" survives kernel restarts.
# mode="overwrite" replaces any table left by an earlier run, which lets this
# cell be re-executed (Restart & Run All) without a "table already exists" error.
db.create_table("tabla1", df_tokenized, mode="overwrite")
db["tabla1"].head()

pyarrow.Table
vector: fixed_size_list<item: float>[300]
  child 0, item: float
name: string
n: int64
----
vector: [[[9126,826,314,8618,257,...,0,0,0,0,0],[2990,389,20140,13666,996,...,0,0,0,0,0],[37,5702,453,28336,257,...,0,0,0,0,0],[40,1183,423,284,48342,...,0,0,0,0,0],[28042,345,460,5368,284,...,0,0,0,0,0]]]
name: [["Top right I gained a little speed with the addition of IM heads.","They are heavier wheels though as are all the CV concave line from Vossen.","Federally registering a trademark is more than just aesthetics.","I'll have to jog my memory from rooting a few weeks ago, but it was a simple step from that point.","Unless you can afford to buy all new larger clothes!"]]
n: [[13,14,9,21,10]]

In [11]:
# Plain (non-vector) scan: messages longer than 16 characters, at most 11 rows,
# returning only the text and its token vector.
tabla = db["tabla1"]
consulta = (
    tabla.search()
    .where("LENGTH(name) > 16")
    .select(["name", "vector"])
    .limit(11)
)
consulta.to_pandas()

Unnamed: 0,name,vector
0,Top right I gained a little speed with the add...,"[9126.0, 826.0, 314.0, 8618.0, 257.0, 1310.0, ..."
1,They are heavier wheels though as are all the ...,"[2990.0, 389.0, 20140.0, 13666.0, 996.0, 355.0..."
2,Federally registering a trademark is more than...,"[37.0, 5702.0, 453.0, 28336.0, 257.0, 16028.0,..."
3,I'll have to jog my memory from rooting a few ...,"[40.0, 1183.0, 423.0, 284.0, 48342.0, 616.0, 4..."
4,Unless you can afford to buy all new larger cl...,"[28042.0, 345.0, 460.0, 5368.0, 284.0, 2822.0,..."
5,Looking for a site that is legit that other us...,"[15784.0, 329.0, 257.0, 2524.0, 326.0, 318.0, ..."
6,No offence meant but you need to know where to...,"[2949.0, 14148.0, 4001.0, 475.0, 345.0, 761.0,..."
7,They are compact sized.,"[2990.0, 389.0, 16001.0, 19943.0, 13.0, 0.0, 0..."
8,"There certainly are more comfortable vehicles,...","[1858.0, 3729.0, 389.0, 517.0, 6792.0, 5672.0,..."
9,"You write well, and present yourself in an edu...","[1639.0, 3551.0, 880.0, 11.0, 290.0, 1944.0, 3..."


## Task 3:
Haz una query estilo SQL a la tabla de la base de datos. Quiero que escribas la query equivalente y pongas la explicación de lo que está haciendo la consulta. Hint: usa la columna "n". 

In [19]:
# Messages with more than 100 words (text and word count), capped at 15 rows.
(
    db['tabla1']
    .search()
    .where("n>100")
    .select(['name', 'n'])
    .limit(15)
    .to_pandas()
)

Unnamed: 0,name,n
0,they are still available and I found they made...,115


- Query en SQL equivalente:
SELECT name, n
FROM tabla1
WHERE n > 100
LIMIT 15;
- Explicación: Busco los mensajes (y su número de palabras) que tengan más de 100 palabras y limito el resultado a 15 filas como máximo.

## Task 4:
Inventa un mensaje de texto que tú podrías escribirle a un amigo. Tokenízalo y ponlo en el formato adecuado para hacer un vector query. Quiero que me regreses el mensaje más parecido al mensaje que inventaste (OJO: quiero el texto, no el embedding). HINT: Hay que decodificar el resultado del query.

In [20]:
# Invented query message, encoded to token ids with the same tokenizer
# used for the stored messages.
mensaje = "I am going to drink water on Mo"
codificado = tokenizer(mensaje)
mensaje_tokenized = codificado["input_ids"]
mensaje_tokenized

[40, 423, 281, 2814, 9439]

In [23]:
# Force the query to exactly 300 ids, the length of the stored vectors.
# Truncate first so a message longer than 300 tokens cannot yield an over-long
# query vector (padding alone would leave it untouched), then right-pad with zeros.
mensaje_tokenized = mensaje_tokenized[:300]
mensaje_tokenized = mensaje_tokenized + [0] * (300 - len(mensaje_tokenized))
mensaje_tokenized

[40,
 423,
 281,
 2814,
 9439,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 

In [28]:
# Nearest-neighbour lookup: the single stored message closest to the query vector.
busqueda = db["tabla1"].search(mensaje_tokenized)
busqueda = busqueda.metric("cosine")  # "L2" is the other option
busqueda = busqueda.select(["name", "vector"]).limit(1)
df_vector = busqueda.to_pandas()
df_vector

Unnamed: 0,name,vector,_distance
0,I run a Adams piston kit on my gun.,"[40.0, 1057.0, 257.0, 12620.0, 41743.0, 6220.0...",0.012143


In [32]:
# The stored text is already available in the "name" column; decode the
# vector anyway to recover the message from its token ids.
mensaje_cercano_vector = df_vector.loc[0, 'vector']
mensaje_cercano = tokenizer.decode(mensaje_cercano_vector, skip_special_tokens=True)
print("El mensaje más cercano es:", mensaje_cercano)

El mensaje más cercano es: I run a Adams piston kit on my gun.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


In [33]:
# The zero padding decodes to a long run of "!" characters, so it must be removed
# before decoding. Token id 0 is also the real "!" token, so dropping every zero
# would delete genuine exclamation marks inside the message; strip only the
# trailing run of zeros instead.
# NOTE: a message that really ends in "!" is still indistinguishable from padding.
mensaje_filtrado = [int(celda) for celda in mensaje_cercano_vector]  # ids come back as floats
while mensaje_filtrado and mensaje_filtrado[-1] == 0:
    mensaje_filtrado.pop()
mensaje_cercano_filtrado = tokenizer.decode(mensaje_filtrado, skip_special_tokens=True)
print("El mensaje más cercano es:", mensaje_cercano_filtrado)

El mensaje más cercano es: I run a Adams piston kit on my gun.
