# Introducción a LanceDB
Exploraremos el uso de LanceDB para manejar grandes conjuntos de datos y realizar operaciones eficientes.

## Librerias

In [47]:
import lancedb
import pyarrow as pa
import pandas as pd
from datasets import load_dataset
from transformers import GPT2Tokenizer

## Extraer y embeddear los lyrics de las canciones
El dataset se encuentra en: https://huggingface.co/datasets/amishshah/song_lyrics?library=true

In [48]:
# Cargar el dataset de Hugging Face
dataset = load_dataset("amishshah/song_lyrics", split='train', streaming=True)
dataset = dataset.take(10000)

In [49]:
titles = []
lyrics = []

# Iterate over the dataset and collect data
for item in dataset:
    titles.append(item['title'])
    lyrics.append(item['lyrics'])

# Create a DataFrame
df = pd.DataFrame({'title': titles, 'lyrics': lyrics})

# Display the DataFrame
df.head()

Unnamed: 0,title,lyrics
0,Killa Cam,"[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki..."
1,Can I Live,"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,..."
2,Forgive Me Father,Maybe cause I'm eatin\nAnd these bastards fien...
3,Down and Out,[Produced by Kanye West and Brian Miller]\n\n[...
4,Fly In,"[Intro]\nSo they ask me\n""Young boy\nWhat you ..."


El modelo con su tokenizer se encuentra en: https://huggingface.co/openai-community/gpt2. Es importante notar que manejamos los vectores para que todos sean del mismo tamaño (2000 elementos). Si a alguno le falta, lo rellenamos con 0. 

In [50]:
# Cargar el tokenizer del modelo openai-community/gpt2 de Huggingface
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Embeddear las letras de las canciones
tokenized_lyrics = df['lyrics'].apply(lambda x: tokenizer(x)["input_ids"])

tokenized_lyrics = tokenized_lyrics.apply(lambda x: x[:2000] + [0] * (2000 - len(x)) if len(x) < 2000 else x[:2000])

# Create a new DataFrame with 'track_name' and the tokenized lyrics
df_tokenized = pd.DataFrame({'vector': tokenized_lyrics, 'name': df['title']})

df_tokenized.head()

Token indices sequence length is longer than the specified maximum sequence length for this model (1417 > 1024). Running this sequence through the model will result in indexing errors


Unnamed: 0,vector,name
0,"[58, 1925, 15125, 25, 26049, 6542, 1222, 7298,...",Killa Cam
1,"[58, 11547, 771, 416, 5686, 85, 402, 26380, 60...",Can I Live
2,"[13300, 2728, 314, 1101, 4483, 259, 198, 1870,...",Forgive Me Father
3,"[58, 11547, 771, 416, 27775, 2688, 290, 8403, ...",Down and Out
4,"[58, 5317, 305, 60, 198, 2396, 484, 1265, 502,...",Fly In


Un ejemplo de un embedding de una canción y cómo podemos transformarlo al string nuevamente.  

In [51]:
# Obtener el embedding de la canción "Fake Plastic Trees"
embedding_fpt = df_tokenized[df_tokenized['name'] == 'Fake Plastic Trees']['vector'].values[0]
print("Embedding de la canción 'Fake Plastic Trees':", embedding_fpt, "\n")

# Generar el string a partir del embedding
original_string = tokenizer.decode(embedding_fpt, skip_special_tokens=True)
print("String generado a partir del embedding:", original_string)

Embedding de la canción 'Fake Plastic Trees': [58, 13414, 325, 352, 60, 198, 32, 4077, 7309, 41737, 460, 198, 1890, 257, 8390, 3999, 14239, 4618, 198, 818, 257, 8390, 7309, 4534, 198, 2504, 673, 5839, 422, 257, 14239, 582, 198, 818, 257, 3240, 1336, 286, 14239, 6134, 198, 2514, 651, 5755, 286, 2346, 198, 198, 58, 1925, 15125, 352, 60, 198, 1026, 17326, 607, 503, 198, 1026, 17326, 607, 503, 198, 1026, 17326, 607, 503, 198, 1026, 17326, 607, 503, 198, 198, 58, 13414, 325, 362, 60, 198, 3347, 3160, 351, 257, 5445, 582, 198, 32, 21368, 7514, 34365, 25924, 582, 198, 8241, 655, 1067, 25329, 290, 20246, 198, 1544, 973, 284, 466, 8185, 198, 1890, 4813, 287, 262, 3624, 444, 198, 1537, 13522, 1464, 7864, 198, 58, 1925, 15125, 362, 60, 198, 1870, 340, 17326, 683, 503, 198, 1026, 17326, 683, 503, 198, 1026, 17326, 683, 503, 198, 1026, 17326, 198, 198, 58, 13414, 325, 513, 60, 198, 3347, 3073, 588, 262, 1103, 1517, 198, 3347, 18221, 588, 262, 1103, 1517, 198, 3666, 8390, 7309, 1842, 198, 1537, 314,

## Meterlo a LanceDB

In [52]:
# Nos conectamos a una base de datos local
db = lancedb.connect("./.lancedb")
# Creamos una tabla en la base de datos
db.create_table("tabla", df_tokenized)
db["tabla"].head()

pyarrow.Table
vector: fixed_size_list<item: float>[2000]
  child 0, item: float
name: string
----
vector: [[[58,1925,15125,25,26049,...,0,0,0,0,0],[58,11547,771,416,5686,...,0,0,0,0,0],[13300,2728,314,1101,4483,...,0,0,0,0,0],[58,11547,771,416,27775,...,0,0,0,0,0],[58,5317,305,60,198,...,0,0,0,0,0]]]
name: [["Killa Cam","Can I Live","Forgive Me Father","Down and Out","Fly In"]]

### Queries
https://lancedb.github.io/lancedb/python/python/#lancedb.query.LanceVectorQueryBuilder

In [53]:
nueva_cancion = """
[Verse 1]
A green plastic watering can
For a fake Chinese rubber plant
In a fake plastic earth
That she bought from a rubber man
In a town full of rubber plants
To get rid of itself

[Chorus]
It wears her out
It wears her out
It wears her out
It wears her out

[Verse 2]
She lives with a broken man
A cracked polystyrene man
Who just crumbles and burns
He used to do surgery
For girls in the eighties
But gravity always wins

[Chorus]
And it wears him out
It wears him out
It wears him out
It wears

[Verse 3]
She looks like the real thing
She tastes like the real thing
My fake plastic love
But I can't help the feeling
I could blow through the ceiling
If I just turn and run

[Chorus]
And it wears me out
It wears me out
It wears me out
It wears me out

[Outro]
And if I could be who you wanted
If I could be who you wanted
All the time
All the time
"""

nueva_cancion_embedded = tokenizer(nueva_cancion)["input_ids"]

In [54]:
n = 2000
def ajustar_vector(input, n):
    output = input[:n]
    
    # Si la lista es más corta que el tamaño objetivo, rellenar con 0.0
    while len(output) < n:
        output.append(0)
    
    return output
nueva_cancion_embedded = ajustar_vector(nueva_cancion_embedded, n)

In [55]:
(db["tabla"].search(nueva_cancion_embedded)
    .metric("L2")
    .where("name != 'Fake Plastic Trees'")
    .select(["name", "vector"])
    .limit(10)
    .to_pandas())

Unnamed: 0,name,vector,_distance
0,Fake Plastic Trees,"[58.0, 13414.0, 325.0, 352.0, 60.0, 198.0, 32....",7698257000.0
1,Be Free,"[40.0, 1337.0, 546.0, 1842.0, 198.0, 1870.0, 7...",11623640000.0
2,Angry World,"[4366.0, 766.0, 1204.0, 355.0, 257.0, 5445.0, ...",11846920000.0
3,Winners,"[40.0, 11.0, 314.0, 1276.0, 307.0, 11.0, 314.0...",11913150000.0
4,Agitation,"[58.0, 5317.0, 305.0, 60.0, 198.0, 5812.0, 12....",12107570000.0
5,Song for Junior,"[10449.0, 345.0, 845.0, 881.0, 198.0, 198.0, 4...",12152910000.0
6,Serenity Prayer,"[13482.0, 11.0, 7264.0, 502.0, 262.0, 384.0, 9...",12321870000.0
7,Song for the Man,"[40.0, 836.0, 470.0, 588.0, 534.0, 9408.0, 11....",12474710000.0
8,Me You Album Version,"[8491.0, 345.0, 3492.0, 11.0, 466.0, 345.0, 76...",12528620000.0
9,The Biz vs. The Nuge,"[3198.0, 11.0, 734.0, 11.0, 1115.0, 986.0, 198...",12674170000.0
