Imports

In [129]:
import pandas as pd
import numpy as np
import faiss
import requests

In [130]:
df =pd.read_csv("data.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


create the textual representation for the LLM 

In [131]:
def create_textual_rep(row):
    text_rep=f"""
    Type:{row['type']},
    Title:{row['title']},
    Director:{row['director']},
    Description:{row['description']},
    Genres:{row['listed_in']},
    Released:{row['release_year']}
    """
    return text_rep

In [132]:
df['text_rep']=df.apply(create_textual_rep,axis=1)

In [133]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,text_rep
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...","\n Type:Movie,\n Title:Dick Johnson Is D..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...","\n Type:TV Show,\n Title:Blood & Water,\..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,"\n Type:TV Show,\n Title:Ganglands,\n ..."
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...","\n Type:TV Show,\n Title:Jailbirds New O..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,"\n Type:TV Show,\n Title:Kota Factory,\n..."


In [134]:
dim = 4096
index = faiss.IndexFlatL2(dim)
X=np.zeros((len(df['text_rep']),dim),dtype='float32')

In [135]:
print(df['text_rep'])

0       \n    Type:Movie,\n    Title:Dick Johnson Is D...
1       \n    Type:TV Show,\n    Title:Blood & Water,\...
2       \n    Type:TV Show,\n    Title:Ganglands,\n   ...
3       \n    Type:TV Show,\n    Title:Jailbirds New O...
4       \n    Type:TV Show,\n    Title:Kota Factory,\n...
                              ...                        
8802    \n    Type:Movie,\n    Title:Zodiac,\n    Dire...
8803    \n    Type:TV Show,\n    Title:Zombie Dumb,\n ...
8804    \n    Type:Movie,\n    Title:Zombieland,\n    ...
8805    \n    Type:Movie,\n    Title:Zoom,\n    Direct...
8806    \n    Type:Movie,\n    Title:Zubaan,\n    Dire...
Name: text_rep, Length: 8807, dtype: object


In [136]:
# for i, representation in enumerate(df['text_rep']):
#     if i % 20 == 0:
#         print('Processed',str(i),'instances')
#     res = requests.post('http://localhost:11434/api/embeddings',
#         json={
#             'model':'llama2',
#             'prompt': representation
#         }
#     )
#     embedding = res.json()['embedding']
#     X[i]=np.array(embedding)
# index.add(X)

In [137]:
import asyncio
import aiohttp
import nest_asyncio
import numpy as np
from tqdm.notebook import tqdm  # for nice Jupyter progress bars

# Patch the event loop for Jupyter
nest_asyncio.apply()

# Parameters
url = 'http://localhost:11434/api/embeddings'
model = 'llama2'
concurrency_limit = 100  # Tweak based on your system
expected_dim = dim      # Change based on your model output size
X = np.zeros((len(df), expected_dim), dtype=np.float32)

sem = asyncio.Semaphore(concurrency_limit)

async def fetch_embedding(session, i, prompt):
    async with sem:
        for _ in range(3):  # Retry logic
            try:
                async with session.post(url, json={'model': model, 'prompt': prompt}, timeout=10) as resp:
                    if resp.status == 200:
                        data = await resp.json()
                        embedding = data.get('embedding')
                        if embedding:
                            return i, np.array(embedding, dtype=np.float32)
            except Exception:
                await asyncio.sleep(0.5)  # Backoff
    print(f"❌ Failed to get embedding for index {i}")
    return i, None

async def process_all():
    tasks = []
    async with aiohttp.ClientSession() as session:
        for i, prompt in enumerate(df['text_rep']):
            tasks.append(fetch_embedding(session, i, prompt))

        for f in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
            i, emb = await f
            if emb is not None:
                X[i] = emb

    print("✅ Done embedding all.")

# Run the async function in the current event loop
await process_all()



  0%|          | 0/8807 [00:00<?, ?it/s]

❌ Failed to get embedding for index 253
❌ Failed to get embedding for index 4310
❌ Failed to get embedding for index 249
❌ Failed to get embedding for index 5829
❌ Failed to get embedding for index 7545
❌ Failed to get embedding for index 245
❌ Failed to get embedding for index 2098
❌ Failed to get embedding for index 679
❌ Failed to get embedding for index 4403
❌ Failed to get embedding for index 4396
❌ Failed to get embedding for index 5837
❌ Failed to get embedding for index 7496
❌ Failed to get embedding for index 4399
❌ Failed to get embedding for index 5816
❌ Failed to get embedding for index 4391
❌ Failed to get embedding for index 609
❌ Failed to get embedding for index 719
❌ Failed to get embedding for index 7589
❌ Failed to get embedding for index 705
❌ Failed to get embedding for index 5841
❌ Failed to get embedding for index 2060
❌ Failed to get embedding for index 7558
❌ Failed to get embedding for index 702
❌ Failed to get embedding for index 2146
❌ Failed to get embeddin

In [138]:
index.add(X)

In [139]:
print("Index type:", type(index))
print("Number of vectors:", index.ntotal)
print("Vector dimension:", index.d)


Index type: <class 'faiss.swigfaiss_avx2.IndexFlatL2'>
Number of vectors: 8807
Vector dimension: 4096


In [140]:
# Add to your index
faiss.write_index(index, 'index_file.idx')

In [141]:
index=faiss.read_index('index_file.idx')

In [142]:
fav_item=df.iloc[324]
res = requests.post('http://localhost:11434/api/embeddings',
    json={
        'model':'llama2',
        'prompt': fav_item['text_rep']
    }
)

In [143]:
embedding=np.array([res.json()['embedding']],dtype='float32')
D, I =index.search(embedding,5)

In [144]:
I

array([[ 324, 7649, 8596, 1380, 2583]], dtype=int64)

In [145]:
best_matches = np.array(df['text_rep'])[I.flatten()]

In [146]:
print(best_matches)

["\n    Type:TV Show,\n    Title:44 Cats,\n    Director:nan,\n    Description:Paw-esome tales abound when singing furry friends Lampo, Milady, Pilou and Meatball band together.,\n    Genres:Kids' TV,\n    Released:2020\n    "
 "\n    Type:TV Show,\n    Title:Ollie & Moon,\n    Director:nan,\n    Description:Two quirky cats, Ollie and Moon, and their friend Stanley the snail travel the world, solving everyday problems and learning about different cultures.,\n    Genres:Kids' TV,\n    Released:2018\n    "
 "\n    Type:TV Show,\n    Title:Tip the Mouse,\n    Director:nan,\n    Description:In his tiny meadow village, curious little mouse Tip enjoys fun and adventure while learning about the world with his furry friends and family.,\n    Genres:Kids' TV,\n    Released:2014\n    "
 "\n    Type:TV Show,\n    Title:Go Dog Go,\n    Director:nan,\n    Description:Handy and inventive pup Tag chases adventure with her best pal, Scooch, solving problems and helping the citizens of Pawston along the

In [147]:
for match in best_matches:
    print('NEXT MOVIE')
    print(match)
    print()

NEXT MOVIE

    Type:TV Show,
    Title:44 Cats,
    Director:nan,
    Description:Paw-esome tales abound when singing furry friends Lampo, Milady, Pilou and Meatball band together.,
    Genres:Kids' TV,
    Released:2020
    

NEXT MOVIE

    Type:TV Show,
    Title:Ollie & Moon,
    Director:nan,
    Description:Two quirky cats, Ollie and Moon, and their friend Stanley the snail travel the world, solving everyday problems and learning about different cultures.,
    Genres:Kids' TV,
    Released:2018
    

NEXT MOVIE

    Type:TV Show,
    Title:Tip the Mouse,
    Director:nan,
    Description:In his tiny meadow village, curious little mouse Tip enjoys fun and adventure while learning about the world with his furry friends and family.,
    Genres:Kids' TV,
    Released:2014
    

NEXT MOVIE

    Type:TV Show,
    Title:Go Dog Go,
    Director:nan,
    Description:Handy and inventive pup Tag chases adventure with her best pal, Scooch, solving problems and helping the citizens of Pawsto