## Preview the CSV dataset

In [1]:
import pandas as pd

In [2]:
# Load the Netflix Originals table, decoding the file as cp1252.
df = pd.read_csv(
    filepath_or_buffer='NetflixOriginals.csv',
    encoding='cp1252',
)

# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi
...,...,...,...,...,...,...
579,Taylor Swift: Reputation Stadium Tour,Concert Film,"December 31, 2018",125,8.4,English
580,Winter on Fire: Ukraine's Fight for Freedom,Documentary,"October 9, 2015",91,8.4,English/Ukranian/Russian
581,Springsteen on Broadway,One-man show,"December 16, 2018",153,8.5,English
582,Emicida: AmarElo - It's All For Yesterday,Documentary,"December 8, 2020",89,8.6,Portuguese


In [3]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       584 non-null    object 
 1   Genre       584 non-null    object 
 2   Premiere    584 non-null    object 
 3   Runtime     584 non-null    int64  
 4   IMDB Score  584 non-null    float64
 5   Language    584 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 27.5+ KB


## Load Llama3-8B as the LLM and the embedding model

In [9]:
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings

In [2]:
# A single Ollama model tag is shared by the text generator and the embedder.
MODEL = "llama3:latest"

embeddings = OllamaEmbeddings(model=MODEL)
model = Ollama(model=MODEL)

# Smoke test: a short generation confirms the model answers before the
# rest of the pipeline is built on top of it.
print(
    model.invoke("Tell me a joke")
)

Here's one:

Why couldn't the bicycle stand up by itself?

(Wait for it...)

Because it was two-tired!

Hope that made you smile! Do you want to hear another one?


## Load the CSV file, set the prompt, and create the chain

In [3]:
from langchain_core.output_parsers import StrOutputParser

In [4]:
# StrOutputParser converts the model output into a clean string. As noted by
# the author, llama3 already returns a plain string here, so the parser is
# kept for consistency rather than necessity.
parser = StrOutputParser()

# Minimal pipeline: model output piped into the parser.
# NOTE: `chain` is rebound by later cells that add the prompt and the retriever.
chain = model | parser
# print(chain.invoke("Tell me a joke"))

In [5]:
from langchain_community.document_loaders.csv_loader import CSVLoader

In [6]:
# Load the CSV as one Document per row. The encoding is passed explicitly so
# the file is decoded the same way as in the pandas preview
# (pd.read_csv(..., encoding='cp1252')); without it the loader falls back to
# the platform default encoding, which differs between operating systems.
loader = CSVLoader(file_path="NetflixOriginals.csv", encoding="cp1252")
data = loader.load()

# Preview only the first few documents instead of dumping every row.
print(data[:5])

[Document(metadata={'source': 'NetflixOriginals.csv', 'row': 0}, page_content='Title: Enter the Anime\nGenre: Documentary\nPremiere: August 5, 2019\nRuntime: 58\nIMDB Score: 2.5\nLanguage: English/Japanese'), Document(metadata={'source': 'NetflixOriginals.csv', 'row': 1}, page_content='Title: Dark Forces\nGenre: Thriller\nPremiere: August 21, 2020\nRuntime: 81\nIMDB Score: 2.6\nLanguage: Spanish'), Document(metadata={'source': 'NetflixOriginals.csv', 'row': 2}, page_content='Title: The App\nGenre: Science fiction/Drama\nPremiere: December 26, 2019\nRuntime: 79\nIMDB Score: 2.6\nLanguage: Italian'), Document(metadata={'source': 'NetflixOriginals.csv', 'row': 3}, page_content='Title: The Open House\nGenre: Horror thriller\nPremiere: January 19, 2018\nRuntime: 94\nIMDB Score: 3.2\nLanguage: English'), Document(metadata={'source': 'NetflixOriginals.csv', 'row': 4}, page_content='Title: Kaali Khuhi\nGenre: Mystery\nPremiere: October 30, 2020\nRuntime: 90\nIMDB Score: 3.4\nLanguage: Hindi'),

In [7]:
from langchain.prompts import PromptTemplate

In [10]:
# Prompt skeleton: the model must answer from the supplied context only and
# fall back to "I don't know" otherwise. {context} and {question} are filled
# in at call time.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)

# Render the template once with sample values to check the final wording.
print(
    prompt.format(
        question="What is the capital of Indonesia?",
        context="The capital of Indonesia is Nusantara.",
    )
)


Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: The capital of Indonesia is Nusantara.

Question: What is the capital of Indonesia?



In [17]:
# Rebind `chain` to the full pipeline: fill the prompt template, send it to
# the model, then pass the model output through the string parser.
chain = prompt | model | parser

In [13]:
# Sanity check with a hand-written context: the answer should be taken from
# the context string rather than from the model's own knowledge.
chain.invoke(
    dict(
        context="The name I was given was Fahri",
        question="What's my name?",
    )
)

'Your name is Fahri.'

## Create the vector database and use it as a retriever

In [3]:
from langchain_community.vectorstores import FAISS

In [None]:
# Build the FAISS vector store from the loaded CSV documents, using the
# Ollama embeddings object to vectorise them.
db = FAISS.from_documents(documents=data, embedding=embeddings)

In [17]:
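# Optional: uncomment the lines below to save the FAISS index to disk and/or
# load it back, so the embeddings do not have to be recomputed.
# Caution: allow_dangerous_deserialization=True should only be used with an
# index file you created yourself and trust.

# db.save_local("faiss_llama3_v1_netflix")
# new_db = FAISS.load_local("faiss_llama3_v1_netflix", embeddings, allow_dangerous_deserialization=True)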

In [13]:
# Set k to 1 so that the retriever returns only the single closest document.
# NOTE(review): with k=1 the prompt context holds one CSV row only; the final
# question asks for five recommendations, so a larger k may be needed - confirm.
retriever = db.as_retriever(search_kwargs={"k": 1})
# retriever.invoke("David Attenborough: A Life on Our Planet")

## Ask the question

In [18]:
from operator import itemgetter

# Input stage: the incoming dict's "question" is used twice - once as the
# retriever query (its result becomes the prompt context) and once verbatim
# as the prompt question.
retrieval_inputs = {
    "context": itemgetter("question") | retriever,
    "question": itemgetter("question"),
}

# Retrieval-augmented pipeline: gather inputs -> prompt -> model -> string.
chain = retrieval_inputs | prompt | model | parser

In [19]:
# Store the movie title the user watched most recently.
recent_movie = "ReMastered: The Lion's Share"

# Build the question with an f-string so the title is actually substituted;
# a plain string would send the literal text "{recent_movie}" to the
# retriever and the model.
question = (
    f"A user recently watched movie '{recent_movie}'. "
    "Based on the watch history, please recommend five candidate movies "
    "that the user might be interested in!"
)

# Stream the answer chunk by chunk so it is printed as it is generated.
for chunk in chain.stream({"question": question}):
    print(chunk, end="", flush=True)

I don't know