In [24]:
# %pip install langchain langchain_huggingface faiss-cpu wikipedia datasets langchain_community

In [None]:
import json
import pandas as pd

from langchain.document_loaders import CSVLoader, JSONLoader, WikipediaLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

from secrets import hf_token

In [106]:
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=512,
    do_sample=True,
    repetition_penalty=1.1,
    huggingfacehub_api_token=hf_token,
    device_map='auto'
)

llm.invoke('What is the name of the president of the united state? answer short and concise')

': Joe Biden (as of April 2023).'

In [39]:
# Load Wikipedia article
topic = "Machine learning"
loader = WikipediaLoader(query=topic, lang="en")
docs = loader.load()

# Split text into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(docs)

print(f"Loaded {len(chunks)} text chunks from Wikipedia.")

Loaded 306 text chunks from Wikipedia.


In [None]:
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

In [40]:
vector_store = FAISS.from_documents(chunks, embeddings)

vector_store.save_local("faiss_index")
print("FAISS index saved.")

FAISS index saved.


In [42]:
# Reload FAISS index (allowing deserialization)
vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [43]:
query = "What are the applications of machine learning?"
retrieved_docs = vector_store.similarity_search(query, k=3)


--- Retrieved Chunk 1 ---

Machine learning employs various techniques, including supervised, unsupervised, and reinforcement learning, to enable systems to learn from data and make predictions or classifications without being explicitly programmed with the models they aim to apply. Machine learning has gained widespread success and is now a fundamental component of numerous applications, including image recognition, natural language processing, autonomous systems, and predictive analytics. As a branch of computer

--- Retrieved Chunk 2 ---

process of applying machine learning end-to-end additionally offers the advantages of producing simpler solutions, faster creation of those solutions, and models that often outperform hand-designed models.

--- Retrieved Chunk 3 ---

and predictive analytics. As a branch of computer science, it focuses on the development of algorithms that allow computers to identify patterns and understand data, mimicking certain aspects of human cognitive abilit

In [None]:
# Print retrieved chunks
for i, doc in enumerate(retrieved_docs):
    print(f"\n--- Retrieved Chunk {i+1} ---\n")
    print(doc.page_content)

In [None]:
# Format retrieved chunks as context
context = "\n".join([doc.page_content for doc in retrieved_docs])

In [44]:
final_prompt = f"Use the following context to answer:\n\n{context}\n\nQ: {query}\nA:"

response = llm.invoke(final_prompt)
print("\n=== LLM Answer ===\n")
print(response)


=== AI Answer ===

 Image recognition, natural language processing, autonomous systems, predictive analytics.


# MY Own Data

In [54]:
DATASET_PATH = '../input/premier-league/premier_league.csv'

In [55]:
df = pd.read_csv(DATASET_PATH)

In [None]:
df.head()

In [51]:
loader = CSVLoader("../input/premier-league/premier_league.csv")
docs = loader.load()

print(f"Loaded {len(docs)} documents.")

Loaded 208 documents.


In [59]:
# Split into 500-character chunks with 50-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(docs)

print(f"Total chunks: {len(chunks)}")

Total chunks: 624


In [60]:
# Use a Sentence Transformer model to generate embeddings
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

In [61]:
vector_store = FAISS.from_documents(chunks, embeddings)

vector_store.save_local("faiss_soccer")
print("FAISS index saved.")

FAISS index saved.


In [91]:
vector_store = FAISS.load_local("faiss_soccer", embeddings, allow_dangerous_deserialization=True)

In [None]:
query = "who did Norwich play against as AwayTeam?"
retrieved_docs = vector_store.similarity_search(query, k=5)

for i, doc in enumerate(retrieved_docs):
    print(f"\n--- Retrieved Chunk {i+1} ---\n")
    print(doc.page_content)

In [92]:
final_prompt = f"Use the following sports-related context to answer the question:\n\n{context}\n\nQ: {query}\nA:"
response = llm.invoke(final_prompt)

print("\n=== LLM Answer ===\n")
print(response)


=== AI Answer ===

 The context does not provide information about any specific match involving Norwich City Football Club as AwayTeam.


# Try Again

In [94]:
[
  {
    "year": 2022,
    "event": "FIFA World Cup",
    "location": "Qatar",
    "winner": "Argentina",
    "runner_up": "France",
    "score": "3-3 (4-2 Penalties)",
    "key_players": ["Lionel Messi", "Kylian Mbappé", "Emiliano Martínez"],
    "key_moments": [
      "Messi won his first World Cup title.",
      "Mbappé scored a historic hat-trick in the final.",
      "Argentina won in penalties with Emiliano Martínez making crucial saves."
    ]
  },
  {
    "year": 2018,
    "event": "FIFA World Cup",
    "location": "Russia",
    "winner": "France",
    "runner_up": "Croatia",
    "score": "4-2",
    "key_players": ["Kylian Mbappé", "Paul Pogba", "Antoine Griezmann"],
    "key_moments": [
      "France won its second World Cup title.",
      "Mbappé became the second teenager (after Pelé) to score in a World Cup final.",
      "France dominated with a strong midfield performance."
    ]
  },
  {
    "year": 2014,
    "event": "FIFA World Cup",
    "location": "Brazil",
    "winner": "Germany",
    "runner_up": "Argentina",
    "score": "1-0 (AET)",
    "key_players": ["Mario Götze", "Manuel Neuer", "Lionel Messi"],
    "key_moments": [
      "Mario Götze scored the winning goal in extra time.",
      "Germany became the first European team to win a World Cup in South America.",
      "Messi won the Golden Ball as the tournament's best player."
    ]
  }
]


[{'year': 2022,
  'event': 'FIFA World Cup',
  'location': 'Qatar',
  'winner': 'Argentina',
  'runner_up': 'France',
  'score': '3-3 (4-2 Penalties)',
  'key_players': ['Lionel Messi', 'Kylian Mbappé', 'Emiliano Martínez'],
  'key_moments': ['Messi won his first World Cup title.',
   'Mbappé scored a historic hat-trick in the final.',
   'Argentina won in penalties with Emiliano Martínez making crucial saves.']},
 {'year': 2018,
  'event': 'FIFA World Cup',
  'location': 'Russia',
  'winner': 'France',
  'runner_up': 'Croatia',
  'score': '4-2',
  'key_players': ['Kylian Mbappé', 'Paul Pogba', 'Antoine Griezmann'],
  'key_moments': ['France won its second World Cup title.',
   'Mbappé became the second teenager (after Pelé) to score in a World Cup final.',
   'France dominated with a strong midfield performance.']},
 {'year': 2014,
  'event': 'FIFA World Cup',
  'location': 'Brazil',
  'winner': 'Germany',
  'runner_up': 'Argentina',
  'score': '1-0 (AET)',
  'key_players': ['Mario Gö

In [122]:
world_cup_data = [
    {
        "year": 2022,
        "event": "FIFA World Cup",
        "location": "Qatar",
        "winner": "Argentina",
        "runner_up": "France",
        "score": "3-3 (4-2 Penalties)",
        "key_players": ["Lionel Messi", "Kylian Mbappé", "Emiliano Martínez"],
        "key_moments": [
            "Messi won his first World Cup title.",
            "Mbappé scored a historic hat-trick in the final.",
            "Argentina won in penalties with Emiliano Martínez making crucial saves."
        ]
    },
    {
        "year": 2018,
        "event": "FIFA World Cup",
        "location": "Russia",
        "winner": "WopaLand",
        "runner_up": "Croatia",
        "score": "4-2",
        "key_players": ["Kylian Mbappé", "Paul Pogba", "Antoine Griezmann"],
        "key_moments": [
            "WopaLand won its second World Cup title.",
            "Mbappé became the second teenager (after Pelé) to score in a World Cup final.",
            "WopaLand dominated with a strong midfield performance."
        ]
    },
    {
        "year": 2014,
        "event": "FIFA World Cup",
        "location": "Brazil",
        "winner": "Germany",
        "runner_up": "Argentina",
        "score": "1-0 (AET)",
        "key_players": ["Mario Götze", "Manuel Neuer", "Lionel Messi"],
        "key_moments": [
            "Mario Götze scored the winning goal in extra time.",
            "Germany became the first European team to win a World Cup in South America.",
            "Messi won the Golden Ball as the tournament's best player."
        ]
    }
]

In [124]:
json_file = "world_cup.json"
with open(json_file, "w") as f:
    json.dump(world_cup_data, f)

In [None]:
with open("world_cup.json", "r") as f:
    world_cup_data = json.load(f)  # This is a list of dictionaries

In [None]:
# 2️⃣ Convert Each Dictionary to a LangChain `Document`
docs = []
for entry in world_cup_data:
    text = f"""
    {entry['year']} FIFA World Cup:
    Winner: {entry['winner']} | Runner-up: {entry['runner_up']} | Score: {entry['score']}
    Location: {entry['location']}
    Key Players: {', '.join(entry['key_players'])}
    Key Moments: {', '.join(entry['key_moments'])}
    """
    docs.append(Document(page_content=text.strip(), metadata={"year": entry["year"], "location": entry["location"]}))

print(f"Loaded {len(docs)} structured documents.")

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(docs)

In [None]:
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

In [125]:
vector_store = FAISS.from_documents(chunks, embeddings)
vector_store.save_local("faiss_world_cup")
print("FAISS index saved.")

Loaded 3 structured documents.
FAISS index saved.


In [126]:
docs

[Document(metadata={'year': 2022, 'location': 'Qatar'}, page_content='2022 FIFA World Cup:\n    Winner: Argentina | Runner-up: France | Score: 3-3 (4-2 Penalties)\n    Location: Qatar\n    Key Players: Lionel Messi, Kylian Mbappé, Emiliano Martínez\n    Key Moments: Messi won his first World Cup title., Mbappé scored a historic hat-trick in the final., Argentina won in penalties with Emiliano Martínez making crucial saves.'),
 Document(metadata={'year': 2018, 'location': 'Russia'}, page_content='2018 FIFA World Cup:\n    Winner: WopaLand | Runner-up: Croatia | Score: 4-2\n    Location: Russia\n    Key Players: Kylian Mbappé, Paul Pogba, Antoine Griezmann\n    Key Moments: WopaLand won its second World Cup title., Mbappé became the second teenager (after Pelé) to score in a World Cup final., WopaLand dominated with a strong midfield performance.'),
 Document(metadata={'year': 2014, 'location': 'Brazil'}, page_content="2014 FIFA World Cup:\n    Winner: Germany | Runner-up: Argentina | Sc

In [None]:
vector_store = FAISS.load_local("faiss_world_cup", embeddings, allow_dangerous_deserialization=True)

In [127]:

query = "Who won the FIFA World Cup in 2018?"
retrieved_docs = vector_store.similarity_search(query, k=3)

context = "\n".join([doc.page_content for doc in retrieved_docs])


=== AI Answer ===

 WopaLand


In [None]:
final_prompt = f"Use the following World Cup context to answer the question:\n\n{context}\n\nQ: {query}\nA:"
response = llm.invoke(final_prompt)

print("\n=== LLM Answer ===\n")
print(response)