In [2]:
## Data Ingestion (read from files)
from langchain_community.document_loaders import TextLoader

# Read a local plain-text file. The path is relative, so the file must be
# reachable from the kernel's current working directory.
# NOTE: `loader` and `text_doc` are rebound by the web and PDF loader cells
# further down, so the value produced here only lives until those cells run.
loader = TextLoader("speech.txt")
# load() yields a list of Document objects (the cell output shows
# `[Document(page_content=...)]`).
text_doc = loader.load()
# Bare trailing expression so the notebook renders the loaded documents.
text_doc

[Document(page_content='I have a dream that one day down in Alabama, with its vicious racists, with its governor having his lips dripping with the words of interposition and nullification – one day right there in Alabama little black boys and black girls will be able to join hands with little white boys and white girls as sisters and brothers.\n\nI have a dream today.\n\nI have a dream that one day every valley shall be exalted, and every hill and mountain shall be made low, the rough places will be made plain, and the crooked places will be made straight, and the glory of the Lord shall be revealed and all flesh shall see it together.\n\nThis is our hope. This is the faith that I go back to the South with. With this faith we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together, to st

In [3]:
import os
from dotenv import load_dotenv

# Pull KEY=VALUE pairs from a local .env file (if one exists) into os.environ,
# so the key never has to be hard-coded in the notebook.
load_dotenv()

# Fail fast with a readable message when the key is missing or empty.
# The previous `os.environ[...] = os.getenv(...)` was a no-op when the key was
# present and raised an opaque "str expected, not NoneType" TypeError when it
# was absent. The value is deliberately not bound to a notebook variable, so
# the secret cannot be echoed by accident (e.g. via `%whos` or a stray cell
# output).
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file next to this "
        "notebook or export it in the environment before starting the kernel."
    )

In [6]:
# Web-based loader: fetch one HTML page and keep only selected elements
from langchain_community.document_loaders import WebBaseLoader
import bs4

# This cell only loads the page; chunking and indexing happen in later cells.
# `bs_kwargs` is passed to the loader with a SoupStrainer, so only elements
# carrying one of the listed CSS classes (title, header, body) are parsed.
loader = WebBaseLoader(
    web_path=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header"),
        ),
    },
)

# Rebinds `text_doc` (previously the text-file documents) to the web page.
text_doc = loader.load()
text_doc


[Document(page_content='\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final re

In [7]:
# Read from PDF
from langchain_community.document_loaders import PyPDFLoader
# Relative path: the PDF must be reachable from the kernel's working directory.
loader = PyPDFLoader("2D3MF.pdf")

# load() yields a list of Document objects (see the cell output);
# presumably one Document per PDF page — TODO confirm against PyPDFLoader docs.
# NOTE: this rebinds `text_doc` once more. When cells run top to bottom, these
# PDF documents are what the text-splitter cell below receives.
text_doc = loader.load()
# Bare trailing expression so the notebook renders the loaded documents.
text_doc

[Document(page_content='2D3MF - DEEPFAKE DETECTION USING MULTI-MODAL MIDDLE FUSION\nAdrian S. Roman, Aiden Chang, Hyunkeun Park, Kevin Hopkins, Shrutika Shrutika, Tom Yang\nViterbi School of Engineering, University of Southern California, California, USA\nABSTRACT\nDeepfake detection is the task of detecting videos that have\nbeen generated or manipulated using deep learning. Detect-\ning deepfakes is crucial to prevent the spread of misinfor-\nmation in audio-visual media. Recent advancements in the\nfield include joint learning of audio and visual information,\nby training independent modules and making a decision be-\ntween learned embeddings from both modalities. While pre-\nvious methods are robust when mainly the video content has\nbeen manipulated, they often face challenges when only the\naudio is manipulated. Our model, dubbed 2D3MF, proposes\na novel method that exploits the relationship between emo-\ntions conveyed in audio and video for multi-modal deepfake\ndetection. We b

In [13]:
## Data Transform (split documents into chunks)
from langchain.text_splitter import RecursiveCharacterTextSplitter

# chunk_size / chunk_overlap are presumably measured in characters (the
# splitter's default length function) — confirm if a custom one is ever passed.
# The overlap means consecutive chunks share trailing/leading text.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# Splits whatever `text_doc` currently holds; with in-order execution that is
# the PDF loaded in the previous cell (the output starts with the paper title).
docs = text_splitter.split_documents(text_doc)
# Preview only the first three chunks instead of dumping the whole list.
docs[:3]


[Document(page_content='2D3MF - DEEPFAKE DETECTION USING MULTI-MODAL MIDDLE FUSION\nAdrian S. Roman, Aiden Chang, Hyunkeun Park, Kevin Hopkins, Shrutika Shrutika, Tom Yang\nViterbi School of Engineering, University of Southern California, California, USA\nABSTRACT\nDeepfake detection is the task of detecting videos that have\nbeen generated or manipulated using deep learning. Detect-\ning deepfakes is crucial to prevent the spread of misinfor-\nmation in audio-visual media. Recent advancements in the\nfield include joint learning of audio and visual information,\nby training independent modules and making a decision be-\ntween learned embeddings from both modalities. While pre-\nvious methods are robust when mainly the video content has\nbeen manipulated, they often face challenges when only the\naudio is manipulated. Our model, dubbed 2D3MF, proposes\na novel method that exploits the relationship between emo-\ntions conveyed in audio and video for multi-modal deepfake\ndetection. We b

In [24]:
## Vector Embedding and Vector Store
# NOTE(review): newer LangChain releases mark this OpenAIEmbeddings import path
# as deprecated in favour of the separate `langchain-openai` package — confirm
# against the installed version before upgrading.
from langchain_community.embeddings import OpenAIEmbeddings  # embedding model wrapper
from langchain_community.vectorstores import Chroma  # vector store (database)

# Embed every chunk in `docs` and index the vectors in a Chroma store.
# OpenAIEmbeddings() is built with no arguments, so it presumably picks up
# OPENAI_API_KEY from the environment prepared in the earlier dotenv cell.
# NOTE(review): no persist directory or collection name is given; re-running
# this cell in the same kernel may append the same chunks again to Chroma's
# default collection — verify, and reset the collection if duplicates appear.
db = Chroma.from_documents(docs, OpenAIEmbeddings())

# Query the Chroma vector DB
query = "What is 2D3MF model"
# Returns a list of matching Document objects (indexed below);
# presumably ordered most-similar first — TODO confirm.
res = db.similarity_search(query)
# Show the text of the first hit.
res[0].page_content

'Fig. 2 . High-level overview of our 2D3MF with audio and video inputs fused using Self and Cross-Attention middle fusion via\ntransformer attention.\nand speaker identification. These networks extract useful fa-\ncial and speech representations, proving essential for the ef-\nfectiveness of DVD tasks.\nIn this work, we study the utility of audio-visual emo-\ntion speaker embeddings, representations extracted from pre-\ntrained audio and video networks, as robust features for the\nDVD task. To the best of our knowledge, we are the first\nto leverage abstract representations of emotions in the audio-\nvisual domain to highlight and detect inconsistencies in fake\nvideos. We propose 2D3MF (Deepfake Detection with Multi\nModal Middle Fusion), which is a novel middle fusion strat-\negy where audio and visual data are synergistically analyzed\nto capture discrepancies in emotional expressions, and vocal\ntones. These features reveal the subtle yet critical flaws inher-'

In [25]:
## Use FAISS Vector DB
from langchain_community.vectorstores import FAISS
# Depends on earlier cells: `docs` comes from the text-splitter cell and
# `OpenAIEmbeddings` is imported in the Chroma cell, so this cell fails on a
# fresh kernel unless those have run first.
# The chunks are embedded again here (a new OpenAIEmbeddings() call), and
# `db`, `query` and `res` from the Chroma cell are rebound.
db = FAISS.from_documents(docs, OpenAIEmbeddings())
query = "What is 2D3MF model"
# Same query as the Chroma cell; the recorded output shows the same top chunk.
res = db.similarity_search(query)
# Show the text of the first hit.
res[0].page_content

'Fig. 2 . High-level overview of our 2D3MF with audio and video inputs fused using Self and Cross-Attention middle fusion via\ntransformer attention.\nand speaker identification. These networks extract useful fa-\ncial and speech representations, proving essential for the ef-\nfectiveness of DVD tasks.\nIn this work, we study the utility of audio-visual emo-\ntion speaker embeddings, representations extracted from pre-\ntrained audio and video networks, as robust features for the\nDVD task. To the best of our knowledge, we are the first\nto leverage abstract representations of emotions in the audio-\nvisual domain to highlight and detect inconsistencies in fake\nvideos. We propose 2D3MF (Deepfake Detection with Multi\nModal Middle Fusion), which is a novel middle fusion strat-\negy where audio and visual data are synergistically analyzed\nto capture discrepancies in emotional expressions, and vocal\ntones. These features reveal the subtle yet critical flaws inher-'