## Expert Knowledge Worker

### A question answering agent that is an expert knowledge worker
### Can be used by anyone on their own LinkedIn data
The easiest and fastest way to obtain a copy of your LinkedIn data is to initiate a data download from your Settings & Privacy page:

1. Click the Me icon at the top of your LinkedIn homepage.
2. Select Settings & Privacy from the dropdown.
3. Click Data Privacy on the left rail.
4. Under the How LinkedIn uses your data section, click Get a copy of your data.
5. Select the data that you’re looking for and click Request archive.

This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy.

In [17]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np

# Pull settings from a local .env file before anything reads the environment;
# override=True is passed so the file's values are applied even for variables
# that are already set.
load_dotenv(override=True)

# Chat model handed to ChatOpenAI, and the directory Chroma persists to.
MODEL = "gpt-4o-mini"
db_name = "linkedin_db"

# Leave an existing OPENAI_API_KEY untouched; when none is set, fall back to a
# placeholder string (presumably meant to be replaced by hand).
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [None]:
# Read in documents using LangChain's loaders
# Put the chunks of data into a Vector Store (Chroma) that associates a Vector Embedding with each chunk

# Every sub-directory of linkedin-base/ is treated as one document type.
# Plain files sitting next to those folders are skipped: DirectoryLoader only
# accepts directories, so a stray file would abort the whole load.
folders = [path for path in glob.glob("linkedin-base/*") if os.path.isdir(path)]

def add_metadata(doc, doc_type):
    """Record *doc_type* under the ``"doc_type"`` metadata key of *doc*.

    The document is modified in place and the same object is returned, so the
    call can be used inside a comprehension.
    """
    doc.metadata.update({"doc_type": doc_type})
    return doc

# Read every file as UTF-8 instead of relying on the platform default encoding.
text_loader_kwargs = {'encoding': 'utf-8'}

# Load all Markdown files below each folder and tag every document with the
# folder's base name, which serves as its document type.
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    documents.extend(add_metadata(doc, doc_type) for doc in folder_docs)

# Split the documents into chunks of roughly 500 characters with a 100
# character overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)

# Stop here with an actionable message when nothing was loaded; otherwise the
# failure only surfaces later, inside the vector store or at the sample lookup.
if not chunks:
    raise ValueError(
        "No Markdown documents were found under 'linkedin-base/*'; "
        "there is nothing to put into the vector store"
    )

embeddings = OpenAIEmbeddings()

# Remove the collection left behind by an earlier run so that re-running this
# cell does not add the same chunks a second time.
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)

# The underlying collection is used directly for counting and for reading the
# raw embedding vectors.
collection = vectorstore._collection
count = collection.count()

# One stored vector is enough to learn the embedding dimensionality.
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)


print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}")
print(f"Vectorstore created with {count} documents")
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

In [None]:
# 2D scatter plot

# Pull every stored vector together with its chunk text and metadata.
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: from here on `documents` holds the chunk texts (plain strings); the
# 3D plot cell below reads it in that form.
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]

# One colour per known document type. A type that is not listed (any other
# folder name) is drawn in gray instead of aborting the cell.
type_colors = {'connections': 'blue', 'recommendations': 'green', 'profiles': 'red'}
colors = [type_colors.get(t, 'gray') for t in doc_types]

n = vectors.shape[0]
if n < 3:
    raise ValueError(f"t-SNE needs at least 3 samples, got {n}")

# Target perplexity is (n - 1) / 3 clamped to [5, 30]. t-SNE additionally
# requires perplexity < n, which the lower bound of 5 breaks for n <= 5, so the
# value is capped at n - 1. For n >= 6 the cap never applies.
perp = min(max(5.0, min(30.0, (n - 1) / 3.0)), float(n - 1))

tsne = TSNE(n_components=2, random_state=42, perplexity=perp)
reduced_vectors = tsne.fit_transform(vectors)

fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    # Hover text: the document type plus the first 100 characters of the chunk.
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    # A 2D figure takes its axis titles from the layout itself; `scene` only
    # affects 3D traces.
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# 3D scatter plot!

# Reuses `vectors`, `colors`, `doc_types` and `documents` (chunk texts) that
# the 2D plot cell prepared.
n = vectors.shape[0]
if n < 3:
    raise ValueError(f"t-SNE needs at least 3 samples, got {n}")

# Target perplexity is (n - 1) / 3 clamped to [5, 30]. t-SNE additionally
# requires perplexity < n, which the lower bound of 5 breaks for n <= 5, so the
# value is capped at n - 1. For n >= 6 the cap never applies.
perp = min(max(5.0, min(30.0, (n - 1) / 3.0)), float(n - 1))

tsne = TSNE(n_components=3, random_state=42, perplexity=perp)
reduced_vectors = tsne.fit_transform(vectors)

fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    # Hover text: the document type plus the first 100 characters of the chunk.
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# The main LangChain abstractions are: Memory, LLM, and Retriever

# Retriever: asks the vector store for 25 chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

# Memory: keeps the conversation under the 'chat_history' key as message objects.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# LLM: the chat model configured at the top of the notebook.
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# Wire the three pieces together into one conversational retrieval chain.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

def chat(question, history):
    """Answer *question* through the conversation chain and return the text.

    *history* is accepted because the chat UI passes it, but it is not read
    here; the chain is given only the question.
    """
    return conversation_chain.invoke({"question": question})["answer"]

# Header markup shown above the chat widget.
header_html = """
        <h2 style="color: #f5f5f5;">Linkedin Knowledge Worker</h2>
        <p style="color: #f5f5f5;">Chat with your auto-generated Linkedin knowledge base </p>
        """

# Assemble the page: the header first, then a chat interface backed by chat().
with gr.Blocks(theme="gradio/monochrome") as ui:
    gr.Markdown(header_html, elem_id="title")
    gr.ChatInterface(chat, type="messages")

# Start the app; inbrowser=True asks Gradio to open it in a browser tab.
ui.launch(inbrowser=True)