In [7]:
import pandas as pd 
import os 
# from openai.embeddings_utils import get_embedding
from openai import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
import podcast_scraper
from pinecone import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

from langchain_pinecone import PineconeVectorStore

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

from langchain.chat_models import ChatOpenAI

#### Building the recommendation system using podcast data 
After scraping podcast data from Chartable, we are going to build a recommendation system enhanced with LangChain and OpenAI.
We will be using the scraped dataset, which contains each podcast's title, description, and genres.

**Data Preprocessing** 

I first merge the text columns into a single `combined` field containing the podcast's title, overview, and genres.


In [2]:
# Read the scraped podcast records into a DataFrame.
podcast = podcast_scraper.load_podcast_json_to_dataframe('../data/podcast_data.json')

# Collapse title, description and genre of every row into one text field;
# this `combined` text is what gets embedded and indexed later on.
podcast['combined'] = podcast.apply(
    lambda record: (
        f"Title: {record['podcast']}. "
        f"Overview: {record['description']} "
        f"Genres: {record['type']}"
    ),
    axis=1,
)



**Embedding** 

We're going to convert the texts into numerical representations, referred to as embeddings. Generally speaking, embeddings turn unstructured text into numerical vectors, which makes tasks such as retrieval possible (we will go through retrieval in the following code). I'm using the text-embedding-3-small model, a new model introduced by OpenAI earlier this year. For more information, see [OpenAI's announcement](https://openai.com/index/new-embedding-models-and-api-updates/).

We're going to store the vectors in a column called `embedding`.

In [4]:
# Keep an OPENAI_API_KEY that is already exported in the environment; the
# placeholder below is only a fallback and must be replaced with a real key
# before running (never commit a real key to the notebook).
os.environ.setdefault("OPENAI_API_KEY", "YOUR-OPENAI-API-KEY")

# No key is passed explicitly, so the client relies on the environment
# variable set (or preserved) above.
client = OpenAI()



def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Newlines in ``text`` are flattened to spaces, then one request is sent
    through the module-level ``client`` for that single input.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The ``embedding`` attribute of the first (and only) item in the
        response's ``data`` list.
    """
    flattened = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[flattened], model=model)
    first_item = response.data[0]
    return first_item.embedding

# Embed every combined text; get_embedding is invoked once per row, so this
# issues one embeddings request per podcast.
podcast['embedding'] = podcast['combined'].apply(
    get_embedding,
    model='text-embedding-3-small',
)


**Splitting**

To fit into our model's context window, we need to split the long podcast information into smaller, meaningful chunks. LangChain offers several types of text splitters; for more information, see [the documentation](https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/). For this exercise, we will be using `RecursiveCharacterTextSplitter`. The advantage of this splitter is that it tries to keep paragraphs together for as long as possible, which preserves semantic context.

To access LangChain's text splitter, you need to create an API key on their [website](https://www.langchain.com/).


**Storing**
Next, we're going to store the split documents in a vector store. We will use Pinecone as the vector store with OpenAI embeddings. First, we need to create an index in the Pinecone [console](https://docs.pinecone.io/guides/indexes/create-an-index#create-a-serverless-index).
Then we will use the `from_documents` method, which accepts the document objects produced by LangChain's `RecursiveCharacterTextSplitter` class.


In [30]:
# Split each podcast's combined text into overlapping chunks, embed the chunks
# and upload them to an existing Pinecone index.

# Name of the Pinecone index, defined once so the index handle and the vector
# store below always refer to the same index. Replace with your own index name.
PINECONE_INDEX_NAME = 'your-index-name'

# Use text-embedding-3-small explicitly: it is the model the notebook uses for
# the `embedding` column, whereas OpenAIEmbeddings() with no arguments falls
# back to the library's default model.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Keep a PINECONE_API_KEY that is already exported in the environment; the
# placeholder is only a fallback and must be replaced with a real key.
os.environ.setdefault("PINECONE_API_KEY", "YOUR-PINECONE-API-KEY")

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# NOTE: despite its name, `index_name` holds the handle returned by pc.Index,
# not a string; the name is kept so any later cell that uses it still works.
index_name = pc.Index(PINECONE_INDEX_NAME)

# length_function=len measures chunks in characters (not tokens): chunks are
# at most 1000 characters long with a 100-character overlap between neighbours.
# add_start_index=True asks the splitter to record where each chunk starts in
# its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True)

# Wrap every combined text in a Document so the splitter can process it.
documents = [Document(page_content=text) for text in podcast['combined'].tolist()]
texts = text_splitter.split_documents(documents)

# Embed the chunks and upsert them into the Pinecone index.
vectorstore_from_docs = PineconeVectorStore.from_documents(
    texts,
    index_name=PINECONE_INDEX_NAME,
    embedding=embeddings
)



**Retrieval**

In [32]:
# Expose the Pinecone-backed vector store through the retriever interface so it
# can be handed to the conversational chain. No arguments are passed, so the
# retriever uses the library's default search settings.
retriever = vectorstore_from_docs.as_retriever()


In [43]:

# Chat model that answers the questions; temperature 0 is requested for the
# least random output.
llm = ChatOpenAI(
    temperature=0,
    model_name='gpt-3.5-turbo',
)

# Conversation history is stored under "chat_history" and returned as message
# objects rather than as one concatenated string.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

# Tie the model, the retriever and the memory together into one
# retrieval-augmented conversational chain.
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)


In [42]:
query = 'If I want to know what book i should read, which podcast would you recommend?'

# Chain.run is deprecated in LangChain; invoke returns a dict of outputs, and
# its "answer" entry is the reply text that run used to return directly.
chain.invoke({'question': query})['answer']



'I would recommend "What Should I Read Next?" podcast for finding book recommendations. Anne Bogel interviews readers about the books they love, hate, and are currently reading, then makes recommendations on what to read next. It\'s specifically designed to help you find your next read.'

#### Prompt engineering 
Pass a custom prompt that includes information about the user.


In [None]:
# Instruction part of the custom prompt; `{context}` is its only placeholder.
# The text is assembled line by line so that leading/trailing whitespace and
# blank lines are explicit.
template = "\n".join([
    "",
    "You're a podcast recommender system that helps users to find the right podcast that match their interest. ",
    "Use the following pieces of context to answer the question at the end.",
    "For each question, use the context and input provided by the user",
    "",
    "{context}",
    "",
])

# User-profile part of the prompt with the placeholders `{age}`, `{gender}`,
# `{profession}` and `{interest}`.
user_info = "\n".join([
    " The following are user input provided by the users ",
    "",
    "Age: {age}",
    "Gender: {gender}",
    "Profession: {profession}",
    "Topics of interest: {interest}",
    "",
    "",
])

# Closing part of the prompt; currently empty.
template_suffix = ""