In [5]:
import openai
import os
from  dotenv import load_dotenv,find_dotenv
# Locate the nearest .env file and load its variables into os.environ.
# override=True lets values from .env replace variables that are already set.
load_dotenv(find_dotenv(),override=True)

True

In [61]:
# Hand the key loaded from .env to the OpenAI client. The client reads the
# module attribute `api_key`; assigning to `openai.key` only creates an
# unused attribute and leaves the client unconfigured.
openai.api_key = os.getenv('OPENAI_API_KEY')

In [62]:
# Load the jupyter_ai_magics IPython extension into this kernel.
%load_ext jupyter_ai_magics

The jupyter_ai_magics extension is already loaded. To reload it, use:
  %reload_ext jupyter_ai_magics


In [46]:
#### Document loader
def load_document(file):
    """Load a PDF or DOCX file with the matching LangChain document loader.

    Args:
        file: Path to the document. The loader is selected from the file
            extension, compared case-insensitively ('.pdf' or '.docx').

    Returns:
        The result of the selected loader's ``load()`` call, or ``None`` when
        the extension is not supported (a message is printed in that case).
    """
    import os
    # TODO: refactor the extension dispatch into a factory
    _, extension = os.path.splitext(file)
    extension = extension.lower()
    print(f'loading {file}')
    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader = Docx2txtLoader(file)
    else:
        # No loader for this extension: report it and stop here rather than
        # falling through to loader.load() with `loader` never assigned.
        print('Document format not supported')
        return None
    return loader.load()
    

In [59]:
####Wikipedia loader
def load_from_wikipedia(query,lang='en',load_max_docs=2):
    """Load Wikipedia content for ``query`` through LangChain's WikipediaLoader.

    Args:
        query: Search text passed to the loader.
        lang: Language code passed to the loader (default 'en').
        load_max_docs: Document limit passed to the loader (default 2).

    Returns:
        Whatever the loader's ``load()`` returns.
    """
    from langchain.document_loaders import WikipediaLoader
    return WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    ).load()

In [75]:
####Split document into chunks of specified size
def chunk_data(data,chunk_size=256):
    """Split loaded documents into chunks with a recursive character splitter.

    Args:
        data: Documents to split, as accepted by ``split_documents``.
        chunk_size: Chunk size handed to the splitter (default 256).

    Returns:
        The chunks produced by ``split_documents``. Chunk overlap is fixed at 0.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    splitter_options = {'chunk_size': chunk_size, 'chunk_overlap': 0}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)
    

In [76]:
def print_embedding_cost(texts):
    """Print the token count of ``texts`` and an estimated embedding cost.

    Tokens are counted with the tiktoken encoding for
    'text-embedding-ada-002'; the cost is computed at a hard-coded rate of
    0.0004 USD per 1000 tokens.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
    """
    import tiktoken
    encoder = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens = 0
    for page in texts:
        total_tokens += len(encoder.encode(page.page_content))
    print(f'Total tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens/1000*0.0004:.6f}')

In [90]:
####Embed data to pinecone.returns index if already embedded
def insert_or_get_embeddings(index_name,chunks):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists it is opened as-is and ``chunks`` is ignored.
    Otherwise a cosine index of dimension 1536 is created and ``chunks`` are
    embedded into it with OpenAIEmbeddings.

    Args:
        index_name: Name of the Pinecone index to open or create.
        chunks: Documents to embed when the index has to be created.

    Returns:
        The LangChain Pinecone vector store bound to ``index_name``.
    """
    import pinecone
    api_key = os.environ.get('PINECONE_KEY')
    environment = os.environ.get('PINECONE_ENV')
    pinecone.init(api_key=api_key, environment=environment)
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings
    embedder = OpenAIEmbeddings()

    # Existing index: reuse the stored embeddings, nothing is re-embedded.
    if index_name in pinecone.list_indexes():
        store = Pinecone.from_existing_index(index_name=index_name, embedding=embedder)
        print(f'{index_name} already exists. Loading embeddings')
        return store

    # New index: create it, then embed and upload every chunk.
    print(f'Creating index: {index_name} and embeddings',end='')
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    store = Pinecone.from_documents(chunks, embedding=embedder, index_name=index_name)
    print('OK')
    return store


In [80]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default 'all' deletes
            every index returned by ``pinecone.list_indexes()``.
    """
    import pinecone
    pinecone.init(api_key=os.environ.get('PINECONE_KEY'),environment=os.environ.get('PINECONE_ENV'))
    if index_name == 'all':
        indexes = pinecone.list_indexes()
        print('Deleting all indexes.')
        for index in indexes:
            pinecone.delete_index(index)
        print('OK')
        return
    print(f'Deleting index:{index_name}')
    # Delete the requested index by name; `index` is only the loop variable of
    # the branch above and is undefined on this path.
    pinecone.delete_index(index_name)
    print('OK')
    
        

In [101]:
# Load the source PDF; the loader is chosen by load_document from the extension.
data = load_document('files/2021_1656420953953.pdf')

loading files/2021_1656420953953.pdf


In [102]:
# Split the loaded document using chunk_data's default chunk_size of 256.
chunked_data = chunk_data(data)

In [103]:
# Print the token count and estimated embedding cost before embedding anything.
print_embedding_cost(chunked_data)

Total tokens: 181847
Embedding Cost in USD: 0.072739


In [104]:
# WARNING: called with no argument, index_name defaults to 'all', so this
# deletes every index returned by pinecone.list_indexes().
delete_pinecone_index()

Deleting all indexes.
OK


In [105]:
# Open the 'pvranalytics' index if it already exists; otherwise create it and
# embed chunked_data into it.
index_name = 'pvranalytics'
vector_store = insert_or_get_embeddings(index_name=index_name,chunks=chunked_data)

Creating index: pvranalytics and embeddingsOK


In [94]:
def prompt_vector(question,vector_store):
    """Answer ``question`` from ``vector_store`` with a stateless RetrievalQA chain.

    The chain uses gpt-3.5-turbo at temperature 0, the "stuff" chain type, and
    a similarity retriever configured with k=3.

    Args:
        question: The question text to run through the chain.
        vector_store: Object exposing ``as_retriever``.

    Returns:
        The value returned by the chain's ``run`` call.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    retriever = vector_store.as_retriever(
        search_type='similarity',
        search_kwargs={'k': 3},
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
    )
    return qa_chain.run(question)

In [107]:
def prompt_vector_and_persist(question,vector_store,history=None):
    """Ask ``question`` through a conversational retrieval chain and record it.

    The chain uses gpt-3.5-turbo at temperature 1 and a similarity retriever
    configured with k=3. The (question, answer) pair is appended to
    ``history``.

    Args:
        question: The question text to send to the chain.
        vector_store: Object exposing ``as_retriever``.
        history: List of (question, answer) tuples from earlier turns. It is
            appended to in place. When omitted, a new empty list is used.

    Returns:
        A ``(result, history)`` tuple: the chain's result mapping and the
        updated history list.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI
    # Use a None sentinel: a `[]` default is created once and shared by every
    # call that omits `history`, which would mix unrelated conversations.
    if history is None:
        history = []
    gpt = ChatOpenAI(model="gpt-3.5-turbo",temperature=1)
    vector_retriever = vector_store.as_retriever(search_type='similarity',search_kwargs={'k':3})
    chain = ConversationalRetrievalChain.from_llm(llm=gpt,retriever=vector_retriever)
    result = chain({'question':question,'chat_history':history})
    history.append((question,result['answer']))
    return result,history

In [106]:
import time

# Interactive question/answer loop over the vector store. Each question is
# answered independently by prompt_vector; no chat history is kept.
i = 1
print('Write Quit or Exit to quit.')
while True:
    q = input(f'Question {i}: ').strip()
    if not q:
        # Blank input: prompt again without calling the LLM or using a number.
        continue
    i = i + 1
    if q.lower() in ['quit', 'exit']:
        print('Exiting')
        time.sleep(2)
        break
    answer = prompt_vector(q, vector_store)
    print(f'Answer: {answer}')
    print(f'\n{"-"*50}\n')
    

Write quite or Exit to quit.


Queation 1: What are the main revenue streams of pvr


Answer: The main revenue streams of PVR include box office revenues, food and beverage sales, advertisement revenue, convenience fees, income from movie production/distribution, and interest received.

--------------------------------------------------



Queation 2: can i get the revenue streams from each sources of revenue in a json format


Answer: I'm sorry, but I don't have access to the specific revenue streams from each source of revenue in a JSON format.

--------------------------------------------------



Queation 3: can i get the revenue streams from each sources of revenue streams


Answer: I'm sorry, but I don't have access to specific information about the revenue streams from each source.

--------------------------------------------------



Queation 4: can i get the revenue streams from each sources of icome


Answer: I'm sorry, but I don't have access to specific financial information for the company. It would be best to refer to the company's financial statements or annual reports for detailed information on the revenue streams from each source of income.

--------------------------------------------------



Queation 5: are there financial data in the uploaded document


Answer: Based on the given context, it is mentioned that the document includes standalone financial statements and provides required information and documents. Therefore, it is likely that there are financial data in the uploaded document.

--------------------------------------------------



Queation 6: can you extract the key strenghts of the company


Answer: Based on the given context, the key strengths of the company can be identified as follows:

1. Combined balance sheet strength: The company has a strong financial position, which can be leveraged to expand into Tier 2 and Tier 3 markets.

2. Cost and revenue synergies: The company has the potential to realize significant cost savings and revenue growth by combining its resources and operations.

3. Diversified stakeholders: The company has a wide range of stakeholders, including developers, content producers, technology service providers, the state exchequer, and employees. This diverse network of stakeholders can provide various benefits and opportunities for the company.

4. Fair value assessment: The company uses a discounted cash flow approach to determine the fair values of its businesses. This indicates a reliable and thorough evaluation of its assets and investments.

5. Focus on efficiency: The company collaborates with third parties to identify inefficiencies and impro

Queation 7: what are the future plans of the company


Answer: I'm sorry, but I don't have access to specific information about the future plans of the company. It would be best to reach out to the company directly for more information on their future plans.

--------------------------------------------------



Queation 8: from the standalone financials in the documents please extract the key ratios


Answer: I'm sorry, but I cannot access or extract specific information from documents. I can only provide general information and answer questions based on the context you provide.

--------------------------------------------------



Queation 9: Is the company financially sound


Answer: Based on the given context, it can be inferred that the company has adequate internal financial controls in place for both standalone and consolidated financial statements. However, the information provided does not directly indicate whether the company is financially sound. Additional information would be needed to determine the financial soundness of the company, such as its profitability, liquidity, and solvency.

--------------------------------------------------



Queation 10: quit


Exitng


In [108]:
# Start a new conversation: begin with an empty history and ask a first
# question. The returned history contains the new (question, answer) pair.
chat_history=[]
question = 'What is the document'
result,chat_history=prompt_vector_and_persist(question,vector_store,chat_history)


In [110]:
# Display the accumulated (question, answer) pairs.
chat_history


[('What is the document',
  'The document referred to is not specified in the provided context.')]