In [18]:
import os
import re
import openai
import pandas as pd
import pinecone
from typing import Dict, List, Any, Tuple
from dotenv import load_dotenv
from pathlib import Path
from transformers import GPT2TokenizerFast

## Loading Text Data


In [2]:
# Initialize a list of all the text files in the data folder
text_data: List[Dict[str, str]] = []

for filename in os.listdir('./data'):

    # Read and store the text of each "txt" file in the text_data list
    if filename.endswith('.txt'):
        with open(os.path.join('data', filename), 'r') as f:
            text = f.read()
            text_data.append({'filename': filename, 'text': text})

# Convert the text_data list into a DataFrame
df = pd.DataFrame(text_data)
df

Unnamed: 0,filename,text
0,article.txt,Do you know the difference between $1 million ...
1,interview.txt,Many entrepreneurs start a business with grand...
2,summary.txt,Section I: How We Got Here\nIn Section I of $1...


## Utility Functions


In [3]:
def divide_chunks(lst: List[Any], n: int):
    """
    Divides a list into chunks of size n. 
    """
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


def count_tokens(text: str, tokenizer: GPT2TokenizerFast) -> int:
    """
    Returns the number of tokens in the text.
    """
    return len(tokenizer.encode(text))

## Tokenizing and Chunking


In [4]:
# Initialize a tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

# Tokenize the text and count the number of tokens in each file
print(df['text'].apply(lambda x: count_tokens(x, tokenizer)))

# Go through each row of the Dataframe and tokenize the text. Split it into
# chunks of 512 tokens and store the chunks in a list
tokenized_chunks = []

for row in df.itertuples():

    # Tokenize the text
    tokenized_text: List[int] = tokenizer.encode(row.text)
    text_words: List[str] = re.findall(r'\w+', row.text)

    # Get words/token ratio
    num_tokens = len(tokenized_text)
    num_words = len(text_words)
    words_token_ratio = num_words / num_tokens

    # We want 512 tokens per chunk, so we get how many words is that by multiplying the
    # number of tokens by the words/token ratio. We add 15% to the number of words, because
    # the words/token ratio underestimates the number of words in the text for a set number
    # of tokens.
    words_per_chunk = int(512 * words_token_ratio * 1.15)

    # Split the tokenized text into chunks of 512 approximately 512 tokens
    chunks = list(divide_chunks(text_words, words_per_chunk))

    # Join the words in each chunk into a string
    for chunk in chunks:
        chunk_sentence = ' '.join(chunk)
        tokenized_chunk = tokenizer.encode(
            chunk_sentence,
            max_length=512,
            truncation=True,
        )

        tokenized_chunks.append({
            'filename': row.filename,
            'text': chunk_sentence,
            'num_tokens': len(tokenized_chunk),
            'tokens': tokenized_chunk,
        })

token_data = pd.DataFrame(tokenized_chunks)
token_data

Token indices sequence length is longer than the specified maximum sequence length for this model (3236 > 1024). Running this sequence through the model will result in indexing errors


0    3236
1    1409
2    5460
Name: text, dtype: int64


Unnamed: 0,filename,text,num_tokens,tokens
0,article.txt,Do you know the difference between 1 million a...,503,"[5211, 345, 760, 262, 3580, 1022, 352, 1510, 2..."
1,article.txt,was two to four years of management consulting...,498,"[9776, 734, 284, 1440, 812, 286, 4542, 18158, ..."
2,article.txt,business got complicated and after a while the...,512,"[22680, 1392, 8253, 290, 706, 257, 981, 262, 5..."
3,article.txt,when he started his first business No But was ...,496,"[12518, 339, 2067, 465, 717, 1597, 1400, 887, ..."
4,article.txt,project or career path before they ve given it...,487,"[16302, 393, 3451, 3108, 878, 484, 1569, 1813,..."
5,article.txt,skills that make the duo better together compa...,273,"[8135, 2171, 326, 787, 262, 18545, 1365, 1978,..."
6,interview.txt,Many entrepreneurs start a business with grand...,512,"[7085, 17038, 923, 257, 1597, 351, 4490, 3352,..."
7,interview.txt,premium and over delivering to satisfy the cli...,512,"[31605, 1505, 290, 625, 13630, 284, 15959, 262..."
8,interview.txt,the name of a promotion to something memorable...,204,"[1169, 1438, 286, 257, 12148, 284, 1223, 18078..."
9,summary.txt,Section I How We Got Here In Section I of 100M...,475,"[16375, 314, 1374, 775, 11853, 3423, 554, 7275..."


## Loading Dotenv


In [None]:
# Load the .env file with the OpenAI API key
# Note: Dont forget to have a .env file in "ansible/storage" with the OPENAI_API_KEY
load_dotenv(dotenv_path=Path('../ansible/storage/.env'))

## Embedding


In [20]:
# Set up the OpenAI API key
openai.api_key = os.getenv('OPENAI_API_KEY')

# Initialize a list to store the embeddings
embeddings: List[Tuple[str, List[float], Dict[str, Any]]] = []

# Go through each row of the token_data DataFrame and create an embedding for each chunk
for row in token_data.itertuples():

    # Create the embedding. This will generate a vector of 1536 dimensions for each chunk
    response = openai.Embedding.create(
        input=row.text,
        model="text-embedding-ada-002",
    )

    # Fetch the embedding vector
    vector = response['data'][0]['embedding']

    # Store the embedding
    embeddings.append((
        f"vector_{row.filename}_{row.Index}",
        vector,
        {'filename': row.filename, 'text': row.text}
    ))

## Adding Data to Pinecone


In [23]:
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')

# Raise an error if the PINECONE_API_KEY or PINECONE_ENVIRONMENT are not set
if PINECONE_API_KEY is None or PINECONE_ENVIRONMENT is None:
    raise ValueError(
        'Please set the PINECONE_API_KEY and PINECONE_ENVIRONMENT environment variables'
    )

# Get the dimensions of one of the embeddings
embedding_dimensions = len(embeddings[0][1])

# Initialize the Pinecone client
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT,
)

# Check if a "hormozi-gpt" index exists. If not, create it.
index_name = 'hormozi-gpt'
active_indexes = pinecone.list_indexes()
if index_name not in active_indexes:
    pinecone.create_index(name=index_name, dimension=embedding_dimensions)

# Describe the index
description = pinecone.describe_index(index_name)

# Check that the index has the correct dimensions
if description.dimension != embedding_dimensions:
    raise ValueError(
        f"The index {index_name} has dimension {description.dimension}, but the embeddings have dimension {embedding_dimensions}."
    )

# Insert the embeddings into the index
index = pinecone.Index(index_name=index_name)
upsert_response = index.upsert(
    vectors=embeddings,
)

## Querying Pinecone

These are not necessary, simply a test to check how to query Pinecone.


In [24]:
# Embed a test sentence
response = openai.Embedding.create(
    input="Write me a business plan",
    model="text-embedding-ada-002",
)

test_embedding = response['data'][0]['embedding']

In [26]:
index = pinecone.Index(index_name=index_name)
query_response = index.query(
    top_k=3,
    vector=test_embedding,
    include_metadata=True,
)

In [27]:
query_response

{'matches': [{'id': 'vector_interview.txt_6',
              'metadata': {'filename': 'interview.txt',
                           'text': 'Many entrepreneurs start a business with '
                                   'grand plans of reaping the rewards of '
                                   'their hard work and becoming a success '
                                   'Some make it and some donâ t Those that do '
                                   'have likely followed a structure a '
                                   'blueprint a method whether intentionally '
                                   'or otherwise Growing a successful and '
                                   'sustainable business is a formula that '
                                   'anyone can follow Following a tried and '
                                   'tested method means less trial and error '
                                   'and fewer mistakes Every business mistake '
                                   'has alr