**Libraries Required**


In [9]:
import re
import os
import pandas as pd
import csv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from tqdm.auto import tqdm

**Read the document text file**

In [11]:
with open('../Reformatted_Wine_Descriptions_Enhanced NEW.txt', 'r') as file:
    lines = file.readlines() #split file into lines


**Extract wine name**

In [4]:
# Initialize a dictionary to store wine data
wine_data = {}

# Extract the names of the wines and other relevant data
for line in lines:
    if "is priced" in line:
        wine_name = line.split("is priced")[0].strip()
        wine_name = ''.join([i for i in wine_name if not i.isdigit()]) # Remove digits
        wine_name = wine_name.strip().rstrip(' mL').rstrip(' L') # Remove mL and L only from the end
        
        # Extract other relevant data
        data = line.split("is priced")[1].strip()
        
        # Check if the wine name already exists in the dictionary
        if wine_name in wine_data:
            wine_data[wine_name].append(data)
        else:
            wine_data[wine_name] = wine_name + ": " + data  # Include wine name in other data

# Write the wine names and other data into a CSV file
with open('wine_data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Wine Name', 'Other Data'])
    for name, data in wine_data.items():
        for item in data:
            writer.writerow([name, item])


In [12]:
df= pd.read_csv("../wine_data.csv")
df

Unnamed: 0,Wine Name,Other Data
0,Moorakyne Coonawarra Cabernet Sauvignon,Moorakyne Coonawarra Cabernet Sauvignon: at $2...
1,Riddoch Coonawarra Cabernet Sauvignon,Riddoch Coonawarra Cabernet Sauvignon: at $15 ...
2,Riddoch Coonawarra Cabernet Sauvignon,at $10 per bottle. The grape varietal is Caber...
3,Chapel Hill The Parson Cabernet Sauvignon,Chapel Hill The Parson Cabernet Sauvignon: at ...
4,Krondorf Winemakers Cabernet Sauvignon,Krondorf Winemakers Cabernet Sauvignon: at $20...
...,...,...
487,Yellowglen White,Yellowglen White: at $10 per bottle. This prod...
488,Brown Brothers Zibibbo,Brown Brothers Zibibbo: at $850 per bottle. Th...
489,Yellow Tail Sparkling Cocktails Mimosa Blood O...,Yellow Tail Sparkling Cocktails Mimosa Blood O...
490,Jacob's Creek Sparkling Moscato,Jacob's Creek Sparkling Moscato: at $940 per b...


**Groupby wine data under Unique wine name**

In [13]:
# Define a custom aggregation function to concatenate text
def combine_text(data):
    return ', '.join(data)

# Group by wine name and aggregate the "Other Data" column
grouped_df = df.groupby('Wine Name')['Other Data'].agg(combine_text).reset_index()

# Print the grouped DataFrame
grouped_df


Unnamed: 0,Wine Name,Other Data
0,Altos R Crianza Rioja,"Altos R Crianza Rioja: at $1,470 per bottle. T..."
1,Amelia & Trent Burge Versus Cuvee Brut,Amelia & Trent Burge Versus Cuvee Brut: at $20...
2,Angullong Sparkling Rosé,Angullong Sparkling Rosé: at $23 per bottle. T...
3,Arrogant Frog Rosé,Arrogant Frog Rosé: at $13 per bottle. This pr...
4,Artemis Pinot Noir,Artemis Pinot Noir: at $25 per bottle. This pr...
...,...,...
478,Yering Station Little Yering Pinot Noir,Yering Station Little Yering Pinot Noir: at $2...
479,Zonin Friuli Pinot Grigio,Zonin Friuli Pinot Grigio: at $17 per bottle. ...
480,Zonin Ventiterre Moscato,Zonin Ventiterre Moscato: at $16 per bottle. T...
481,Zonzo Estate Moscato,Zonzo Estate Moscato: at $25 per bottle. This ...


**Chunk Creation**

In [11]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
chunks_df = pd.DataFrame()
for product in grouped_df['Wine Name'].values:
    new_df = pd.DataFrame()
    data = grouped_df[grouped_df['Wine Name'] == product]['Other Data'].str.cat(sep=' ')

    data = data.replace("This",product).replace("The",product)
    
   
    chunks = text_splitter.split_text(data)
    new_df = pd.DataFrame({
        'Product Name': [product] * len(chunks),
        'Chunkid': range(len(chunks)),
        'Chunks': chunks,
       
    })

    chunks_df = pd.concat([chunks_df, new_df])

In [12]:
chunks_df

Unnamed: 0,Product Name,Chunkid,Chunks
0,Altos R Crianza Rioja,0,"Altos R Crianza Rioja: at $1,470 per bottle. A..."
1,Altos R Crianza Rioja,1,Rioja product number is 762176. Altos R Crianz...
0,Amelia & Trent Burge Versus Cuvee Brut,0,Amelia & Trent Burge Versus Cuvee Brut: at $20...
1,Amelia & Trent Burge Versus Cuvee Brut,1,& Trent Burge Versus Cuvee Brut bottle closure...
0,Angullong Sparkling Rosé,0,Angullong Sparkling Rosé: at $23 per bottle. A...
...,...,...,...
1,Zonin Ventiterre Moscato,1,"review titled ""Enjoyable"" says: ""bought this t..."
0,Zonzo Estate Moscato,0,Zonzo Estate Moscato: at $25 per bottle. Zonzo...
1,Zonzo Estate Moscato,1,Zonzo Estate Moscato product number is 65789. ...
0,d'Arenberg The Olive Grove Chardonnay,0,d'Arenberg d'Arenberg The Olive Grove Chardonn...


**Combine Product name with Chunkid**

In [13]:
chunks_df['Chunkid'] = chunks_df['Chunkid'].astype("str")
chunks_df['id'] = chunks_df['Product Name'] + "_" + chunks_df['Chunkid']
chunks_df.head()

Unnamed: 0,Product Name,Chunkid,Chunks,id
0,Altos R Crianza Rioja,0,"Altos R Crianza Rioja: at $1,470 per bottle. A...",Altos R Crianza Rioja_0
1,Altos R Crianza Rioja,1,Rioja product number is 762176. Altos R Crianz...,Altos R Crianza Rioja_1
0,Amelia & Trent Burge Versus Cuvee Brut,0,Amelia & Trent Burge Versus Cuvee Brut: at $20...,Amelia & Trent Burge Versus Cuvee Brut_0
1,Amelia & Trent Burge Versus Cuvee Brut,1,& Trent Burge Versus Cuvee Brut bottle closure...,Amelia & Trent Burge Versus Cuvee Brut_1
0,Angullong Sparkling Rosé,0,Angullong Sparkling Rosé: at $23 per bottle. A...,Angullong Sparkling Rosé_0


**Add metadata to our data**

In [14]:
chunks_df['source']=  '../Reformatted_Wine_Descriptions_Enhanced NEW (2).txt'
chunks_df['title']= chunks_df['Product Name']

**Embeddings creation and storing in Pinecone**

In [18]:
PINE_CLOUD_ENVIRONMENT = os.getenv("gcp-starter")
index_name= "chainindex"
from pinecone import Pinecone
pc = Pinecone(api_key= "")

In [19]:
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.019,
 'namespaces': {'test1': {'vector_count': 950}, 'wine': {'vector_count': 950}},
 'total_vector_count': 1900}

In [20]:
# Set OpenAI API key
os.environ["OPENAI_API_KEY"] = "" 

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings()

batch_size = 100

# function to remove non-ASCII characters from text
def remove_non_ascii(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)

# function to convert text to ASCII-compliant IDs
def convert_to_ascii_id(text):
    return re.sub(r'[^\x00-\x7F]+', '_', text)

for i in tqdm(range(0, len(chunks_df), batch_size)):
    i_end = min(len(chunks_df), i+batch_size)
    # get batch of df
    batch = chunks_df.iloc[i:i_end]
    # get IDs
    ids = [convert_to_ascii_id(x['id']) for _, x in batch.iterrows()]
    # get text to embed
    texts = [x['Chunks'] for _, x in batch.iterrows()]

    # remove non-ASCII characters from text
    ascii_texts = [remove_non_ascii(text) for text in texts]

    # embed documents
    embedd = embeddings.embed_documents(ascii_texts)

    # get metadata to store in Pinecone
    metadata = [
        {'text': text,
         'source': x['source'],
         'title': x['title']} for text, (_, x) in zip(ascii_texts, batch.iterrows())
    ]
    index.upsert(vectors=zip(ids, embedd, metadata), namespace= 'wine')


100%|███████████████████████████████████████████| 10/10 [02:31<00:00, 15.14s/it]
