**Libraries Required**

In [None]:
import re
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import json
import os
import csv
from tqdm.auto import tqdm
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter

**Extract Metadata**

In [5]:


# Placeholder stored when a text field cannot be found in an entry
_NOT_SPECIFIED = "Not specified"


def _first_group(pattern, text, default, flags=0):
    """Return group 1 of the first match of `pattern` in `text`, or `default` when nothing matches."""
    match = re.search(pattern, text, flags)
    return match.group(1) if match else default


# Define a function to extract wine information
def extract_wine_info(wine_entry):
    """Parse one free-text wine description into a metadata dictionary.

    Parameters
    ----------
    wine_entry : str
        Text describing a single wine, e.g. "<name> is priced at $<price>.
        The brand of wine is <brand>. ...".

    Returns
    -------
    dict
        Keys: "Wine Name", "Vineyard", "Grape Varietal", "Region",
        "Vintage Year", "Price", "Pairs Well With or Best Matched".
        Text fields fall back to "Not specified"; "Vintage Year" and "Price"
        fall back to 0. "Price" is an int for whole amounts and a float when
        the text carries a fractional part.
    """
    # MULTILINE lets the name be found even if the entry starts with a stray
    # blank line; an entry without the phrase gets the placeholder instead of
    # raising AttributeError on a missing match.
    wine_name = _first_group(r"^(.*?) is priced at", wine_entry, _NOT_SPECIFIED, re.MULTILINE)

    # Text fields end at the first '.', ',' or newline, or at the end of the
    # entry when the sentence is not terminated.
    # NOTE(review): a value containing '.' or ',' (e.g. an abbreviation) is cut
    # at that character, exactly as before — confirm against the data.
    vineyard = _first_group(r"The brand of wine is (.*?)(?:[.,\n]|$)", wine_entry, _NOT_SPECIFIED)
    grape_varietal = _first_group(r"The grape varietal is (.*?)(?:[.,\n]|$)", wine_entry, _NOT_SPECIFIED)
    region = _first_group(r"The region of origin is (.*?)(?:[.,\n]|$)", wine_entry, _NOT_SPECIFIED)

    # Four digits followed by a word boundary: accepts "2019.", "2019," and
    # "2019" at the end of the text, but still rejects longer digit runs.
    vintage_text = _first_group(r"The vintage is (\d{4})\b", wine_entry, None)
    vintage_year = int(vintage_text) if vintage_text is not None else 0

    # Capture digits with an optional decimal part only, so the full stop that
    # ends the sentence is never swallowed into the number.
    price_text = _first_group(r"priced at \$(\d+(?:\.\d+)?)", wine_entry, None)
    if price_text is None:
        price = 0
    elif "." in price_text:
        price_value = float(price_text)
        price = int(price_value) if price_value.is_integer() else price_value
    else:
        price = int(price_text)

    # Extract pairs well with or best matched (first word only)
    pairing = _first_group(
        r"(?:pair[s]? well with|best matched with) this wine is ([A-Za-z]+)",
        wine_entry,
        _NOT_SPECIFIED,
        re.IGNORECASE,
    )

    return {
        "Wine Name": wine_name,
        "Vineyard": vineyard,
        "Grape Varietal": grape_varietal,
        "Region": region,
        "Vintage Year": vintage_year,
        "Price": price,
        "Pairs Well With or Best Matched": pairing
    }

# Read the whole documents file into memory as a single string
with open("path", "r") as file:
    wine_entries = file.read()

# A blank line separates one wine's description from the next
wines = wine_entries.strip().split("\n\n")

# Parse every description into its metadata dictionary
wine_info_list = list(map(extract_wine_info, wines))


In [2]:
# Allow up to 500 rows to be shown when a frame is displayed
pd.set_option('display.max_rows', 500)

# One row per parsed wine; the source dictionaries are kept alongside in 'metadata'
df1 = pd.DataFrame(wine_info_list).assign(metadata=wine_info_list)


In [3]:
# Keep only the descriptive fields, then rebuild 'metadata' as one dict per row
metadata_columns = [
    'Wine Name',
    'Vineyard',
    'Grape Varietal',
    'Region',
    'Vintage Year',
    'Price',
    'Pairs Well With or Best Matched',
]
df1 = df1[metadata_columns].assign(
    metadata=lambda frame: frame.to_dict(orient='records')
)

In [4]:
# Store the 'Price' column with an integer dtype
df1 = df1.astype({'Price': 'int'})

In [6]:
# Load the wine CSV that sits one directory above the notebook
wine_csv_path = "../Wine_data.csv"
df = pd.read_csv(wine_csv_path)

In [6]:
# Add the metadata column built in df1 to the CSV-backed dataframe.
# (The original referenced an undefined name `a`; df1 is the frame that holds 'metadata'.)
# NOTE(review): the assignment aligns on the row index, so this assumes df and
# df1 list the wines in the same order — confirm.
df['metadata'] = df1['metadata'].copy()

**Chunk Creation**

In [7]:

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)

# Matches "This" / "The" only as whole words, so "These", "There", "Then", ...
# are left intact instead of having the product name spliced into them.
article_pattern = re.compile(r"\b(?:This|The)\b")

chunk_dfs = []
for product in df['Wine Name'].unique():
    # Select this wine's rows once and reuse them for both text and metadata
    product_rows = df[df['Wine Name'] == product]
    wine_data = product_rows['Other Data'].str.cat(sep=' ')

    # Swap the generic "This"/"The" for the wine name so each chunk names its
    # wine. A single pass means a product name that itself contains "The" is
    # not substituted again; the callable inserts the name literally.
    wine_data = article_pattern.sub(lambda _match: product, wine_data)

    chunks = text_splitter.split_text(wine_data)
    metadata = product_rows['metadata'].iloc[0]
    metadata_json = json.dumps(metadata)

    # Create a DataFrame to store the chunks and metadata
    new_df = pd.DataFrame({
        'Product Name': [product] * len(chunks),
        'Chunkid': range(len(chunks)),
        'Chunks': chunks,
        'Metadata': metadata_json
    })

    # Append the new DataFrame to the list
    chunk_dfs.append(new_df)

# Concatenate all chunk DataFrames into a single DataFrame
chunks_df = pd.concat(chunk_dfs, ignore_index=True)


**Combine Product Name and Chunkid to form a unique ID**

In [8]:
# Build a unique 'id' per chunk as "<Product Name>_<Chunkid>";
# 'Chunkid' is converted to text first so it can be concatenated.
chunks_df = chunks_df.assign(
    Chunkid=lambda frame: frame['Chunkid'].astype("str")
).assign(
    id=lambda frame: frame['Product Name'] + "_" + frame['Chunkid']
)


In [12]:
# Function to add 'text' key to metadata dictionary in 'Metadata' column
def add_text_to_metadata(df):
    """Return a copy of `df` whose 'Metadata' dicts carry the chunk text.

    Each 'Metadata' value is turned into a dictionary and the row's 'Chunks'
    value is stored in it under the 'text' key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Metadata' and 'Chunks' columns. A 'Metadata' value may be
        a JSON string or an already-parsed dict; anything else (or JSON that
        does not decode to a dict) is treated as empty metadata.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched, so calling this again
        on its own output gives the same result rather than wiping the
        metadata.

    Raises
    ------
    json.JSONDecodeError
        If a string 'Metadata' value is not valid JSON.
    """
    def _as_dict(value):
        # Already parsed (e.g. the function was applied before): keep as is
        if isinstance(value, dict):
            return value
        if isinstance(value, str):
            parsed = json.loads(value)
            return parsed if isinstance(parsed, dict) else {}
        return {}

    result = df.copy()
    # Build fresh dicts so the caller's objects are never modified in place
    result['Metadata'] = [
        {**_as_dict(meta), 'text': chunk}
        for meta, chunk in zip(df['Metadata'], df['Chunks'])
    ]
    return result

# Attach each chunk's text to its metadata under the 'text' key
chunks_df = add_text_to_metadata(chunks_df)

# Preview the first rows to check the updated 'Metadata' column
chunks_df.head(5)



Unnamed: 0,Product Name,Chunkid,Chunks,Metadata,id
0,Moorakyne Coonawarra Cabernet Sauvignon,0,Moorakyne Coonawarra Cabernet Sauvignon is pri...,{'Wine Name': 'Moorakyne Coonawarra Cabernet S...,Moorakyne Coonawarra Cabernet Sauvignon_0
1,Moorakyne Coonawarra Cabernet Sauvignon,1,Cabernet Sauvignon standard drinks per bottle ...,{'Wine Name': 'Moorakyne Coonawarra Cabernet S...,Moorakyne Coonawarra Cabernet Sauvignon_1
2,Riddoch Coonawarra Cabernet Sauvignon,0,Riddoch Coonawarra Cabernet Sauvignon is price...,{'Wine Name': 'Riddoch Coonawarra Cabernet Sau...,Riddoch Coonawarra Cabernet Sauvignon_0
3,Riddoch Coonawarra Cabernet Sauvignon,1,drinks per bottle is 8.3. Riddoch Coonawarra C...,{'Wine Name': 'Riddoch Coonawarra Cabernet Sau...,Riddoch Coonawarra Cabernet Sauvignon_1
4,Riddoch Coonawarra Cabernet Sauvignon,2,a bit of a punch. Long after taste and the win...,{'Wine Name': 'Riddoch Coonawarra Cabernet Sau...,Riddoch Coonawarra Cabernet Sauvignon_2


**Storing chunks in Pinecone**

In [14]:
# pinecone initialization

# "gcp-starter" is the environment's value, not the name of a variable to look
# up; read an override from PINECONE_ENVIRONMENT and fall back to that value.
PINE_CLOUD_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT", "gcp-starter")
index_name = "langchainindex"

# Keep the API key out of the notebook: it is taken from the PINECONE_API_KEY
# environment variable instead of a literal in the cell.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [16]:
# Get a handle on the Pinecone index named by index_name
index = pc.Index(index_name)

# Fetch the index statistics and show them as the cell output
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [3]:
# The OpenAI key must come from the environment. Assigning a literal here would
# overwrite a correctly configured key and put a secret in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this cell."
    )

embeddings = OpenAIEmbeddings()

# Function to remove non-ASCII characters from text
def remove_non_ascii(text):
    """Return `text` with every character above code point 0x7F dropped."""
    return ''.join(ch for ch in text if ord(ch) <= 0x7F)

# Function to convert text to ASCII-compliant IDs
def convert_to_ascii_id(text):
    """Return `text` with each run of non-ASCII characters replaced by one underscore."""
    ascii_pieces = re.split(r'[^\x00-\x7F]+', text)
    return '_'.join(ascii_pieces)

# Number of chunks embedded and upserted per request
batch_size = 100
for i in tqdm(range(0, len(chunks_df), batch_size)):
    i_end = min(len(chunks_df), i + batch_size)
    batch = chunks_df.iloc[i:i_end]

    # Read the columns directly rather than walking the batch with iterrows()
    # twice; values and order are the same.
    ids = [convert_to_ascii_id(raw_id) for raw_id in batch['id']]
    ascii_texts = [remove_non_ascii(text) for text in batch['Chunks']]

    # Extract metadata (one dict per chunk, same order as ids)
    metadata_dicts = batch['Metadata'].tolist()

    # Embed documents
    embedd = embeddings.embed_documents(ascii_texts)

    # Upsert documents into Pinecone index as a concrete list of
    # (id, vector, metadata) tuples rather than a one-shot zip iterator
    documents = list(zip(ids, embedd, metadata_dicts))
    index.upsert(vectors=documents, namespace='wine')
