# Crop Data Vectorization with Pinecone and OpenAI

This notebook implements a vector embedding system for crop recommendations and World Bank data using:
- OpenAI's text-embedding-3-small for embeddings
- Pinecone for vector storage
- Pandas for data processing

## Import Required Libraries

In [7]:
import json
import os
from typing import List, Dict, Any

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# Load variables from a local .env file (if any) before the clients are built.
load_dotenv()

# Module-level API clients used by the cells below.
openai_client = OpenAI()
pinecone_client = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

## Initialize Pinecone Index

In [8]:
# One-time setup: create the "crop-data" index.
# Note: Uncomment and run only if the index does not exist yet.

# pinecone_client.create_index(
#     "crop-data",
#     dimension=1536,  # OpenAI text-embedding-3-small dimension
#     metric="cosine",
#     spec=ServerlessSpec(cloud="aws", region="us-east-1")
# )

In [9]:
class CropDataVectorizer:
    """
    Turn crop records into OpenAI embeddings shaped for a Pinecone index.

    Each processed record is a dict with the keys ``id``, ``values`` (the
    embedding) and ``metadata``.
    """

    # The notebook targets a 1536-dimension index, which is the output size of
    # text-embedding-3-small; text-embedding-3-large produces larger vectors
    # that such an index would reject.
    DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small"

    # CSV columns describing growing conditions, in the order they appear in
    # the text description.
    CONDITION_COLUMNS = ("N", "P", "K", "temperature", "humidity", "ph", "rainfall")

    def __init__(self, index_name: str = 'crop-data',
                 embedding_model: str = DEFAULT_EMBEDDING_MODEL):
        """
        Initialize the vectorizer with OpenAI and Pinecone clients.

        Args:
            index_name: Name of the Pinecone index to read from and write to.
            embedding_model: OpenAI embedding model name. Its output dimension
                must match the dimension of the Pinecone index.
        """
        self.openai_client = OpenAI()
        self.index = pinecone_client.Index(index_name)
        self.embedding_model = embedding_model
        print(f"Initialized CropDataVectorizer with index: {index_name}")

    def get_embedding(self, text: str) -> List[float]:
        """
        Get the embedding vector for ``text`` from OpenAI's API.

        Args:
            text: The text to embed.

        Returns:
            The embedding as a list of floats, or an empty list when the
            response carries no data.
        """
        print(f"Getting embedding for text: {text}")
        response = self.openai_client.embeddings.create(
            input=text,
            model=self.embedding_model
        )
        # The v1 OpenAI client returns a response object, not a dict, so the
        # result is read through attributes (`response.data[0].embedding`).
        if response.data:
            return list(response.data[0].embedding)

        print("No embedding received.")
        return []

    def process_crop_recommendations(self, csv_path: str) -> List[Dict[str, Any]]:
        """
        Process crop recommendations CSV and create embeddings.

        Args:
            csv_path: Path to a CSV with a ``label`` column plus the columns
                listed in ``CONDITION_COLUMNS``.

        Returns:
            One record per CSV row with the keys ``id`` (``crop_<row index>``),
            ``values`` (the embedding) and ``metadata``.
        """
        print(f"Processing crop recommendations from: {csv_path}")
        df = pd.read_csv(csv_path)
        print(f"Loaded DataFrame with {len(df)} rows.")

        processed_data = []
        for idx, row in df.iterrows():
            # Combine relevant features into a text representation
            conditions_text = ", ".join(
                f"{column}={row[column]}" for column in self.CONDITION_COLUMNS
            )
            text_repr = f"Crop: {row['label']}. Suitable conditions: {conditions_text}"
            print(f"Text representation for crop {idx}: {text_repr}")

            embedding = self.get_embedding(text_repr)

            # Pinecone metadata values must be strings, numbers, booleans or
            # lists of strings, so the conditions are stored as flat top-level
            # keys instead of a nested dict.
            metadata = {
                "crop_name": row['label'],
                "text_description": text_repr,
            }
            for column in self.CONDITION_COLUMNS:
                metadata[column] = float(row[column])

            processed_data.append({
                "id": f"crop_{idx}",
                "values": embedding,
                "metadata": metadata,
            })
            print(f"Processed crop {row['label']} with ID: crop_{idx}")

        print("Finished processing crop recommendations.")
        return processed_data

In [10]:
def process_world_bank_data(self, json_path: str) -> List[Dict[str, Any]]:
    """
    Process World Bank pink sheet JSON and create embeddings.

    Args:
        json_path: Path to a JSON file with a top-level ``sections`` list.
            Each section has a ``title`` and a ``data`` list whose rows carry
            the keys ``commodity``, ``2024 Q2`` and ``unit``.

    Returns:
        One record per row with the keys ``id``, ``values`` (the embedding)
        and ``metadata``. Ids are numbered across all sections so they are
        unique within the returned list.
    """
    print(f"Loading data from: {json_path}")  # Debugging print
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    processed_data = []
    for section in data['sections']:
        print(f"Processing section: {section['title']}")  # Debugging print
        for row in section['data']:
            # Combine relevant features into a text representation
            # NOTE(review): the price comes from the '2024 Q2' column while the
            # text and metadata say "August" — confirm which period is meant.
            text_repr = f"Commodity: {row['commodity']}. Price: {row['2024 Q2']} {row['unit']}. "
            text_repr += f"Year: 2024. Month: August. "
            text_repr += f"Category: {section['title']}."
            print(f"Creating embedding for commodity: {row['commodity']}")  # Debugging print

            embedding = self.get_embedding(text_repr)

            # A per-section index would restart at 0 for every section and
            # make later rows overwrite earlier ones on upsert, so the id is
            # taken from the running count of records instead.
            record_id = f"wb_{len(processed_data)}"

            processed_data.append({
                "id": record_id,
                "values": embedding,
                "metadata": {
                    "commodity": row['commodity'],
                    # assumes the price is numeric or a numeric string — TODO confirm
                    "price": float(row['2024 Q2']),
                    "unit": row['unit'],
                    "year": 2024,
                    "month": "August",
                    "category": section['title'],
                    "text_description": text_repr
                }
            })

    print(f"Finished processing World Bank data. Total items processed: {len(processed_data)}")  # Debugging print
    return processed_data


# This cell defines the function at module level, so register it on the class
# from the earlier cell; otherwise `vectorizer.process_world_bank_data(...)`
# raises AttributeError. The class is looked up through globals() so the cell
# still loads when the class cell has not been run.
_vectorizer_cls = globals().get("CropDataVectorizer")
if _vectorizer_cls is not None:
    _vectorizer_cls.process_world_bank_data = process_world_bank_data

In [11]:
def store_vectors(self, processed_data: List[Dict[str, Any]], namespace: str, batch_size: int = 100):
    """
    Store vectors in Pinecone.

    Args:
        processed_data: Records with the keys ``id``, ``values`` and
            ``metadata``.
        namespace: Pinecone namespace to upsert into.
        batch_size: Maximum number of records sent per upsert call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Upsert in fixed-size slices; the last slice may be shorter.
    for start in range(0, len(processed_data), batch_size):
        batch = processed_data[start:start + batch_size]
        print(f"Storing batch {start // batch_size + 1} with {len(batch)} items in namespace: {namespace}")  # Debugging print
        self.index.upsert(vectors=batch, namespace=namespace)


# This cell defines the function at module level, so register it on the class
# from the earlier cell; otherwise `vectorizer.store_vectors(...)` raises
# AttributeError. The class is looked up through globals() so the cell still
# loads when the class cell has not been run.
_vectorizer_cls = globals().get("CropDataVectorizer")
if _vectorizer_cls is not None:
    _vectorizer_cls.store_vectors = store_vectors

def query_similar(self, query_text: str, namespace: str, top_k: int = 5):
    """
    Query similar items from Pinecone.

    Args:
        query_text: Free-text query; it is embedded with ``self.get_embedding``.
        namespace: Pinecone namespace to search.
        top_k: Maximum number of matches to request.

    Returns:
        The response of ``self.index.query``, with metadata included.
    """
    print(f"Querying for similar items to: '{query_text}' in namespace: {namespace} with top_k: {top_k}")  # Debugging print
    query_embedding = self.get_embedding(query_text)

    results = self.index.query(
        vector=query_embedding,
        namespace=namespace,
        top_k=top_k,
        include_metadata=True
    )

    print(f"Query completed. Found {len(results['matches'])} similar items.")  # Debugging print
    return results


# This cell defines the function at module level, so register it on the class
# from the earlier cell; otherwise `vectorizer.query_similar(...)` raises
# AttributeError. The class is looked up through globals() so the cell still
# loads when the class cell has not been run.
_vectorizer_cls = globals().get("CropDataVectorizer")
if _vectorizer_cls is not None:
    _vectorizer_cls.query_similar = query_similar

## Example Usage

In [12]:
# Input files and target namespaces for this ingestion run.
CROP_CSV_PATH = "Crop_recommendation.csv"
CROP_NAMESPACE = "crop_recommendations"
WORLD_BANK_JSON_PATH = "world_bank_prices.json"
WORLD_BANK_NAMESPACE = "world_bank_prices"

# Step 1: Build the vectorizer against the default index
print("Initializing CropDataVectorizer...")  # Debugging print
vectorizer = CropDataVectorizer()

# Step 2: Embed every row of the crop recommendation CSV
print("Processing crop recommendations from 'Crop_recommendation.csv'...")  # Debugging print
crop_data = vectorizer.process_crop_recommendations(CROP_CSV_PATH)
print(f"Processed {len(crop_data)} crop recommendations.")  # Debugging print

# Step 3: Upsert the crop vectors into their own namespace
vectorizer.store_vectors(crop_data, namespace=CROP_NAMESPACE)
print("Stored crop recommendations in namespace 'crop_recommendations'.")  # Debugging print

# Step 4: Embed the World Bank price rows
print("Processing World Bank data from 'world_bank_prices.json'...")  # Debugging print
wb_data = vectorizer.process_world_bank_data(WORLD_BANK_JSON_PATH)
print(f"Processed {len(wb_data)} World Bank data items.")  # Debugging print

# Step 5: Upsert the World Bank vectors into their own namespace
vectorizer.store_vectors(wb_data, namespace=WORLD_BANK_NAMESPACE)
print("Stored World Bank data in namespace 'world_bank_prices'.")  # Debugging print

Initializing CropDataVectorizer...
Initialized CropDataVectorizer with index: crop-data
Processing crop recommendations from 'Crop_recommendation.csv'...
Processing crop recommendations from: Crop_recommendation.csv
Loaded DataFrame with 100 rows.
Text representation for crop 0: Crop: rice. Suitable conditions: N=90, P=42, K=43, temperature=20.87974371, humidity=82.00274423, ph=6.502985292000001, rainfall=202.9355362
Getting embedding for text: Crop: rice. Suitable conditions: N=90, P=42, K=43, temperature=20.87974371, humidity=82.00274423, ph=6.502985292000001, rainfall=202.9355362


APIConnectionError: Connection error.

## Test Queries

In [None]:
# Example queries: one free-text search per namespace.
crop_query = "crops suitable for high rainfall and acidic soil"
price_query = "rice prices in recent months"

# Search the crop recommendation vectors
crop_results = vectorizer.query_similar(
    query_text=crop_query,
    namespace="crop_recommendations",
)
print("Similar crops:", crop_results)

# Search the World Bank price vectors
price_results = vectorizer.query_similar(
    query_text=price_query,
    namespace="world_bank_prices",
)
print("\nPrice data:", price_results)