# Contract Analysis Assistant

This is an AI-powered system for contract analysis, generating insights and professional reports with minimal manual intervention.

### Key Features:

1. **Input Selection**:
   - Users upload contracts and supporting documents or integrate external legal APIs.

2. **AI Planning**:
   - A team of AI analysts is generated, each specializing in a specific domain (e.g., compliance, finance, operations).
   - A `human-in-the-loop` review step lets the user refine the analysts' focus areas.

3. **AI Research**:
   - Analysts engage in multi-turn conversations with domain-specific AI experts.
   - Discussions cover strengths, weaknesses, risks, and improvements in the contract.

4. **Parallel Processing**:
   - Research and data extraction run simultaneously using `map-reduce` for speed and scalability.

5. **Customizable Reports**:
   - Insights are synthesized into professional reports tailored to user needs.


In [1]:
%pip install --quiet -U langgraph langchain-community langchain-openai docx pinecone[grpc] ipywidgets PyPDF2 python-docx

Note: you may need to restart the kernel to use updated packages.


In [2]:
from langchain_community.vectorstores import Pinecone
from langchain_community.embeddings import OpenAIEmbeddings
from typing import List, Dict, Optional
from pydantic import BaseModel
import json
import os
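# Import the Pinecone gRPC client. NOTE: this rebinds the name `Pinecone`,
# shadowing the LangChain vector store class imported at the top of this cell.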
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

import os
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Look up both API keys; each is None when its variable is not set.
openai_api_key = os.getenv('OPENAI_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')

# OpenAI key: warn when it is missing, otherwise (re-)export it.
if not openai_api_key:
    print("Warning: OPENAI_API_KEY not found in environment variables.")
    print("Please make sure you have a .env file with OPENAI_API_KEY=your_api_key")
else:
    os.environ["OPENAI_API_KEY"] = openai_api_key

# Pinecone key: same handling. No Pinecone client is created here.
if not pinecone_api_key:
    print("Warning: PINECONE_API_KEY not found in environment variables.")
    print("Please make sure you have a .env file with PINECONE_API_KEY=your_api_key")
else:
    os.environ["PINECONE_API_KEY"] = pinecone_api_key

## Clause Analysis Component (Pinecone)

In [5]:
class ClauseMetadata(BaseModel):
    """Pydantic model describing the metadata attached to a clause.

    The field names match the keys that
    ``ClauseRetriever._index_contract_clauses`` reads from
    ``clause["metadata"]``.

    NOTE(review): nothing in the visible code instantiates this model;
    confirm whether it is used elsewhere before relying on its validation.
    """
    # Presumably the legal jurisdiction the clause applies to — TODO confirm.
    jurisdiction: str
    # Version label of the clause; stored as free-form text.
    version: str
    # Presumably a date string; no format is enforced here — TODO confirm.
    last_updated: str


class ClauseRetriever:
    """Index contract clauses in Pinecone and retrieve them by similarity.

    Clauses are embedded with ``OpenAIEmbeddings`` and stored in the
    ``contract-clauses`` index together with their metadata. Lookups embed
    a query string and run a (optionally metadata-filtered) vector search.

    NOTE(review): this class only obtains a handle to the index and never
    creates it, so the index presumably has to exist already — confirm.
    """

    # Number of vectors buffered before they are sent in one upsert call.
    _UPSERT_BATCH_SIZE = 100

    def __init__(self, json_file_path: Optional[str] = None):
        """Connect to Pinecone and optionally index clauses from a JSON file.

        Args:
            json_file_path: Path to a JSON file of contract types and their
                clauses. When omitted (or empty) nothing is indexed and the
                retriever only queries what is already in the index.
        """
        # Initialize Pinecone with GRPC client
        self.pc = Pinecone(api_key=pinecone_api_key)

        self.index_name = "contract-clauses"
        self.embeddings = OpenAIEmbeddings()

        # Get index instance
        self.index = self.pc.Index(self.index_name)

        # NOTE(review): at this point the module-level name `Pinecone` refers
        # to `PineconeGRPC` (the `from pinecone.grpc import PineconeGRPC as
        # Pinecone` import replaces the earlier LangChain vector store
        # import), so this call goes to the gRPC client class, not to a
        # LangChain vector store. It is left unchanged to preserve current
        # behavior; no method of this class reads `self.vectorstore`.
        self.vectorstore = Pinecone(
            index=self.index,
            embedding=self.embeddings,
            text_key="text"
        )

        # Always defined, so the attribute exists even when no file is loaded.
        self.contract_types: List[Dict] = []

        # Only load and index clauses if json_file_path is provided
        if json_file_path:
            self._load_clauses(json_file_path)

    def _load_clauses(self, json_file_path: str) -> None:
        """Load contract types from a JSON file and index all their clauses.

        Args:
            json_file_path: Path to a UTF-8 encoded JSON file whose top-level
                value is iterated as a sequence of contract-type records.
        """
        # Explicit encoding so parsing does not depend on the platform default.
        with open(json_file_path, 'r', encoding="utf-8") as file:
            self.contract_types = json.load(file)

        # Process each contract type and its clauses
        for contract_data in self.contract_types:
            self._index_contract_clauses(contract_data)

    def _index_contract_clauses(self, contract_data: Dict) -> None:
        """Embed and upsert every clause of one contract type.

        Args:
            contract_data: Record with a ``"contract_type"`` string and a
                ``"clauses"`` list. Each clause needs ``"clause_title"``,
                ``"clause_text"`` and a ``"metadata"`` dict containing
                ``"jurisdiction"``, ``"version"`` and ``"last_updated"``.

        Raises:
            KeyError: If any of the keys listed above is missing.
        """
        contract_type = contract_data["contract_type"]

        vectors_to_upsert = []
        for clause in contract_data["clauses"]:
            # Text that is embedded and also stored under the "text" metadata
            # key. The layout (including indentation) is kept as-is so newly
            # indexed text matches what earlier runs stored.
            clause_text = f"""
            Contract Type: {contract_type}
            Clause Title: {clause['clause_title']}

            {clause['clause_text']}

            """

            clause_meta = clause["metadata"]
            metadata = {
                "contract_type": contract_type,
                "clause_title": clause["clause_title"],
                "jurisdiction": clause_meta["jurisdiction"],
                "version": clause_meta["version"],
                "last_updated": clause_meta["last_updated"],
                "text": clause_text,
            }

            vectors_to_upsert.append({
                # Id derived from type and title: lower-cased, spaces -> "-".
                "id": f"{contract_type}-{clause['clause_title']}".lower().replace(" ", "-"),
                "values": self.embeddings.embed_query(clause_text),
                "metadata": metadata,
            })

            # Flush a full batch and start a new list for the next one.
            if len(vectors_to_upsert) >= self._UPSERT_BATCH_SIZE:
                self.index.upsert(vectors=vectors_to_upsert)
                vectors_to_upsert = []

        # Upsert any remaining vectors
        if vectors_to_upsert:
            self.index.upsert(vectors=vectors_to_upsert)

    @staticmethod
    def _format_matches(matches) -> List[Dict]:
        """Convert raw query matches into the dicts returned to callers."""
        return [
            {
                "clause_title": match['metadata']["clause_title"],
                "clause_text": match['metadata']["text"],
                "metadata": match['metadata'],
                "relevance_score": match['score'],
            }
            for match in matches
        ]

    def _query(self, query_text: str, filter_dict: Optional[Dict], k: int) -> List[Dict]:
        """Embed ``query_text``, query the index and format the matches."""
        query_vector = self.embeddings.embed_query(query_text)
        results = self.index.query(
            vector=query_vector,
            top_k=k,
            filter=filter_dict,
            # The formatted results only use metadata and score, so the
            # stored vector values are not requested.
            include_values=False,
            include_metadata=True,
        )
        return self._format_matches(results['matches'])

    def get_clauses_by_contract_type(self,
                                    contract_type: str,
                                    jurisdiction: Optional[str] = None,
                                    k: int = 5) -> List[Dict]:
        """Retrieve clauses for a contract type, optionally by jurisdiction.

        Args:
            contract_type: Value the ``contract_type`` metadata must equal.
                Always applied as a filter, even when empty.
            jurisdiction: If given (non-empty), the ``jurisdiction`` metadata
                must equal it as well.
            k: Maximum number of matches to return.

        Returns:
            One dict per match with the keys ``clause_title``,
            ``clause_text``, ``metadata`` and ``relevance_score``.
        """
        filter_dict = {"contract_type": contract_type}
        if jurisdiction:
            filter_dict["jurisdiction"] = jurisdiction

        # A fixed, generic query string; the metadata filter does the
        # actual narrowing to the requested contract type.
        query_text = f"Find clauses for {contract_type} contract"
        return self._query(query_text, filter_dict, k)

    def search_clauses(self,
                        query: str,
                        contract_type: Optional[str] = None,
                        jurisdiction: Optional[str] = None,
                        k: int = 5) -> List[Dict]:
        """Search for clauses by semantic similarity to a free-text query.

        Args:
            query: Text that is embedded and used as the search vector.
            contract_type: If given (non-empty), restrict matches to it.
            jurisdiction: If given (non-empty), restrict matches to it.
            k: Maximum number of matches to return.

        Returns:
            One dict per match with the keys ``clause_title``,
            ``clause_text``, ``metadata`` and ``relevance_score``.
        """
        filter_dict = {}
        if contract_type:
            filter_dict["contract_type"] = contract_type
        if jurisdiction:
            filter_dict["jurisdiction"] = jurisdiction

        # Pass None rather than an empty dict when no filter applies.
        return self._query(query, filter_dict or None, k)