In [6]:
pip install langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install unstructured

Collecting unstructuredNote: you may need to restart the kernel to use updated packages.

  Using cached unstructured-0.16.0-py3-none-any.whl.metadata (24 kB)
Collecting chardet (from unstructured)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting lxml (from unstructured)
  Downloading lxml-5.3.0-cp311-cp311-win_amd64.whl.metadata (3.9 kB)
Collecting nltk (from unstructured)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting tabulate (from unstructured)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting beautifulsoup4 (from unstructured)
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5

In [1]:
pip install langchain python-dotenv unstructured azure-openai

Collecting unstructured
  Downloading unstructured-0.16.0-py3-none-any.whl.metadata (24 kB)
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement azure-openai (from versions: none)
ERROR: No matching distribution found for azure-openai


In [1]:
import os
from typing import List, Dict
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import AzureChatOpenAI
from langchain.chains import SequentialChain, LLMChain
from langchain.prompts import PromptTemplate
from langchain.callbacks import get_openai_callback
from langchain_openai import AzureChatOpenAI

In [2]:
class PDDAnalyzer:
    def __init__(self):
        # Initialize Azure OpenAI
        self.llm = AzureChatOpenAI(
            openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            azure_deployment=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"),
            openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            model_name=os.getenv("LLM_MODEL"),
            temperature=0
        )
        
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=200
        )
        
    def load_document(self, file_path: str) -> str:
        """Load and process document from specified path"""
        try:
            loader = UnstructuredFileLoader(file_path)
            documents = loader.load()
            splits = self.text_splitter.split_documents(documents)
            return "\n".join([doc.page_content for doc in splits])
        except Exception as e:
            print(f"Error loading document: {str(e)}")
            raise

    def setup_prompts(self) -> Dict[str, PromptTemplate]:
        """Setup all prompt templates"""
        prompts = {
            "summarize": PromptTemplate(
                input_variables=["content"],
                template="""Summarize the following process design document content:
                {content}
                
                Provide a clear and concise summary focusing on:
                1. Main objectives
                2. Key processes
                3. System interactions
                """
            ),
            
            "extract_steps": PromptTemplate(
                input_variables=["content", "summary"],
                template="""Based on this process design document and its summary:
                Summary: {summary}
                Content: {content}
                
                List all the implementation steps in a clear, numbered format.
                For each step, include:
                - Step description
                - Input requirements
                - Expected output
                """
            ),
            
            "technical_solution": PromptTemplate(
                input_variables=["steps", "technology"],
                template="""Given these implementation steps:
                {steps}
                
                Provide a detailed technical solution using {technology} technology.
                Include:
                1. Specific components/modules needed
                2. Implementation approach for each step
                3. Potential technical challenges
                4. Integration points
                """
            ),
            
            "time_estimation": PromptTemplate(
                input_variables=["solution", "skill_level"],
                template="""Based on this technical solution:
                {solution}
                
                Estimate the implementation time for each component considering a {skill_level} developer.
                Provide:
                1. Time estimate per component
                2. Total project duration
                3. Potential bottlenecks
                4. Risk factors affecting timeline
                """
            ),
            
            "cost_estimation": PromptTemplate(
                input_variables=["time_estimate", "technology"],
                template="""Based on the implementation timeline:
                {time_estimate}
                
                Provide a detailed cost estimation for implementing this solution with {technology}.
                Include:
                1. Development costs (using standard rates)
                2. Infrastructure/hosting costs
                3. Third-party services/licenses
                4. Maintenance costs
                5. Total project cost range
                """
            )
        }
        return prompts

    def create_chains(self, prompts: Dict[str, PromptTemplate]) -> SequentialChain:
        """Create sequential chain from prompts"""
        chains = {
            "summary": LLMChain(
                llm=self.llm,
                prompt=prompts["summarize"],
                output_key="summary"
            ),
            "steps": LLMChain(
                llm=self.llm,
                prompt=prompts["extract_steps"],
                output_key="steps"
            ),
            "solution": LLMChain(
                llm=self.llm,
                prompt=prompts["technical_solution"],
                output_key="solution"
            ),
            "time": LLMChain(
                llm=self.llm,
                prompt=prompts["time_estimation"],
                output_key="time_estimate"
            ),
            "cost": LLMChain(
                llm=self.llm,
                prompt=prompts["cost_estimation"],
                output_key="cost_estimate"
            )
        }
        
        return SequentialChain(
            chains=[chains["summary"], chains["steps"], chains["solution"], 
                   chains["time"], chains["cost"]],
            input_variables=["content", "technology", "skill_level"],
            output_variables=["summary", "steps", "solution", "time_estimate", "cost_estimate"]
        )

    def analyze_pdd(self, file_path: str, technology: str, skill_level: str) -> Dict:
        """Main method to analyze PDD document"""
        try:
            content = self.load_document(file_path)
            prompts = self.setup_prompts()
            chain = self.create_chains(prompts)
            
            with get_openai_callback() as cb:
                result = chain({"content": content, 
                              "technology": technology,
                              "skill_level": skill_level})
                print(f"\nTotal Tokens Used: {cb.total_tokens}")
                print(f"Total Cost (USD): ${cb.total_cost}")
            
            return result
        except Exception as e:
            print(f"Error during analysis: {str(e)}")
            raise



In [3]:
# Example usage in Jupyter notebook

# First set up your environment variables
import os

os.environ["AZURE_OPENAI_API_VERSION"] = "2023-07-01-preview"
os.environ["AZURE_OPENAI_ENDPOINT"] = 'https://dskumar.openai.azure.com/'
os.environ["AZURE_OPENAI_API_KEY"] = "62855d6dd08945819bf83aee0c104127"
os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] = "DskumarDeployment"
os.environ['OPENAI_TYPE'] = "Azure"
os.environ["LLM_MODEL"] = "gpt-35-turbo-16k"
os.environ["LLM_EMBEDDING_MODEL"] = "dskumar-text-embedding-ada-002"



In [4]:
# Initialize analyzer
analyzer = PDDAnalyzer()

In [11]:
pip install langchain-unstructured

Collecting langchain-unstructured
  Downloading langchain_unstructured-0.1.5-py3-none-any.whl.metadata (3.2 kB)
Collecting unstructured-client<0.26.0,>=0.25.0 (from langchain-unstructured)
  Downloading unstructured_client-0.25.9-py3-none-any.whl.metadata (15 kB)
Collecting deepdiff>=6.0 (from unstructured-client<0.26.0,>=0.25.0->langchain-unstructured)
  Downloading deepdiff-8.0.1-py3-none-any.whl.metadata (8.5 kB)
Collecting orderly-set==5.2.2 (from deepdiff>=6.0->unstructured-client<0.26.0,>=0.25.0->langchain-unstructured)
  Downloading orderly_set-5.2.2-py3-none-any.whl.metadata (6.3 kB)
Downloading langchain_unstructured-0.1.5-py3-none-any.whl (7.0 kB)
Downloading unstructured_client-0.25.9-py3-none-any.whl (45 kB)
Downloading deepdiff-8.0.1-py3-none-any.whl (82 kB)
Downloading orderly_set-5.2.2-py3-none-any.whl (11 kB)
Installing collected packages: orderly-set, deepdiff, unstructured-client, langchain-unstructured
  Attempting uninstall: unstructured-client
    Found existing in

In [5]:
from langchain_unstructured import UnstructuredLoader

In [13]:
pip install pillow pi-heif urllib3

Collecting pi-heif
  Downloading pi_heif-0.20.0-cp311-cp311-win_amd64.whl.metadata (6.7 kB)
Downloading pi_heif-0.20.0-cp311-cp311-win_amd64.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 1.7/1.7 MB 9.0 MB/s eta 0:00:00
Installing collected packages: pi-heif
Successfully installed pi-heif-0.20.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
from langchain_unstructured import UnstructuredLoader

In [6]:
# Analyze document
result = analyzer.analyze_pdd(
    file_path=r"C:\Users\817840\OneDrive - Cognizant\Documents\GitHub\ML-AI\Codes\Projects&POC\PGE - Self service chatbot\MRBR followups _Sweden_ PDD_V1.pdf",
    technology="Blue Brism",
    skill_level="intermediate"
)

  loader = UnstructuredFileLoader(file_path)
  from .autonotebook import tqdm as notebook_tqdm
Matplotlib is building the font cache; this may take a moment.
  "summary": LLMChain(
  result = chain({"content": content,



Total Tokens Used: 19539
Total Cost (USD): $0.06045299999999999


In [None]:


# Access results
print("Summary:", result["summary"])
print("\nSteps:", result["steps"])
print("\nTechnical Solution:", result["solution"])
print("\nTime Estimation:", result["time_estimate"])
print("\nCost Estimation:", result["cost_estimate"])

In [12]:
pip install python-magic-bin

Collecting python-magic-bin
  Downloading python_magic_bin-0.4.14-py2.py3-none-win_amd64.whl.metadata (710 bytes)
Downloading python_magic_bin-0.4.14-py2.py3-none-win_amd64.whl (409 kB)
Installing collected packages: python-magic-bin
Successfully installed python-magic-bin-0.4.14
Note: you may need to restart the kernel to use updated packages.


In [14]:
pip install langchain

Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install python-dotenv


Note: you may need to restart the kernel to use updated packages.


In [17]:
pip install unstructured


Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install azure-openai


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement azure-openai (from versions: none)
ERROR: No matching distribution found for azure-openai


In [19]:
pip install python-magic-bin  # for Windows


Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^


In [20]:
pip install pdf2image


Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Note: you may need to restart the kernel to use updated packages.


In [21]:
pip install pytesseract


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install pypdf


Note: you may need to restart the kernel to use updated packages.


In [23]:
pip install docx2txt

Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install "unstructured[all-docs]"


In [None]:
pip install "unstructured-inference"


In [None]:
pip install detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2


In [None]:
pip install layoutparser[layoutmodels,tesseract]

In [None]:
pip install pypdf


In [None]:
pip install python-docx


In [None]:
pip install docx2txt