<a href="https://colab.research.google.com/github/dey-hritam/ML-Projects/blob/main/Multiagent_Architecture_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch beautifulsoup4 requests pandas gradio



In [None]:
!pip install -U langchain-community langchain-openai



In [None]:
!pip install kaggle



In [91]:
import csv
import getpass
import io
import json
import logging
import os
import re
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict, Optional

import gradio as gr
import pandas as pd
import requests
import torch
from google.colab import files
from kaggle.api.kaggle_api_extended import KaggleApi
from langchain_community.retrievers import TavilySearchAPIRetriever
from transformers import GPT2LMHeadModel, GPT2Tokenizer


In [92]:
# Set up logging
# Requests INFO-level output on the root logger (basicConfig is a no-op when the
# root logger already has handlers). `logger` is the module-level logger used by
# the classes and functions defined in the cells below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class Config:
    """Configuration settings for the application.

    Central place for the constants the rest of the notebook reads. The values
    are consumed both as class attributes (e.g. ``Config.MODEL_NAME`` in default
    arguments) and through an instance (``DatasetSearcher.config``).
    """
    # Endpoint queried by DatasetSearcher.search_github_datasets.
    GITHUB_API_URL: str = "https://api.github.com/search/repositories"
    # NOTE(review): not referenced anywhere in the visible notebook code;
    # presumably meant as a retry budget for the API calls. Confirm or remove.
    MAX_RETRIES: int = 3
    # Timeout, in seconds, handed to subprocess.run (Kaggle CLI) and requests.get (GitHub).
    TIMEOUT: int = 10
    # Default cap on the number of dataset links each search method returns.
    DEFAULT_NUM_RESULTS: int = 5
    # Model identifier passed to from_pretrained() by ResearchAgent.
    MODEL_NAME: str = "gpt2"
    # Location analyze_company writes the generated CSV report to.
    CSV_OUTPUT_PATH: Path = Path("/tmp/market_research.csv")

In [93]:
class APIKeyManager:
    """Handles API key setup and validation.

    Both helpers are static: they install credentials for the current session
    (a file for Kaggle, an environment variable for Tavily) and keep no state.
    No secret is ever stored in the notebook source itself.
    """

    @staticmethod
    def setup_kaggle_credentials():
        """Prompt for a ``kaggle.json`` upload and install it in ``/root/.kaggle``.

        Raises:
            FileNotFoundError: if no usable credentials file was uploaded.
            ValueError: if the uploaded file is not a Kaggle token (a JSON
                object containing ``username`` and ``key``).
        """
        print("Please upload your kaggle.json file for authentication:")
        uploaded = files.upload()  # This will prompt you to upload `kaggle.json`

        if 'kaggle.json' in uploaded:
            credentials = uploaded['kaggle.json']
        elif len(uploaded) == 1:
            # Colab appears to rename an upload (e.g. "kaggle (1).json") when a
            # file with the same name already exists, which happens on every
            # re-run of this cell. Accept the sole uploaded file in that case.
            credentials = next(iter(uploaded.values()))
        else:
            raise FileNotFoundError(
                "Expected a single 'kaggle.json' upload, got: "
                f"{sorted(uploaded) or 'nothing'}"
            )

        # Validate before installing so a wrong file is reported here rather
        # than as an opaque authentication failure later on.
        try:
            token = json.loads(credentials)
        except (ValueError, UnicodeDecodeError) as exc:
            raise ValueError("The uploaded file is not valid JSON.") from exc
        if not isinstance(token, dict) or not {'username', 'key'} <= set(token):
            raise ValueError("The uploaded file must contain 'username' and 'key'.")

        kaggle_dir = '/root/.kaggle'
        os.makedirs(kaggle_dir, exist_ok=True)
        kaggle_path = os.path.join(kaggle_dir, 'kaggle.json')

        with open(kaggle_path, 'wb') as f:
            f.write(credentials)

        # Restrict the token to the owner without spawning a shell.
        os.chmod(kaggle_path, 0o600)
        print("Kaggle credentials configured successfully!")

    @staticmethod
    def setup_tavily_api():
        """Make sure ``TAVILY_API_KEY`` is set, asking for it when it is missing.

        A key already present in the environment is kept as-is; otherwise the
        user is prompted with hidden input.

        Raises:
            ValueError: if no key is available after prompting.
        """
        api_key = os.environ.get('TAVILY_API_KEY', '').strip()
        if not api_key:
            # Hidden prompt keeps the key out of the notebook source and output.
            api_key = getpass.getpass("Enter your Tavily API key: ").strip()
        if not api_key:
            raise ValueError("A Tavily API key is required for web search.")

        os.environ['TAVILY_API_KEY'] = api_key
        print("Tavily API key set successfully!")

In [94]:
class ResearchAgent:
    """Handles text generation using GPT-2.

    The tokenizer and model are loaded once in ``__init__`` and the model is
    put in eval mode; ``generate_text`` then runs inference without gradient
    tracking.
    """

    def __init__(self, model_name: str = Config.MODEL_NAME):
        """Load the tokenizer and weights for ``model_name``.

        Raises:
            Exception: any loading failure is logged and re-raised unchanged.
        """
        logger.info(f"Loading {model_name} model...")
        try:
            self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
            self.model = GPT2LMHeadModel.from_pretrained(model_name)
            self.model.eval()
            logger.info("Model loaded successfully!")
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise

    def generate_text(self, prompt: str, max_length: int = 150) -> str:
        """Continue ``prompt`` with sampled text.

        Args:
            prompt: Text the model should continue.
            max_length: Upper bound passed to ``generate``; it counts the
                prompt tokens as well as the newly generated ones.

        Returns:
            The decoded first output sequence, or a string starting with
            "Error generating text:" when generation fails (failures are
            logged, not raised). Because sampling is enabled the output is not
            deterministic between calls.
        """
        try:
            # Calling the tokenizer (rather than .encode) also yields the
            # attention mask, which generate() needs to tell real tokens from
            # padding when pad_token_id is the EOS id.
            encoded = self.tokenizer(prompt, return_tensors='pt', truncation=True)
            with torch.no_grad():
                outputs = self.model.generate(
                    **encoded,
                    max_length=max_length,
                    num_return_sequences=1,
                    # temperature only takes effect when sampling is enabled;
                    # without do_sample=True decoding is greedy and it is ignored.
                    do_sample=True,
                    temperature=0.7,
                    pad_token_id=self.tokenizer.eos_token_id,
                    no_repeat_ngram_size=3
                )
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            logger.error(f"Text generation failed: {e}")
            return f"Error generating text: {str(e)}"

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse every run of whitespace to one space and trim both ends."""
        return re.sub(r'\s+', ' ', text).strip()

In [95]:
class DatasetSearcher:
    """Handles dataset searches across different platforms.

    Every search method is best-effort: failures are logged and reported to
    the caller as an empty list instead of an exception.
    """

    def __init__(self):
        self.config = Config()

    @staticmethod
    def _parse_kaggle_csv(output: str, num_results: int) -> List[str]:
        """Turn the CSV table printed by the Kaggle CLI into dataset page URLs."""
        # The CLI may print notices before the table, so start at the header
        # row, whose first column is "ref".
        header = re.search(r'^ref,', output, flags=re.MULTILINE)
        if header is None:
            return []

        rows = csv.DictReader(io.StringIO(output[header.start():]))
        # "ref" already has the form "<owner>/<dataset-slug>".
        refs = [(row.get('ref') or '').strip() for row in rows]
        return [
            f"https://www.kaggle.com/datasets/{ref}" for ref in refs if ref
        ][:num_results]

    def search_kaggle_datasets(self, query: str, num_results: int = Config.DEFAULT_NUM_RESULTS) -> List[str]:
        """Search Kaggle through its CLI and return up to ``num_results`` dataset URLs.

        Returns an empty list when the CLI fails, times out, or finds nothing.
        """
        try:
            # The CLI offers CSV as its machine-readable listing format.
            result = subprocess.run(
                ["kaggle", "datasets", "list", "-s", query, "--csv"],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                timeout=self.config.TIMEOUT
            )
            if result.stderr:
                logger.warning(f"Kaggle API warning: {result.stderr.decode('utf-8', errors='replace')}")
            if result.returncode != 0:
                logger.error(f"Kaggle CLI exited with status {result.returncode}")
                return []

            output = result.stdout.decode('utf-8', errors='replace')
            if not output.strip():
                return []

            return self._parse_kaggle_csv(output, num_results)
        except subprocess.TimeoutExpired:
            logger.error("Kaggle API request timed out")
            return []
        except Exception as e:
            logger.error(f"Kaggle search error: {e}")
            return []

    def search_github_datasets(self, query: str, num_results: int = Config.DEFAULT_NUM_RESULTS) -> List[str]:
        """Search GitHub repositories whose README mentions ``query`` and "dataset".

        Returns up to ``num_results`` repository URLs ordered by stars, or an
        empty list when the request fails or the response is not valid JSON.
        """
        try:
            # Let requests build the query string so spaces and characters
            # such as "&" or "#" in the query are escaped correctly.
            response = requests.get(
                self.config.GITHUB_API_URL,
                params={
                    'q': f"{query} dataset in:readme",
                    'sort': 'stars',
                    'order': 'desc',
                },
                timeout=self.config.TIMEOUT,
                headers={'Accept': 'application/vnd.github.v3+json'}
            )
            response.raise_for_status()
            items = response.json().get('items') or []
            return [repo['html_url'] for repo in items[:num_results]]
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            logger.error(f"GitHub API error: {e}")
            return []

In [96]:
class MarketResearchSystem:
    """Main system for conducting market research.

    Combines the GPT-2 text generator, the Kaggle/GitHub dataset search and
    the Tavily web retriever into a single analysis entry point.
    """

    def __init__(self):
        """Create all collaborators and authenticate against Kaggle.

        Raises:
            Exception: any failure (model loading, Kaggle authentication, a
                missing ``TAVILY_API_KEY``) is logged and re-raised.
        """
        try:
            self.agent = ResearchAgent()
            self.dataset_searcher = DatasetSearcher()
            self.api = KaggleApi()
            self.api.authenticate()
            self.tavily_client = TavilySearchAPIRetriever(
                api_key=os.environ['TAVILY_API_KEY']
            )
        except Exception as e:
            logger.error(f"Failed to initialize MarketResearchSystem: {e}")
            raise

    def search_web(self, query: str, num_results: int = 3) -> List[str]:
        """Return the text of up to ``num_results`` web documents for ``query``.

        Failures are logged and reported as an empty list.
        """
        try:
            # Retrievers are called through invoke() and return Document
            # objects whose text lives in `page_content`.
            documents = self.tavily_client.invoke(query)
            return [doc.page_content for doc in documents[:num_results]]
        except Exception as e:
            logger.error(f"Web search error: {e}")
            return []

    def analyze_industry(self, company_name: str) -> Dict:
        """Analyze company and generate insights.

        Returns:
            A dict with the keys ``company_name``, ``industry_analysis``,
            ``ai_use_cases``, ``relevant_datasets`` and ``web_search_results``.

        Raises:
            Exception: unexpected failures are logged and re-raised.
        """
        try:
            research_prompt = f"Analyze the industry and market position of {company_name}. Consider their main products, services, and competitive advantages."
            ai_use_cases_prompt = f"List potential AI and machine learning use cases for {company_name} to improve their business operations and customer experience."

            analysis_results = {
                'company_name': company_name,
                'industry_analysis': self.agent.clean_text(
                    self.agent.generate_text(research_prompt)
                ),
                'ai_use_cases': self.agent.clean_text(
                    self.agent.generate_text(ai_use_cases_prompt)
                ),
                # GitHub is only queried when Kaggle returns nothing.
                'relevant_datasets': (
                    self.dataset_searcher.search_kaggle_datasets(company_name) or
                    self.dataset_searcher.search_github_datasets(company_name)
                ),
                'web_search_results': self.search_web(company_name)
            }
            return analysis_results
        except Exception as e:
            logger.error(f"Analysis failed for {company_name}: {e}")
            raise


In [105]:
def save_as_csv(results: dict, file_path: str) -> None:
    """Save analysis results to CSV file.

    The file has one row per use case ("Industry Analysis", "AI Use Cases")
    with the columns "Use Cases", "Description" and "Reference". The reference
    column holds the first dataset links from ``results['relevant_datasets']``,
    padded with "No datasets found" when there are fewer links than rows.

    Args:
        results: Analysis dict; must contain ``industry_analysis`` and
            ``ai_use_cases``. ``relevant_datasets`` may be missing, empty or None.
        file_path: Destination of the CSV file.

    Raises:
        KeyError: if one of the required keys is missing from ``results``.
    """
    use_cases = ["Industry Analysis", "AI Use Cases"]
    descriptions = [results['industry_analysis'], results['ai_use_cases']]

    # Work on a copy so the caller's list is never modified, then pad and trim
    # to exactly one reference per row.
    placeholder = "No datasets found"
    found = list(results.get('relevant_datasets') or [])
    references = (found + [placeholder] * len(use_cases))[:len(use_cases)]

    # Create DataFrame
    df = pd.DataFrame({
        "Use Cases": use_cases,
        "Description": descriptions,
        "Reference": references,
    })
    df.to_csv(file_path, index=False)
    print(f"CSV file saved successfully at {file_path}")

In [101]:
def _get_market_research_system():
    """Return the shared MarketResearchSystem, creating it on first use."""
    # Building the system loads GPT-2 and authenticates against Kaggle, so it
    # is done once and reused instead of being repeated for every request.
    system = getattr(_get_market_research_system, "_instance", None)
    if system is None:
        system = MarketResearchSystem()
        _get_market_research_system._instance = system
    return system


def analyze_company(company_name: str) -> str:
    """Main function for the Gradio interface.

    Runs the analysis for ``company_name`` and writes the report to
    ``Config.CSV_OUTPUT_PATH``.

    Returns:
        The path of the generated CSV file.

    Raises:
        gr.Error: when the name is empty or the analysis fails. The output
            component is a file, so a returned message would be treated as a
            file path; raising lets Gradio show the message to the user.
    """
    if not company_name or not company_name.strip():
        raise gr.Error("Please enter a company name.")

    try:
        system = _get_market_research_system()
        results = system.analyze_industry(company_name.strip())
        csv_path = str(Config.CSV_OUTPUT_PATH)
        save_as_csv(results, csv_path)
        return csv_path
    except Exception as e:
        logger.error(f"Analysis failed: {e}")
        raise gr.Error(f"An error occurred: {str(e)}") from e

In [106]:
def main():
    """Initialize and launch the Gradio interface.

    Collects the Kaggle and Tavily credentials first, then builds a
    single-textbox interface around ``analyze_company`` and launches it with a
    public share link.

    Raises:
        Exception: any setup or launch failure is logged and re-raised.
    """
    try:
        # Set up API credentials
        APIKeyManager.setup_kaggle_credentials()
        APIKeyManager.setup_tavily_api()

        # Create and launch Gradio interface
        iface = gr.Interface(
            fn=analyze_company,
            inputs=gr.Textbox(
                label="Enter Industry name",
                placeholder="e.g., Steel Industry"
            ),
            outputs=gr.File(label="Download Analysis as CSV"),
            title="AI Market Research & Use Case Generator",
            description="Generate AI/ML use cases and market analysis for any company using GPT-2, Kaggle, GitHub dataset search, and Tavily web search.",
            examples=[["Steel Industry"], ["Food Industry"], ["Silicon Industry"]],
            theme=gr.themes.Base()
        )
        # share=True exposes the app through a temporary public URL.
        iface.launch(share=True)
    except Exception as e:
        logger.error(f"Failed to launch application: {e}")
        raise

if __name__ == "__main__":
    main()

Please upload your kaggle.json file for authentication:


Saving kaggle.json to kaggle.json
Kaggle credentials configured successfully!
Tavily API key set successfully!
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9fda6e84424d5d9f11.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
