## DA Agent

In [4]:
import json
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from groq import Groq

In [3]:
# Copy the values returned by `userdata.get` into process environment variables.
# NOTE(review): `userdata` is not imported in any visible cell -- presumably
# `from google.colab import userdata`; confirm, otherwise this raises NameError.
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [4]:
# Read the keys back from the environment; a missing variable raises KeyError here.
# NOTE(review): these two names are not referenced again in the visible cells --
# the Groq/OpenAI clients are built without explicit keys and presumably read
# the environment variables themselves; confirm before removing.
GROQ_API_KEY = os.environ["GROQ_API_KEY"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [5]:
class DataAnalysisAgent:
    """Tool-calling data analysis agent backed by a Groq chat model.

    The agent holds one dataset (a pandas ``DataFrame``) and exposes three
    analysis functions -- ``analyze``, ``generate_report`` and
    ``plot_distribution`` -- to the model as tools. ``run_conversation``
    sends the user's question, runs whichever tools the model requests and
    returns the model's final text answer.
    """

    # Tool (function) schemas advertised to the model in ``run_conversation``.
    _TOOLS = [
        {
            "type": "function",
            "function": {
                "name": "analyze",
                "description": "Performs a detailed analysis of the dataset stored in the class. It generates statistical summaries, missing value counts, unique value counts, skewness, kurtosis, and correlations for numeric columns, providing key insights for exploratory data analysis (EDA).",
                "parameters": {
                    "type": "object",
                    "properties": {},  # No additional parameters required
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "generate_report",
                "description": "Generates a comprehensive analysis report of the dataset stored in the class. The report includes dataset overview, column types, and detailed analysis results such as summary statistics, missing values, unique values, skewness, kurtosis, and correlations.",
                "parameters": {
                    "type": "object",
                    "properties": {},  # No additional parameters required
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "plot_distribution",
                "description": "Plots the distribution of a specified column in the dataset using a histogram with a KDE overlay. This is useful for visualizing the frequency distribution and the underlying data shape.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "column": {
                            "type": "string",
                            "description": "The name of the column for which the distribution is to be plotted.",
                        },
                    },
                    "required": ["column"],
                },
            },
        },
    ]

    def __init__(self, model):
        """Create an agent that talks to the Groq chat model named ``model``."""
        self.data = None
        self.data_type = None
        self.analysis_results = {}
        self.supported_types = ["tabular", "json"]
        self.model = model
        self.client = Groq()

    def load_data(self, data_path: str):
        """Load a dataset from ``data_path`` into ``self.data``.

        Supported extensions (matched case-insensitively): ``.csv``,
        ``.xlsx``/``.xls`` (both stored as ``"tabular"``) and ``.json``
        (stored as ``"json"``).

        Raises:
            Exception: if the extension is unsupported or the underlying
                pandas reader fails; the original error is chained as the
                cause.
        """
        try:
            path = Path(data_path)
            suffix = path.suffix.lower()

            if suffix == ".csv":
                self.data = pd.read_csv(path)
                self.data_type = "tabular"
            elif suffix in (".xlsx", ".xls"):
                # Path.suffix always includes the leading dot, so compare
                # against ".xls" rather than "xls".
                self.data = pd.read_excel(path)
                self.data_type = "tabular"
            elif suffix == ".json":
                self.data = pd.read_json(path)
                self.data_type = "json"
            else:
                raise Exception("File type not supported yet")

        except Exception as e:
            raise Exception(f"Error loading data: {str(e)}") from e

    def _require_data(self):
        """Raise a clear error when no dataset has been loaded yet."""
        if self.data is None:
            raise Exception("No data loaded. Call load_data() first.")

    def plot_distribution(self, column):
        """Show a histogram with a KDE overlay for ``column``.

        Raises:
            Exception: if no data is loaded or ``column`` is not a column of
                the dataset.
        """
        self._require_data()
        if column not in self.data.columns:
            raise Exception("Column not present in the dataset")

        sns.histplot(self.data[column], kde=True)
        plt.title(f"Distribution of {column}")
        plt.show()

    def analyze(self):
        """Compute exploratory statistics for the loaded dataset.

        Returns:
            A dict with ``summary_stats``, ``missing_values``,
            ``unique_values``, ``skewness`` and ``kurtosis``; a
            ``correlations`` entry is added when there is more than one
            numeric column. The same entries are merged into
            ``self.analysis_results``.

        Raises:
            Exception: if no data is loaded.
        """
        self._require_data()
        numeric_cols = self.data.select_dtypes(include=[np.number]).columns
        # Skewness and kurtosis are only defined for numeric data; calling
        # them on the whole frame raises TypeError for text columns on
        # pandas >= 2.0.
        numeric_data = self.data[numeric_cols]

        analysis = {
            'summary_stats': self.data.describe(include="all").to_dict(),
            'missing_values': self.data.isnull().sum().to_dict(),
            'unique_values': {col: self.data[col].nunique() for col in self.data.columns},
            'skewness': numeric_data.skew().to_dict(),
            'kurtosis': numeric_data.kurtosis().to_dict(),
        }
        if len(numeric_cols) > 1:
            analysis['correlations'] = numeric_data.corr().to_dict()

        self.analysis_results.update(analysis)
        return analysis

    def _format_results(self, results, indent: int = 0) -> str:
        """Format analysis results for the report.

        Nested dicts are rendered as an indented bullet list, two spaces per
        nesting level.
        """
        formatted = []
        for key, value in results.items():
            if isinstance(value, dict):
                formatted.append(f"{'  ' * indent}- {key}:")
                formatted.append(self._format_results(value, indent + 1))
            else:
                formatted.append(f"{'  ' * indent}- {key}: {value}")
        return "\n".join(formatted)

    def generate_report(self) -> str:
        """Generate a comprehensive analysis report.

        The report lists the dataset overview (plus per-column dtypes for
        tabular data) followed by every entry currently stored in
        ``self.analysis_results``.

        Raises:
            Exception: if no data is loaded.
        """
        self._require_data()

        report = [
            "# Data Analysis Report",
            "\n## Dataset Overview",
            f"- Data Type: {self.data_type}",
            f"- Dataset Shape: {self.data.shape}",
        ]

        # Add type-specific metadata
        if self.data_type == 'tabular':
            report.extend([
                f"- Rows: {self.data.shape[0]}",
                f"- Columns: {self.data.shape[1]}",
                "\n### Column Types:",
                # One bullet per column so the header is never left empty.
                *[f"- {column}: {dtype}" for column, dtype in self.data.dtypes.items()],
            ])

        # Add analysis results
        report.append("\n## Analysis Results")
        for analysis_type, results in self.analysis_results.items():
            report.extend([
                f"\n### {analysis_type.title()}",
                self._format_results(results),
            ])

        return "\n".join(report)

    def _execute_tool(self, function_name, function_args) -> str:
        """Run the tool requested by the model and return its output as text."""
        if function_name == "analyze":
            return json.dumps(self.analyze(), default=str)
        if function_name == "generate_report":
            return self.generate_report()
        if function_name == "plot_distribution":
            column = function_args.get("column")
            if not column:
                raise Exception("Missing 'column' argument for plot_distribution.")
            self.plot_distribution(column=column)
            # plot_distribution returns None; the tool message needs a
            # string, so report what was displayed instead.
            return f"Displayed the distribution plot for column '{column}'."
        raise Exception(f"Function {function_name} is not available.")

    def run_conversation(self, query):
        """Answer ``query`` about the loaded data, letting the model call tools.

        Args:
            query: The user's natural-language question.

        Returns:
            The model's final answer text.

        Raises:
            Exception: if the model requests an unknown tool, omits a
                required argument, or a tool itself fails.
        """
        messages = [
            {
                "role": "system",
                "content": "You are a data analysis agent. You have a few functions available for analysis. You take in the data, decide which function to use and do you best to answer the user's query about the data"
            },
            {
                "role": "user",
                "content": query,
            }
        ]
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            stream=False,
            tools=self._TOOLS,
            tool_choice="auto",
            max_tokens=4096
        )
        response_message = response.choices[0].message
        tool_calls = response_message.tool_calls
        if not tool_calls:
            # The model answered directly; a second request would only
            # repeat the same question.
            return response_message.content

        messages.append(response_message)
        for tool_call in tool_calls:
            function_name = tool_call.function.name
            # Parameterless tools may arrive with empty or null arguments.
            raw_args = tool_call.function.arguments
            function_args = (json.loads(raw_args) if raw_args else {}) or {}

            messages.append(
                {
                    "tool_call_id": tool_call.id,
                    "role": "tool",
                    "name": function_name,
                    "content": self._execute_tool(function_name, function_args),
                }
            )

        second_response = self.client.chat.completions.create(
            model=self.model,
            messages=messages
        )
        return second_response.choices[0].message.content

In [6]:
# Build the agent for the given Groq model id, then load the CSV into
# `agent.data` (load_data marks .csv files as "tabular").
agent = DataAnalysisAgent(model="llama3-groq-70b-8192-tool-use-preview")
agent.load_data("/content/sample_data/california_housing_train.csv")

In [7]:
# Ask for summary statistics of one column; run_conversation returns the
# model's final answer as a string (echoed below as the cell output).
agent.run_conversation(query = "What are the mean, median, and standard deviation of the median_house_value column?")

'The mean of the median_house_value is 207,300.91, the median is 180,400, and the standard deviation is 115,983.76.'

In [8]:
# Ask about correlations; the `analyze` tool includes a 'correlations' entry
# when the dataset has more than one numeric column.
agent.run_conversation(query = "Is there any negative correlation between numerical columns? If so, which ones?")

'There are a few negative correlations worth noting. For instance, there\'s a negative correlation between "housing_median_age" and "total_bedrooms" with a value of -0.320434. Also, "housing_median_age" has a negative correlation with "total_rooms" at -0.360984.'

In [9]:
# NOTE(review): none of the registered tools (analyze, generate_report,
# plot_distribution) computes per-category averages, so the answer below is
# not backed by a tool result -- treat it as unverified. Also confirm that the
# loaded CSV actually has an `ocean_proximity` column.
agent.run_conversation(query = "What is the average median_house_value for each category of ocean_proximity?")

"The average median house value for each category of ocean proximity is as follows: 'NEAR BAY' with $604500, 'NEAR OCEAN' with $679000, 'INLAND' with $291000, and 'ISLAND' with $450000."

In [10]:
# NOTE(review): as with the previous per-category question, no registered tool
# performs a group-by aggregation, so the model's answer is unverified.
agent.run_conversation(query = "Which ocean_proximity category has the highest average median_income?")

"The ocean_proximity category with the highest average median_income is 'NEAR_BAY', with an average median income of $4.5."



---



## RAG

In [11]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

In [13]:
class RAG:
    """Retrieval-augmented question answering over PDFs plus dataset statistics.

    PDF pages from ``pdf_directory`` are split into chunks, embedded with
    ``OpenAIEmbeddings`` and indexed in a FAISS vector store. Each query
    retrieves context from that store and combines it with the output of
    ``data_agent.analyze()`` in a single prompt for a Groq chat model.
    """

    # Agent that owns the dataset and produces the analysis results.
    data_agent: DataAnalysisAgent
    # Groq model name passed to ChatGroq.
    model_name: str
    # Directory handed to PyPDFDirectoryLoader.
    pdf_directory: str

    def __init__(
        self,
        data_agent: DataAnalysisAgent,
        model_name: str,
        pdf_directory: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        """Store the configuration; the vector store is built on first use.

        Args:
            data_agent: Agent whose ``analyze`` output is added to the prompt.
            model_name: Groq chat model used to answer questions.
            pdf_directory: Directory containing the PDF documents to index.
            chunk_size: Chunk size passed to the text splitter.
            chunk_overlap: Chunk overlap passed to the text splitter.
        """
        self.data_agent = data_agent
        self.model_name = model_name
        self.pdf_directory = pdf_directory
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.vector_store = None

    def load_and_process_pdfs(self):
        """Load the PDFs, split them into chunks and build the FAISS index.

        Raises:
            ValueError: if the directory yields no text chunks, since a
                vector store cannot be built from an empty document list.
        """
        loader = PyPDFDirectoryLoader(self.pdf_directory)
        documents = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap
        )
        splits = text_splitter.split_documents(documents)
        if not splits:
            # Fail with an actionable message instead of an opaque error
            # from inside the vector-store construction.
            raise ValueError(
                f"No PDF content found in {self.pdf_directory!r}; cannot build the vector store."
            )

        embeddings = OpenAIEmbeddings()
        self.vector_store = FAISS.from_documents(splits, embeddings)

    def create_chain(self):
        """Build the retrieval chain: FAISS retriever feeding a stuff-documents chain.

        The vector store is built first if it does not exist yet, so this
        method can be called before ``query``.
        """
        if self.vector_store is None:
            self.load_and_process_pdfs()

        llm = ChatGroq(
            model_name=self.model_name
        )

        prompt = ChatPromptTemplate.from_template("""
        Context: {context}
        Data Analysis Results: {analysis_results}

        Question: {input}

        Provide an answer using both the context from the documents and the data analysis insights.
        If the answer is not provided in the analysis and the context, then simply say so and suggest the best possible solution.
        """)

        document_chain = create_stuff_documents_chain(
            llm=llm,
            prompt=prompt
        )

        retrieval_chain = create_retrieval_chain(
            self.vector_store.as_retriever(),
            document_chain
        )

        return retrieval_chain

    def query(self, question: str) -> str:
        """Answer ``question`` using retrieved PDF context and fresh analysis results.

        Returns:
            The ``"answer"`` field of the retrieval chain's response.
        """
        if self.vector_store is None:
            self.load_and_process_pdfs()

        analysis_results = self.data_agent.analyze()
        chain = self.create_chain()

        response = chain.invoke({
            "input": question,
            # default=str stringifies values json cannot encode natively.
            "analysis_results": json.dumps(analysis_results, default=str)
        })

        return response["answer"]

    def get_data_distribution(self, column: str):
        """Delegate to the data agent's ``plot_distribution`` for ``column``."""
        return self.data_agent.plot_distribution(column)

    def get_full_report(self) -> str:
        """Return the data agent's analysis report."""
        return self.data_agent.generate_report()

In [16]:
# Wrap the existing agent in a RAG helper; the PDF index is not built here
# (RAG.query builds it on first use).
rag = RAG(data_agent=agent, model_name="llama-3.1-8b-instant", pdf_directory="/content/pdfs")

In [20]:
# Combined question: the statistics part is answered from the analysis results,
# the modelling part from the retrieved PDF context.
answer = rag.query("What are the mean, median, and standard deviation of the median_house_value column? Also, what is the best model for price prediction? How does the measures f central tendency affect the model selection?")

In [21]:
# Print the answer string returned by rag.query.
print(answer)

Based on the provided data analysis results, we can answer the questions as follows:

1. What are the mean, median, and standard deviation of the median_house_value column?

From the data analysis results, we can see that the median_house_value column has the following statistics:

- Mean: 207300.91235294117
- Median: 180400.0
- Standard Deviation: 115983.76438720913

2. What is the best model for price prediction?

According to the text, the best model for price prediction is the Random Forest model, which has an MSE (Mean Squared Error) of 0.290. This is based on the performance of each model on the test set, as shown in Table 3.

3. How do the measures of central tendency affect the model selection?

The measures of central tendency (mean, median, and standard deviation) can provide insights into the distribution of the data and help in model selection. In this case, the median_house_value column has a high standard deviation (115983.76438720913), indicating that the data is highly 