- To see the images from visualisation, first create a `figures` folder. You can also change that folder name in the visualisation function.

- For visualisation, several chart types are available, and each one expects its own parameters. Check the visualisation function cell for the parameters each chart type reads, and pass them in `params`.

- Data analysis works the same way as visualisation: each analysis type reads its own keys from `params`, so adjust `params` to match the analysis type you want to test.

In [1]:
import os
import io
import uuid
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import fitz  # PyMuPDF
import pdfplumber
import docx
import pytesseract
from PIL import Image
from dotenv import load_dotenv
load_dotenv()


True

In [2]:
# File Upload & Extraction Functions

def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract the text of a PDF, trying pdfplumber first and PyMuPDF second.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        The concatenated page text with leading/trailing whitespace removed.
        pdfplumber pages are joined with newlines; pages that yield no text
        contribute an empty string.

    Raises:
        Exception: Whatever PyMuPDF raises if the fallback also fails.
    """
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)
        return text.strip()
    except Exception:
        # Deliberate best-effort fallback: any pdfplumber failure hands the
        # same bytes to PyMuPDF. The context manager closes the document,
        # which the previous implementation left open.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)
        return text.strip()

def extract_text_from_docx(file_bytes: bytes) -> str:
    """Return the paragraph texts of a Word document joined by newlines.

    Args:
        file_bytes: Raw bytes of the document, opened through python-docx.

    Returns:
        One line per entry of the document's ``paragraphs`` collection.
    """
    document = docx.Document(io.BytesIO(file_bytes))
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(paragraph_texts)

def extract_text_from_image(file_bytes: bytes) -> str:
    """Run Tesseract OCR over an in-memory image and return the text.

    Args:
        file_bytes: Raw bytes of an image file that Pillow can open.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    buffer = io.BytesIO(file_bytes)
    picture = Image.open(buffer)
    return pytesseract.image_to_string(picture)

def clean_text(text: str) -> str:
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped as a side effect of
    splitting on whitespace and re-joining the pieces.
    """
    words = text.split()
    return " ".join(words)

def standardize_tabular(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise column labels to stripped, lower-case, underscore-separated strings.

    The frame is modified in place and also returned, so existing callers that
    rely on either behaviour keep working.

    Args:
        df: Frame whose column labels should be normalised.

    Returns:
        The same ``df`` object with its ``columns`` replaced.
    """
    # Labels are not always strings (e.g. numeric headers in a spreadsheet),
    # so coerce with str() before using string methods.
    df.columns = [
        str(col).strip().lower().replace(" ", "_") for col in df.columns
    ]
    return df

def detect_file_type(filename: str) -> str:
    """Return the lower-cased extension of *filename*, including the dot.

    The result is an empty string when ``os.path.splitext`` finds no
    extension.
    """
    _, extension = os.path.splitext(filename)
    return extension.lower()

def upload_file(file_bytes: bytes, filename: str):
    """Parse an uploaded file into a tabular or text payload.

    The parser is chosen from the extension of *filename*:
    ``.csv``/``.xlsx`` are read with pandas, ``.pdf``, ``.doc``/``.docx``,
    ``.txt`` and common image extensions are converted to cleaned text.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Name of the file; only its extension is used.

    Returns:
        ``{"type": "tabular", "data": <list of row dicts>}`` for CSV/XLSX, or
        ``{"type": "text", "data": <whitespace-normalised text>}`` otherwise.

    Raises:
        ValueError: For an unsupported extension or any parsing failure. The
            message is prefixed with ``"File processing error: "`` and the
            original exception is attached as ``__cause__``.
    """
    ext = detect_file_type(filename)

    try:
        if ext in (".csv", ".xlsx"):
            reader = pd.read_csv if ext == ".csv" else pd.read_excel
            df = standardize_tabular(reader(io.BytesIO(file_bytes)))
            return {"type": "tabular", "data": df.to_dict(orient="records")}

        if ext == ".pdf":
            text = extract_text_from_pdf(file_bytes)
        elif ext in (".doc", ".docx"):
            # NOTE(review): ".doc" is routed through the same python-docx
            # reader as ".docx"; confirm legacy .doc files are really expected.
            text = extract_text_from_docx(file_bytes)
        elif ext == ".txt":
            text = file_bytes.decode("utf-8", errors="ignore")
        elif ext in (".png", ".jpg", ".jpeg", ".bmp", ".tiff"):
            text = extract_text_from_image(file_bytes)
        else:
            raise ValueError("Unsupported file type.")
        return {"type": "text", "data": clean_text(text)}
    except Exception as e:
        # Keep the single ValueError contract for callers, but chain the
        # original exception so its traceback is not lost.
        raise ValueError(f"File processing error: {e}") from e


In [3]:
# LLM Model API Call

# Together chat-completions endpoint and the API key used to authorise calls.
# The key is read from the process environment and is None when unset.
TOGETHER_API_URL = "https://api.together.xyz/v1/chat/completions"
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")

def ask_llm(question: str, context: dict, timeout: float = 60.0) -> str:
    """Send a question plus context to the Together chat-completions API.

    The context is interpolated into the prompt as-is (its string form), so
    large payloads make large prompts.

    Args:
        question: The user's question.
        context: Data to ground the answer; embedded verbatim in the prompt.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.Timeout: If the request does not complete within *timeout*.
    """
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    payload = {
        "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 512,
        "temperature": 0.2,
    }
    headers = {"Authorization": f"Bearer {TOGETHER_API_KEY}"}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(
        TOGETHER_API_URL, json=payload, headers=headers, timeout=timeout
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]


In [4]:
# Data Analysis Functions

def analyze_data(data, analysis_type, params=None):
    """Run one of the supported analyses over row-oriented data.

    Args:
        data: Anything ``pd.DataFrame`` accepts, e.g. a list of row dicts.
        analysis_type: One of ``"describe"``, ``"correlation"``,
            ``"groupby"`` or ``"zscore_anomaly"``.
        params: Options for the chosen analysis; may be omitted for analyses
            that need none.
            - ``groupby``: ``group_col``, ``agg_col`` and optional
              ``agg_func`` (default ``"mean"``).
            - ``zscore_anomaly``: ``col`` and optional ``threshold``
              (default ``3``).

    Returns:
        A dict for ``describe``/``correlation``/``groupby``, a list of row
        dicts for ``zscore_anomaly``, or ``{"error": "Unknown analysis type"}``
        for any other *analysis_type*.
    """
    params = params or {}
    df = pd.DataFrame(data)

    if analysis_type == "describe":
        return df.describe().to_dict()

    if analysis_type == "correlation":
        # Correlation is only defined for numeric columns.
        numeric_df = df.select_dtypes(include="number")
        return numeric_df.corr().to_dict()

    if analysis_type == "groupby":
        group_col = params.get("group_col")
        agg_col = params.get("agg_col")
        agg_func = params.get("agg_func", "mean")
        return df.groupby(group_col)[agg_col].agg(agg_func).to_dict()

    if analysis_type == "zscore_anomaly":
        col = params.get("col")
        threshold = params.get("threshold", 3)
        zscores = (df[col] - df[col].mean()) / df[col].std()
        # A NaN z-score (e.g. zero standard deviation) compares False and is
        # therefore never reported as an anomaly.
        anomalies = df[zscores.abs() > threshold]
        return anomalies.to_dict(orient="records")

    return {"error": "Unknown analysis type"}


In [5]:
# Visualization Functions

def generate_visualization(data, chart_type, params, output_dir="figures"):
    """Render a chart from row-oriented data and save it as a PNG.

    Args:
        data: Anything ``pd.DataFrame`` accepts, e.g. a list of row dicts.
        chart_type: ``"hist"`` or ``"box"`` (read ``params["col"]``),
            ``"bar"`` or ``"scatter"`` (read ``params["x"]`` and
            ``params["y"]``), or ``"heatmap"`` (no params; correlation of the
            numeric columns).
        params: Column selections for the chosen chart type.
        output_dir: Folder the PNG is written to; created if missing.

    Returns:
        The saved file's path with a leading ``/`` and forward slashes, or
        ``None`` when *chart_type* is not supported.
    """
    supported = ("hist", "bar", "scatter", "box", "heatmap")
    if chart_type not in supported:
        return None

    params = params or {}
    df = pd.DataFrame(data)

    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    fig_path = os.path.join(output_dir, f"{uuid.uuid4()}.png")

    fig = plt.figure()
    try:
        if chart_type == "hist":
            sns.histplot(df[params.get("col")])
        elif chart_type == "bar":
            sns.barplot(x=df[params.get("x")], y=df[params.get("y")])
        elif chart_type == "scatter":
            sns.scatterplot(x=df[params.get("x")], y=df[params.get("y")])
        elif chart_type == "box":
            sns.boxplot(y=df[params.get("col")])
        else:  # heatmap
            # Restrict to numeric columns: corr() on text columns raises in
            # recent pandas versions.
            sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)
        fig.savefig(fig_path)
    finally:
        # Close the figure even when plotting fails, so figures do not pile up.
        plt.close(fig)

    return f"/{fig_path.replace(os.sep, '/')}"


In [6]:
# Agent Class

class ChatAgent:
    """In-memory session state: latest parsed upload, tabular rows, and Q&A log.

    Attributes are plain Python objects; nothing is persisted.
    """

    def __init__(self):
        # Most recent upload result, last tabular payload, and Q&A entries.
        self.context = {}
        self.tabular_data = None
        self.history = []

    def store_context(self, result):
        """Remember *result* as the current context.

        When the result is tabular its ``data`` is also kept separately; a
        non-tabular result leaves the previously stored tabular data untouched.
        """
        self.context = result
        if result.get("type") != "tabular":
            return
        self.tabular_data = result["data"]

    def get_context(self):
        """Return the most recently stored upload result."""
        return self.context

    def get_tabular_data(self):
        """Return the last stored tabular data, or None if there is none."""
        return self.tabular_data

    def add_to_history(self, question, answer):
        """Append one question/answer pair to the conversation log."""
        entry = {"question": question, "answer": answer}
        self.history.append(entry)

    def get_history(self):
        """Return the list of recorded question/answer entries."""
        return self.history


In [7]:
# Upload and parse file
filename = "test_data.csv"
agent = ChatAgent()

# Read the raw bytes; upload_file picks its parser from the filename extension.
with open(filename, "rb") as f:
    file_bytes = f.read()

# Keep the parsed result on the agent so later cells can reuse it.
result = upload_file(file_bytes, filename)
agent.store_context(result)
print("File Uploaded & Parsed:", result["type"])


File Uploaded & Parsed: tabular


In [8]:
# LLM Query
# Ask a free-form question about the uploaded file. The full stored context
# (the parsed upload) is embedded in the prompt by ask_llm, and the exchange
# is recorded in the agent's history.

question = "explain in short the key insights dependencies between gender and income"
context = agent.get_context()
answer = ask_llm(question, context)
agent.add_to_history(question, answer)
print("LLM Answer:", answer)


LLM Answer: To analyze the dependency between gender and income based on the provided tabular data, let's first summarize the data by calculating the average income for each gender.

### Data Summary

1. **Female (F) Data Points:**
   - Alice (25, F, 50000)
   - Eve (40, F, 70000)
   - Grace (29, F, 62000)
   - Ivy (27, F, 51000)

2. **Male (M) Data Points:**
   - Bob (30, M, 60000)
   - Charlie (35, M, 55000)
   - David (28, M, 52000)
   - Frank (32, M, 48000)
   - Hank (31, M, 53000)
   - Jack (38, M, 75000)

### Calculating Average Income by Gender

1. **Average Income for Females:**
   - Total Income = 50000 + 70000 + 62000 + 51000 = 233000
   - Number of Females = 4
   - Average Income = 233000 / 4 = 58250

2. **Average Income for Males:**
   - Total Income = 60000 + 55000 + 52000 + 48000 + 53000 + 75000 = 343000
   - Number of Males = 6
   - Average Income = 343000 / 6 = 57166.67

### Insights

- The average income for females is approximately $58,250.
- The average income for ma

In [9]:
# Data Analysis Example
# Flags rows whose `income` z-score exceeds 3 in absolute value; an empty
# list means no row was flagged. Change `analysis_type` and `params` to try
# the other analyses supported by analyze_data.

data = agent.get_tabular_data()
analysis_result = analyze_data(data, analysis_type="zscore_anomaly", params={"col": "income"})
print("Data Analysis Result:", analysis_result)


Data Analysis Result: []


In [None]:
# Visualization Example
# Draws a bar chart of `income` by `gender` from the parsed upload. The PNG is
# written under the `figures` folder and its path is printed.

fig_path = generate_visualization(result["data"], chart_type="bar", params={"x": "gender", "y": "income"})
print("Visualization saved at:", fig_path)


Visualization saved at: /figures/8fd10082-3f18-43cf-aa33-6984372938cc.png
