<a href="https://colab.research.google.com/github/edquestofficial/gen-ai-projects/blob/main/Akash/LeiDataGen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install llama-index-llms-openai
! pip install llama-index-vector-stores-pinecone
! pip install llama-index-readers-wikipedia
! pip install llama-index-embeddings-openai
! pip install arize-phoenix
! pip install llama-index-callbacks-arize-phoenix
! pip install llama-index-callbacks-arize-phoenix

# Visualize query pipeline
! pip install pyvis

Collecting llama-index-llms-openai
  Downloading llama_index_llms_openai-0.1.16-py3-none-any.whl (10 kB)
Collecting llama-index-core<0.11.0,>=0.10.24 (from llama-index-llms-openai)
  Downloading llama_index_core-0.10.34-py3-none-any.whl (15.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json (from llama-index-core<0.11.0,>=0.10.24->llama-index-llms-openai)
  Downloading dataclasses_json-0.6.5-py3-none-any.whl (28 kB)
Collecting deprecated>=1.2.9.3 (from llama-index-core<0.11.0,>=0.10.24->llama-index-llms-openai)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting dirtyjson<2.0.0,>=1.0.8 (from llama-index-core<0.11.0,>=0.10.24->llama-index-llms-openai)
  Downloading dirtyjson-1.0.8-py3-none-any.whl (25 kB)
Collecting httpx (from llama-index-core<0.11.0,>=0.10.24->llama-index-llms-openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━

In [None]:
from google.colab import drive
# Colab-only: mount Google Drive at /content/drive so the CSV dump under
# /content/drive/MyDrive/Dump/ can be read by the loading cell below.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
from pathlib import Path

# Read every CSV in the Drive dump folder, in sorted path order, into `dfs`.
data_dir = Path("/content/drive/MyDrive/Dump/")
csv_files = sorted(data_dir.glob("*.csv"))
dfs = []
for csv_file in csv_files:
    print(f"processing file: {csv_file}")
    try:
        df = pd.read_csv(csv_file)
    except Exception as err:
        # Best-effort load: report the unreadable file and continue with the rest.
        print(f"Error parsing {csv_file}: {str(err)}")
    else:
        dfs.append(df)

processing file: /content/drive/MyDrive/Dump/citidata.csv
processing file: /content/drive/MyDrive/Dump/citilegalnamedata.csv


In [None]:
# Preview the first loaded DataFrame (files were loaded in sorted path order)
# to inspect its column names before summarising it.
dfs[0].head()

Unnamed: 0.1,Unnamed: 0,LEI,Entity.LegalName,Entity.LegalName.xmllang,Entity.LegalAddress.xmllang,Entity.LegalAddress.FirstAddressLine,Entity.LegalAddress.MailRouting,Entity.LegalAddress.City,Entity.LegalAddress.Region,Entity.LegalAddress.Country,...,Registration.LastUpdateDate,Registration.RegistrationStatus,Registration.NextRenewalDate,Registration.ManagingLOU,Registration.ValidationSources,Registration.ValidationAuthority.ValidationAuthorityID,Registration.ValidationAuthority.ValidationAuthorityEntityID,Registration.OtherValidationAuthorities.OtherValidationAuthority.1.ValidationAuthorityID,Registration.OtherValidationAuthorities.OtherValidationAuthority.1.ValidationAuthorityEntityID,ConformityFlag
0,1518766,6SHGI4ZSSLCXXQSBB395,CITIGROUP INC.,en,en,CORPORATION TRUST CENTER 1209 ORANGE ST,c/o THE CORPORATION TRUST COMPANY,WILMINGTON,US-DE,US,...,2024-04-21T15:18:19+00:00,ISSUED,2025-05-27T14:10:35+00:00,529900T8BM49AURSDO55,FULLY_CORROBORATED,RA000602,2154254,RA000628,1688862.0,CONFORMING


In [None]:
# Directory where the per-table summaries (TableInfo JSON files) are cached.
tableinfo_dir = "LEI_TableInfo"
# Create it idempotently: a plain `mkdir` errors on every re-run once the
# directory exists, whereas exist_ok=True makes re-running this cell a no-op.
Path(tableinfo_dir).mkdir(parents=True, exist_ok=True)

In [None]:
from llama_index.core.bridge.pydantic import BaseModel, Field

# Structured-output schema for one table summary. It is passed as `output_cls`
# to the LLM program below and round-tripped through JSON files in
# `tableinfo_dir`.
# NOTE(review): the class docstring and the Field descriptions are presumably
# part of the schema shown to the LLM, so treat them as prompt text, not as
# free-form documentation -- confirm before rewording them.
class TableInfo(BaseModel):
    """Information regarding a structured table."""

    # Used later both as part of the cache file name and as the SQL table name.
    table_name: str = Field(
        ..., description="table name (must be underscores and NO spaces)"
    )
    # Used later as the context string attached to the table's schema object.
    table_summary: str = Field(
        ..., description="short, concise summary/caption of the table"
    )

In [None]:
# Prompt template for the table-summary program. The two placeholders are
# filled on every call: {exclude_table_name_list} with the names already taken
# and {table_str} with a CSV rendering of the table's first rows.
prompt_str = """\
Give me a summary of the table with the following json format.

- Table name must be unique to the table and describe it while being concise.
- Do not output a Generic table name (e.g. table, my_table)

Do not make table name one of the following: {exclude_table_name_list}

Table: {table_str}
Summary:
"""

In [None]:
import os
from getpass import getpass

# SECURITY: never commit API keys to a notebook. The key that used to be
# hardcoded here has been published with the file and must be revoked/rotated.
# Use a key already present in the environment; otherwise prompt for it
# without echoing it into the notebook output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from llama_index.core.program import LLMTextCompletionProgram
from llama_index.llms.openai import OpenAI


# LLM program that fills `prompt_str` and parses the completion into a
# TableInfo instance.
program = LLMTextCompletionProgram.from_defaults(
    output_cls=TableInfo,
    prompt_template_str=prompt_str,
    llm=OpenAI(model="gpt-3.5-turbo"),
)

In [None]:
import json

def _get_tableinfo_with_index(idx: int) -> "TableInfo | None":
  """Load the cached TableInfo for the table at position ``idx``, if any.

  Looks in ``tableinfo_dir`` for files whose name starts with ``"{idx}_"``.

  Args:
    idx: Position of the table in ``dfs``.

  Returns:
    The TableInfo parsed from the single matching file, or ``None`` when no
    file matches.

  Raises:
    ValueError: If more than one file matches ``idx``.
  """
  matches = list(Path(tableinfo_dir).glob(f"{idx}_*"))
  if not matches:
    return None
  if len(matches) > 1:
    # Report the materialised list: the glob generator has already been
    # consumed, so listing it again would always print an empty list.
    raise ValueError(
        f"More than one file matching index: {matches}"
    )
  return TableInfo.parse_file(matches[0])

# Collect one TableInfo per DataFrame, reusing cached summaries when present
# and asking the LLM (with unique-name retries) otherwise.
table_names = set()
table_infos = []

for idx, df in enumerate(dfs):
  table_info = _get_tableinfo_with_index(idx)
  if table_info:
    # Cached summary: reserve its name so later generated tables cannot reuse it.
    # (It is appended once, at the bottom of the loop body.)
    table_names.add(table_info.table_name)
  else:
    # The sample sent to the LLM does not change between retries.
    df_str = df.head(10).to_csv()
    while True:
      table_info = program(
          table_str=df_str,
          exclude_table_name_list=str(list(table_names)),
      )
      table_name = table_info.table_name
      print(f"Processed table: {table_name}")
      if table_name not in table_names:
        table_names.add(table_name)
        break
      print(f"Table name {table_name} already exist, trying again.")

    # Persist the new summary; the context manager guarantees the file is
    # flushed and closed before a later cell reads it back.
    out_file = f"{tableinfo_dir}/{idx}_{table_name}.json"
    with open(out_file, "w") as out_fp:
      json.dump(table_info.dict(), out_fp)
  table_infos.append(table_info)

Processed table: Entity_Registration_Info
Processed table: Relationship_Info_Table


In [None]:
from sqlalchemy import (
    create_engine,
    MetaData,
    Table,
    Column,
    String,
    Integer,
)

In [None]:
import re

# Function to create a sanitized column name
def sanitize_column_name(col_name: str):
  """Return ``col_name`` with each run of non-word characters replaced by one underscore."""
  non_word_run = re.compile(r"\W+")
  return non_word_run.sub("_", col_name)

# Function to create a table from a DataFrame using SQLAlchemy
def create_table_from_dataframe(
    df: pd.DataFrame,
    table_name: str,
    engine,
    metadata_obj):
  """Create ``table_name`` on ``engine`` and load every row of ``df`` into it.

  Column names are passed through ``sanitize_column_name`` first, so names
  such as ``Entity.LegalName`` become ``Entity_LegalName``. Numeric pandas
  dtypes are declared as Integer columns and everything else as String.

  Args:
    df: Source data; the caller's DataFrame is not modified.
    table_name: Name of the table to create.
    engine: SQLAlchemy engine the table is created on.
    metadata_obj: SQLAlchemy MetaData the table is registered with.
  """
  # Sanitize column names and actually apply them, so both the table schema
  # and the inserted row dicts use the sanitized names.
  sanitized_columns = {col: sanitize_column_name(col) for col in df.columns}
  df = df.rename(columns=sanitized_columns)

  # Dynamically create columns based on DataFrame columns and data types.
  # Numeric dtypes map to Integer; object/string and other dtypes map to String.
  columns = [
      Column(col, Integer if pd.api.types.is_numeric_dtype(dtype) else String)
      for col, dtype in zip(df.columns, df.dtypes)
  ]

  # Create a Table with the defined columns
  table = Table(table_name, metadata_obj, *columns)

  # Create the table in database
  metadata_obj.create_all(engine)

  # Insert data from DataFrame into the table, committing once at the end
  with engine.connect() as conn:
    for _, row in df.iterrows():
      insert_stmt = table.insert().values(**row.to_dict())
      conn.execute(insert_stmt)
    conn.commit()

# In-memory SQLite database holding one table per loaded DataFrame.
engine = create_engine("sqlite:///:memory:")
metadata_obj = MetaData()
for idx, df in enumerate(dfs):
  tableinfo = _get_tableinfo_with_index(idx)
  if tableinfo is None:
    # Without this check a missing cache file surfaces as an opaque
    # AttributeError on None.
    raise ValueError(
        f"No cached TableInfo for table index {idx} in {tableinfo_dir!r}; "
        "run the table-summary cell first."
    )
  print(f"Creating table: {tableinfo.table_name}")
  create_table_from_dataframe(df, tableinfo.table_name, engine, metadata_obj)

Creating table: Entity_Registration_Info
Creating table: Relationship_Info_Table


In [None]:
# setup Arize Phoenix for logging/observability
import phoenix as px
import llama_index.core

# Start the Phoenix app (its URL is printed in the cell output), then register
# the "arize_phoenix" global handler with LlamaIndex.
px.launch_app()
llama_index.core.set_global_handler("arize_phoenix")

🌍 To view the Phoenix app in your browser, visit https://voa1ab8n931-496ff2e9c6d22116-6006-colab.googleusercontent.com/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [None]:
from llama_index.core.objects import (
    SQLTableNodeMapping,
    ObjectIndex,
    SQLTableSchema,
)
from llama_index.core import SQLDatabase, VectorStoreIndex

# Wrap the SQLAlchemy engine; later cells read per-table schema info from it.
sql_database = SQLDatabase(engine)

# One SQLTableSchema per summarised table, with the LLM-written summary
# attached as its context string.
table_node_mapping = SQLTableNodeMapping(sql_database)
table_schema_objs = [
    SQLTableSchema(table_name=t.table_name, context_str=t.table_summary)
    for t in table_infos
]  # add a SQLTableSchema for each table

# Index the schema objects with a VectorStoreIndex so they can be retrieved
# per query.
obj_index = ObjectIndex.from_objects(
    table_schema_objs,
    table_node_mapping,
    VectorStoreIndex,
)
# NOTE(review): similarity_top_k=3 presumably caps retrieval at three tables
# per query -- confirm against the retriever docs.
obj_retriever = obj_index.as_retriever(similarity_top_k=3)

In [None]:
from llama_index.core.retrievers import SQLRetriever
from typing import List
from llama_index.core.query_pipeline import FnComponent

# SQL retriever bound to the `sql_database` wrapper around the in-memory engine.
sql_retriever = SQLRetriever(sql_database)


def get_table_context_str(table_schema_objs: List[SQLTableSchema]):
    """Build the schema context: one description per table, blank-line separated.

    Each description is the database's single-table info, followed by the
    table's own context string when it has a non-empty one.
    """

    def _describe(schema_obj):
        description = sql_database.get_single_table_info(schema_obj.table_name)
        if schema_obj.context_str:
            description += " The table description is: " + schema_obj.context_str
        return description

    return "\n\n".join([_describe(schema_obj) for schema_obj in table_schema_objs])


# Wrap get_table_context_str as a query-pipeline function component.
table_parser_component = FnComponent(fn=get_table_context_str)

In [None]:
from llama_index.core.prompts.default_prompts import DEFAULT_TEXT_TO_SQL_PROMPT
from llama_index.core import PromptTemplate
from llama_index.core.query_pipeline import FnComponent
from llama_index.core.llms import ChatResponse


def parse_response_to_sql(response: ChatResponse) -> str:
    """Extract the bare SQL statement from a chat completion.

    Keeps the text after the first ``SQLQuery:`` marker (when present) and
    before the first following ``SQLResult:`` marker (when present), then
    removes surrounding whitespace and Markdown code fences, including a
    fence language tag such as ```` ```sql ````.

    Args:
        response: Chat response whose ``message.content`` holds the LLM text.

    Returns:
        The SQL text; an empty string when the message has no content.
    """
    # Content can be None (e.g. an empty completion); treat that as no SQL.
    text = response.message.content or ""

    _, marker, after_marker = text.partition("SQLQuery:")
    if marker:
        text = after_marker
    text = text.split("SQLResult:", 1)[0].strip()

    if text.startswith("```"):
        fenced = text[3:]
        # Drop the language tag of a "```sql" fence; otherwise "sql" would be
        # left at the front of the query and make it invalid.
        if fenced[:3].lower() == "sql" and (len(fenced) == 3 or fenced[3].isspace()):
            fenced = fenced[3:]
        text = fenced

    # Remove any remaining leading/trailing backticks and whitespace.
    return text.strip().strip("`").strip()


# Wrap the SQL-extraction helper as a query-pipeline function component.
sql_parser_component = FnComponent(fn=parse_response_to_sql)

# Pre-fill the {dialect} slot of the default text-to-SQL prompt with the
# engine's dialect name. The print below shows the raw template, so the
# placeholders still appear unfilled in the output.
text2sql_prompt = DEFAULT_TEXT_TO_SQL_PROMPT.partial_format(
    dialect=engine.dialect.name
)
print(text2sql_prompt.template)

Given an input question, first create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer. You can order the results by a relevant column to return the most interesting examples in the database.

Never query for all the columns from a specific table, only ask for a few relevant columns given the question.

Pay attention to use only the column names that you can see in the schema description. Be careful to not query for columns that do not exist. Pay attention to which column is in which table. Also, qualify column names with the table name when needed. You are required to use the following format, each taking one line:

Question: Question here
SQLQuery: SQL Query to run
SQLResult: Result of the SQLQuery
Answer: Final answer here

Only use tables listed below.
{schema}

Question: {query_str}
SQLQuery: 
