In [1]:
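# Uncomment on a fresh environment to install the dependencies into the kernel's environment.
#%pip install -q "ibis-framework[mssql]"
#%pip install -q pyodbc
#%pip install -q cachetools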

In [2]:
# Host name of the Fabric warehouse SQL endpoint; passed to get_ibis_connection() below.
endpoint = '2ddwq77iortevbfx7it6gysrxa-flcbzbaq54pulgpa4k3xjby76m.datawarehouse.fabric.microsoft.com'

**_<u><mark>Get connection</mark></u>_**

In [3]:
import ibis, sqlglot
from cachetools import cached, LRUCache
from ibis.expr.visualize import to_graph
def get_ibis_connection(endpoint, config_path="C:/KV/variable.ini"):
    """Connect to a SQL endpoint with pyodbc and wrap the connection in an Ibis MSSQL backend.

    Two authentication paths are tried, in order:

    1. Access token: when ``notebookutils`` is available, a token is requested
       and handed to the ODBC driver as a pre-connect attribute.
    2. Service principal: on any failure of step 1, ``appId`` and ``secret``
       are read from the ``[myvars]`` section of ``config_path``.

    Args:
        endpoint: Host name of the SQL endpoint.
        config_path: INI file holding the service-principal credentials used
            by the fallback path. Defaults to the original hard-coded location.

    Returns:
        An ``ibis.backends.mssql.Backend`` whose ``con`` attribute is the open
        pyodbc connection.

    Raises:
        configparser.Error: if the fallback is used and the INI file lacks the
            ``[myvars]`` section or one of its keys.
        pyodbc.Error: if the connection cannot be established.
    """
    import ibis, pyodbc , ibis.backends.mssql
    import struct

    # SQL_COPT_SS_ACCESS_TOKEN: ODBC pre-connect attribute that carries the access token.
    sql_copt_ss_access_token = 1256
    connect_kwargs = {}
    try:
        token = notebookutils.credentials.getToken('https://analysis.windows.net/powerbi/api').encode("UTF-16-LE")
        # The driver expects a 4-byte little-endian length followed by the UTF-16-LE token bytes.
        token_struct = struct.pack(f'<I{len(token)}s', len(token), token)
        # The token must be passed to pyodbc.connect() as a keyword argument;
        # embedding it in the connection string is not valid.
        connect_kwargs["attrs_before"] = {sql_copt_ss_access_token: token_struct}
        connection_string = f"Driver={{ODBC Driver 18 for SQL Server}};Server={endpoint},1433;Encrypt=Yes;TrustServerCertificate=No"
    except Exception:
        # Best-effort fallback (e.g. NameError when notebookutils is not defined):
        # authenticate as a service principal with credentials from the INI file.
        import configparser
        connect_kwargs = {}
        config = configparser.ConfigParser()
        config.read(config_path)
        client_id     =config.get("myvars", "appId")
        client_secret =config.get("myvars", "secret")
        connection_string = f"Driver={{ODBC Driver 18 for SQL Server}};Server={endpoint};Authentication=ActiveDirectoryServicePrincipal;UID={client_id};PWD={client_secret};ConnectRetryCount=0"
    dwh_backend = ibis.backends.mssql.Backend()
    dwh_backend.con = pyodbc.connect(connection_string, **connect_kwargs)
    return dwh_backend
@cached(cache=LRUCache(maxsize=32))
def sql_to_df(query,con):
    """Run a T-SQL query through ``con`` and return the result as a PyArrow table.

    Results are memoized in an LRU cache of 32 entries keyed on
    ``(query, con)``, so both arguments must be hashable and a repeated call
    returns the previously fetched table without touching the database.
    Call ``sql_to_df.cache_clear()`` to force fresh reads.

    Args:
        query: T-SQL text. It is round-tripped through sqlglot (tsql -> tsql)
            and only the first transpiled statement is executed.
        con: Object exposing ``raw_sql(sql)`` that returns a DB-API cursor.

    Returns:
        A ``pyarrow.Table`` with one column per result column. A query that
        returns no rows yields a zero-row table with the same column names;
        a statement with no result set yields a table with no columns.
    """
    import pyarrow as pa

    standard_sql = sqlglot.transpile(query, read= 'tsql', write='tsql')[0]
    cursor = con.raw_sql(standard_sql)
    try:
        # DB-API: description is None when the statement produced no result set.
        description = cursor.description
        columns = [column[0] for column in description] if description else []
        data = cursor.fetchall() if description else []
    finally:
        # raw_sql hands back an open cursor; release it once the rows are in memory.
        cursor.close()

    if data:
        # Transpose the row tuples into one sequence per column.
        arrays = [pa.array(col) for col in zip(*data)]
    else:
        # zip(*[]) would yield nothing, leaving fewer arrays than column names.
        arrays = [pa.array([]) for _ in columns]
    return pa.Table.from_arrays(arrays, names=columns)

# Analysis

In [4]:
# Open the connection once; every later cell reuses `con`.
con = get_ibis_connection(endpoint)
# Empty the sql_to_df cache so queries in this run are fetched again rather than served from earlier entries.
sql_to_df.cache_clear()

In [5]:
# DMV (Dynamic Management View) 'dm_exec_describe_first_result_set' is not supported.
# https://github.com/ibis-project/ibis/issues/10199
# con.sql(sql, dialect="mssql").to_pyarrow()

In [6]:
# Show the catalogs reachable through this connection.
con.list_catalogs()

['NYT', 'dwh', 'master']

In [7]:
%%time
# Total and average fare per date from NYT.ny.taxi, limited to the three latest dates.
# First call with this (query, con) pair, so sql_to_df has to run it on the server.
sql = """   
            -- top do nothing, just want to use a tsql specific code
            SELECT  top 3
            date ,
            SUM(fare_amount) AS totalfares ,
            AVG(fare_amount) AS avgfares
            from NYT.ny.taxi 
            group by date
            order by date desc
      """
sql_to_df(sql,con).to_pandas()

CPU times: total: 62.5 ms
Wall time: 6.3 s


Unnamed: 0,date,totalfares,avgfares
0,2024-06-30,91916.48,19.213311
1,2024-06-29,1760749.29,19.429815
2,2024-06-28,1987240.99,20.890838


In [8]:
%%time
# Same (query, con) arguments as the previous cell: sql_to_df is wrapped in an LRU cache,
# so this call is expected to return the stored Arrow table (compare the wall times).
sql_to_df(sql,con).to_pandas()

CPU times: total: 0 ns
Wall time: 2 ms


Unnamed: 0,date,totalfares,avgfares
0,2024-06-30,91916.48,19.213311
1,2024-06-29,1760749.29,19.429815
2,2024-06-28,1987240.99,20.890838


Querying through the SQL endpoint respects the data masking policy: masked columns come back masked.

In [9]:
# Select every column of dwh.dbo.EmployeeData through the same connection.
sql = """   
            SELECT * FROM dwh.dbo.EmployeeData;
      """
sql_to_df(sql,con).to_pandas()

Unnamed: 0,EmployeeID,FirstName,LastName,SSN,email
0,1,T-me,xxxx,XXX-XX-6789,email@youremail.com
1,2,F-me,xxxx,XXX-XX-0000,email2@youremail2.com


You can also use the Ibis DataFrame API, if you prefer.

In [10]:
from ibis import _

# Switch the connection's current database to NYT so that `ny.taxi` resolves.
# raw_sql returns an open cursor; close it right away since USE has no result to read.
con.raw_sql(" use NYT;").close()
taxi = con.table(name="taxi", database='ny')

# Per-date fare totals and averages for 2024, plus two derived columns.
# `new` is computed in a second mutate because it refers to `rd`, which only
# exists after the first one.
agg = (
    taxi
    .filter(taxi.year == 2024)
    .group_by("date")
    .aggregate(
        totalfares=taxi.fare_amount.sum(),
        avgfares=taxi.fare_amount.mean(),
    )
    .mutate(rd=_.totalfares / _.avgfares)
    .mutate(new=_.rd / 10)
    .limit(10)
)
# Display the SQL Ibis generates for the expression (nothing is executed here).
ibis.to_sql(agg)

```sql
SELECT
TOP 10
  [t2].[date],
  [t2].[totalfares],
  [t2].[avgfares],
  CAST([t2].[totalfares] AS FLOAT) / [t2].[avgfares] AS [rd],
  CAST((
    CAST([t2].[totalfares] AS FLOAT) / [t2].[avgfares]
  ) AS FLOAT) / 10 AS [new]
FROM (
  SELECT
    [t1].[date],
    SUM([t1].[fare_amount]) AS [totalfares],
    AVG([t1].[fare_amount]) AS [avgfares]
  FROM (
    SELECT
      [t0].[trip_distance],
      [t0].[store_and_fwd_flag],
      [t0].[fare_amount],
      [t0].[extra],
      [t0].[mta_tax],
      [t0].[tip_amount],
      [t0].[tolls_amount],
      [t0].[improvement_surcharge],
      [t0].[total_amount],
      [t0].[airport_fee],
      [t0].[congestion_surcharge],
      [t0].[vendorid],
      [t0].[passenger_count],
      [t0].[pulocationid],
      [t0].[dolocationid],
      [t0].[payment_type],
      [t0].[ratecodeid],
      [t0].[tpep_pickup_datetime],
      [t0].[tpep_dropoff_datetime],
      [t0].[hour],
      [t0].[date],
      [t0].[file],
      [t0].[year]
    FROM [ny].[taxi] AS [t0]
    WHERE
      [t0].[year] = 2024
  ) AS [t1]
  GROUP BY
    [t1].[date]
) AS [t2]
```

In [11]:
# Execute the expression on the server and materialize the result as a pandas DataFrame.
# NOTE(review): `agg` applies limit(10) without an order_by, so which ten dates come back
# is presumably not guaranteed to be stable between runs — confirm.
agg.to_pandas()

Unnamed: 0,date,totalfares,avgfares,rd,new
0,2024-01-18,1861306.95,18.230592,102098.0,10209.8
1,2024-02-26,1664065.46,20.717431,80322.0,8032.2
2,2024-06-15,1865609.38,19.010061,98138.0,9813.8
3,2024-04-12,2037656.14,19.384465,105118.0,10511.8
4,2024-01-04,1846536.01,18.868196,97865.0,9786.5
5,2024-01-28,1545317.72,18.572861,83203.0,8320.3
6,2024-06-18,2028382.87,18.974405,106901.0,10690.1
7,2024-03-11,1768145.18,19.786542,89361.0,8936.1
8,2024-03-29,1889139.34,19.213215,98325.0,9832.5
9,2024-04-30,1950817.72,19.024944,102540.0,10254.0
