In [1]:
#!pip install  -q ibis-framework[mssql]
#!pip install     pyodbc
#!pip install     cachetools

In [2]:
# Host name of the Fabric data warehouse SQL endpoint; passed to get_ibis_connection() below.
endpoint = '2ddwq77iortevbfx7it6gysrxa-flcbzbaq54pulgpa4k3xjby76m.datawarehouse.fabric.microsoft.com'

**_<u><mark>Get connection</mark></u>_**

In [3]:
import ibis, sqlglot
from cachetools import cached, LRUCache
from ibis.expr.visualize import to_graph
def get_ibis_connection(endpoint ):
    """Return an ibis MSSQL backend whose ``con`` is a live pyodbc connection.

    Two authentication paths are tried, in order:

    1. Access token: when ``notebookutils`` is available in the session, a token
       is requested from it and handed to the ODBC driver through the
       ``attrs_before`` pre-connect attribute.
    2. Service principal: if the token cannot be obtained (for example
       ``notebookutils`` is not defined), ``appId`` and ``secret`` are read from
       the ``[myvars]`` section of ``C:/KV/variable.ini``.

    Parameters
    ----------
    endpoint : str
        Host name of the SQL endpoint to connect to.

    Returns
    -------
    ibis.backends.mssql.Backend
        Backend instance with ``con`` set to the opened pyodbc connection.

    Raises
    ------
    configparser.Error
        If the fallback INI file lacks the expected section or options.
    pyodbc.Error
        If the connection itself fails.
    """
    import ibis, pyodbc , ibis.backends.mssql
    import struct

    # ODBC pre-connect attribute id used to pass an access token to the driver.
    SQL_COPT_SS_ACCESS_TOKEN = 1256
    driver = "Driver={ODBC Driver 18 for SQL Server}"
    connect_kwargs = {}
    try:
        token = notebookutils.credentials.getToken('https://analysis.windows.net/powerbi/api').encode("UTF-16-LE")
    except Exception:
        # Token not obtainable (e.g. NameError when notebookutils is undefined):
        # fall back to service-principal credentials stored in a local INI file.
        import configparser
        config = configparser.ConfigParser()
        config.read("C:/KV/variable.ini")
        client_id     =config.get("myvars", "appId")
        client_secret =config.get("myvars", "secret")
        connection_string = f"{driver};Server={endpoint};Authentication=ActiveDirectoryServicePrincipal;UID={client_id};PWD={client_secret};ConnectRetryCount=0"
    else:
        # The driver expects the token as a length-prefixed UTF-16-LE byte string.
        token_struct = struct.pack(f'<I{len(token)}s', len(token), token)
        # The port is appended to the server with a comma, and the token must be
        # passed as a connect() keyword, not embedded in the connection string.
        connection_string = f"{driver};Server={endpoint},1433;Encrypt=Yes;TrustServerCertificate=No"
        connect_kwargs["attrs_before"] = {SQL_COPT_SS_ACCESS_TOKEN: token_struct}
    dwh_backend = ibis.backends.mssql.Backend()
    dwh_backend.con = pyodbc.connect(connection_string, **connect_kwargs)
    return dwh_backend
@cached(cache=LRUCache(maxsize=32))
def sql_to_df(query,con):
    """Run a T-SQL query through ``con.raw_sql`` and return a pyarrow Table.

    Results are memoized by the ``@cached`` LRU cache (up to 32 entries) keyed on
    the ``(query, con)`` arguments, so both must be hashable and a repeated call
    with the same arguments returns the stored table without hitting the server.

    Parameters
    ----------
    query : str
        T-SQL text. Only the first statement produced by ``sqlglot.transpile``
        is executed, and it must yield a result set.
    con
        Object exposing ``raw_sql(sql)`` that returns a DB-API style cursor.

    Returns
    -------
    pyarrow.Table
        One column per result-set column, named from ``cursor.description``.
        A query that returns no rows yields an empty table with those names.
    """
    from contextlib import closing
    import pyarrow as pa

    # Round-trip through sqlglot (tsql -> tsql) to normalise the statement text.
    standard_sql = sqlglot.transpile(query, read= 'tsql', write='tsql')[0]
    # closing() releases the cursor even if fetching raises.
    with closing(con.raw_sql(standard_sql)) as cursor:
        columns = [column[0] for column in cursor.description]
        data = cursor.fetchall()
    if data:
        # Transpose the row tuples into one sequence per column.
        arrays = [pa.array(col) for col in zip(*data)]
    else:
        # zip(*[]) yields nothing, which would mismatch the column names;
        # build one empty array per column instead.
        arrays = [pa.array([]) for _ in columns]
    return pa.Table.from_arrays(arrays, columns)

# Analysis

In [4]:
# Open the warehouse connection used by every cell below.
con = get_ibis_connection(endpoint)
# Drop any memoized query results so the next sql_to_df() calls hit the server again.
sql_to_df.cache_clear()

In [5]:
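# NOTE: con.sql(...) is left commented out because the warehouse reports that the DMV
# (Dynamic Management View) 'dm_exec_describe_first_result_set' is not supported
# (presumably ibis relies on it to infer the result schema - TODO confirm).
# Raw T-SQL is therefore executed through sql_to_df() instead.
# con.sql(sql, dialect="mssql").to_pyarrow()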

In [6]:
# List the catalogs (databases) visible through this connection.
con.list_catalogs()

['NYT', 'dwh', 'master']

In [7]:
# Daily total and average fare from NYT.ny.taxi, newest date first.
# First execution of this query text: sql_to_df() runs it and stores the result in its cache.
sql = """   
            -- top do nothing, just want to use a tsql specific code
            SELECT  top 30000
            date ,
            SUM(fare_amount) AS totalfares ,
            AVG(fare_amount) AS avgfares
            from NYT.ny.taxi 
            group by date
            order by date desc
      """
sql_to_df(sql,con).to_pandas()

Unnamed: 0,date,totalfares,avgfares
0,2024-06-30,91916.48,19.213311
1,2024-06-29,1760749.29,19.429815
2,2024-06-28,1987240.99,20.890838
3,2024-06-27,2366851.00,21.807863
4,2024-06-26,2250307.29,20.010380
...,...,...,...
4925,2011-01-05,4166602.43,9.526951
4926,2011-01-04,3949301.41,9.743615
4927,2011-01-03,3710331.64,9.935603
4928,2011-01-02,1652479.92,10.484214


In [8]:
# Same (sql, con) arguments as the previous cell, so the @cached wrapper on
# sql_to_df() is expected to return the stored Arrow table instead of re-querying.
sql_to_df(sql,con).to_pandas()

Unnamed: 0,date,totalfares,avgfares
0,2024-06-30,91916.48,19.213311
1,2024-06-29,1760749.29,19.429815
2,2024-06-28,1987240.99,20.890838
3,2024-06-27,2366851.00,21.807863
4,2024-06-26,2250307.29,20.010380
...,...,...,...
4925,2011-01-05,4166602.43,9.526951
4926,2011-01-04,3949301.41,9.743615
4927,2011-01-03,3710331.64,9.935603
4928,2011-01-02,1652479.92,10.484214


In [9]:
# Query a table in a different catalog (dwh) over the same connection,
# using a three-part name.
sql = """   
            SELECT * FROM dwh.dbo.EmployeeData;
      """
sql_to_df(sql,con).to_pandas()

Unnamed: 0,EmployeeID,FirstName,LastName,SSN,email
0,2,F-me,xxxx,XXX-XX-0000,email2@youremail2.com
1,1,T-me,xxxx,XXX-XX-6789,email@youremail.com


In [10]:
# `_` is ibis' deferred placeholder: it refers to the table produced by the
# previous step of the chain (used in the mutate() calls below).
from ibis import _
# Switch the session to the NYT database before referencing ny.taxi
# (presumably required so the two-part name resolves - TODO confirm).
con.raw_sql(f" use NYT;")
taxi = con.table(name="taxi", database='ny')
# Build the expression lazily: 2024 rows only, daily total/average fare, then two
# derived columns. limit(10) has no ordering, so which 10 dates come back is
# not guaranteed.
agg = (taxi
       .filter(taxi.year == 2024)
       .group_by("date")
       .aggregate(totalfares=taxi.fare_amount.sum(), avgfares=taxi.fare_amount.mean())
       .mutate(rd  =_.totalfares/_.avgfares)
       .mutate(new =_.rd/10)
       .limit(10)
        )
# Show the SQL ibis generates for the expression.
ibis.to_sql(agg)

```sql
SELECT
TOP 10
  [t2].[date],
  [t2].[totalfares],
  [t2].[avgfares],
  CAST([t2].[totalfares] AS FLOAT) / [t2].[avgfares] AS [rd],
  CAST((
    CAST([t2].[totalfares] AS FLOAT) / [t2].[avgfares]
  ) AS FLOAT) / 10 AS [new]
FROM (
  SELECT
    [t1].[date],
    SUM([t1].[fare_amount]) AS [totalfares],
    AVG([t1].[fare_amount]) AS [avgfares]
  FROM (
    SELECT
      [t0].[trip_distance],
      [t0].[store_and_fwd_flag],
      [t0].[fare_amount],
      [t0].[extra],
      [t0].[mta_tax],
      [t0].[tip_amount],
      [t0].[tolls_amount],
      [t0].[improvement_surcharge],
      [t0].[total_amount],
      [t0].[airport_fee],
      [t0].[congestion_surcharge],
      [t0].[vendorid],
      [t0].[passenger_count],
      [t0].[pulocationid],
      [t0].[dolocationid],
      [t0].[payment_type],
      [t0].[ratecodeid],
      [t0].[tpep_pickup_datetime],
      [t0].[tpep_dropoff_datetime],
      [t0].[hour],
      [t0].[date],
      [t0].[file],
      [t0].[year]
    FROM [ny].[taxi] AS [t0]
    WHERE
      [t0].[year] = 2024
  ) AS [t1]
  GROUP BY
    [t1].[date]
) AS [t2]
```

In [11]:
# Execute the ibis expression and materialise the result as a pandas DataFrame.
agg.to_pandas()

Unnamed: 0,date,totalfares,avgfares,rd,new
0,2024-01-23,1689706.43,18.077914,93468.0,9346.8
1,2024-03-08,1999284.49,18.862064,105995.0,10599.5
2,2024-01-10,1643905.6,18.274349,89957.0,8995.7
3,2024-03-30,1704450.37,18.324074,93017.0,9301.7
4,2024-03-05,1887682.47,18.560005,101707.0,10170.7
5,2024-03-29,1889139.34,19.213215,98325.0,9832.5
6,2024-04-23,1786225.95,19.22306,92921.0,9292.1
7,2024-04-09,2033958.44,19.808132,102683.0,10268.3
8,2024-01-04,1846536.01,18.868196,97865.0,9786.5
9,2024-05-25,1529344.49,19.202738,79642.0,7964.2
