
### Retrieve data from a Google Sheet using a service account

In [1]:
%pip install gspread gspread_dataframe

Collecting gspread
  Using cached gspread-6.2.1-py3-none-any.whl.metadata (11 kB)
Collecting gspread_dataframe
  Using cached gspread_dataframe-4.0.0-py2.py3-none-any.whl.metadata (4.5 kB)
Collecting google-auth-oauthlib>=0.4.1 (from gspread)
  Using cached google_auth_oauthlib-1.2.3-py3-none-any.whl.metadata (3.1 kB)
Collecting google-auth>=1.12.0 (from gspread)
  Using cached google_auth-2.41.1-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting requests-oauthlib>=0.7.0 (from google-auth-oauthlib>=0.4.1->gspread)
  Using cached requests_oauthlib-2.0.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting cachetools<7.0,>=2.0.0 (from google-auth>=1.12.0->gspread)
  Using cached cachetools-6.2.4-py3-none-any.whl.metadata (5.6 kB)
Collecting oauthlib>=3.0.0 (from requests-oauthlib>=0.7.0->google-auth-oauthlib>=0.4.1->gspread)
  Using cached oauthlib-3.3.1-py3-none-any.whl.metadata (7.9 kB)
Using cached gspread-6.2.1-py3-none-any.whl (59 kB)
Using cached gspread_dataframe-4.0.0-py2.py3-none-any

## Step 1: Extract Data

In [None]:
import json
import gspread
from gspread_dataframe import get_as_dataframe
import pandas as pd
from pyspark.sql.functions import current_timestamp
from datetime import date


# --- Configuration ---
SPREADSHEET_ID = '1iWmtI5iHzgxA9S9HdXloq7ak8VHA5mYO01f1MMaZ5-A'
SHEET_NAME = 'ticket'
# The service-account key is read from a Databricks secret scope (for security)
# and parsed from JSON into a dict.
SERVICE_ACCOUNT_JSON = dbutils.secrets.get(scope="ingestdata", key="service-account-key")
service_account_dict = json.loads(SERVICE_ACCOUNT_JSON)

# Authenticate with the service account and open the configured worksheet.
gc = gspread.service_account_from_dict(service_account_dict)
sh = gc.open_by_key(SPREADSHEET_ID)
ws = sh.worksheet(SHEET_NAME)

# Extract the worksheet into a pandas DataFrame.
# NOTE(review): gspread_dataframe documents the pandas option as `parse_dates`;
# `parse_date` looks like a typo. Confirm before renaming, since it could
# change the column dtypes written to the bronze table.
df = get_as_dataframe(ws, parse_date=True, evaluate_formulas=True)

# Convert to Spark and stamp every row with the extraction time.
sheet_df = spark.createDataFrame(df)
sheet_df = sheet_df.withColumn('Extracted At', current_timestamp())

# Validate extracted data. The count is computed once and reused, and the
# check is `> 0`: a row count can never be negative, so the previous
# `not count < 0` condition could not fail on an empty extract.
extracted_row_count = sheet_df.count()
assert extracted_row_count > 0, "Extracted data is empty"

print(f"Extracted {extracted_row_count} rows")


Extracted 7195 rows


## Step 2: Cleaning Column Names

In [8]:
# Make a sheet header safe to use as a table column name.
def sanitize_column(col_name):
    """Return ``col_name`` with problematic characters replaced or removed.

    Spaces, dots and equals signs each become an underscore. Parentheses,
    backticks, commas, semicolons, curly braces, newlines and tabs are
    dropped. All other characters are kept as they are.
    """
    # Three-argument maketrans: map each char of the first string to the
    # matching char of the second, and delete every char of the third.
    char_map = str.maketrans(" .=", "___", "()`,;{}\n\t")
    return col_name.translate(char_map)

# Rename every column through sanitize_column, keeping the original order.
clean_column_names = list(map(sanitize_column, sheet_df.columns))
raw_jira_data_clean = sheet_df.toDF(*clean_column_names)

# Expose the cleaned frame to Spark SQL as a temp view of the same name.
raw_jira_data_clean.createOrReplaceTempView("raw_jira_data_clean")

## Step 3: Loading Raw Data to SCR_JIRA table

In [9]:
from delta.tables import DeltaTable
import pyspark.sql.functions as F
from datetime import date


TARGET_TABLE = "looker_management_prod.bronze.scr_jira"
# ISO date of this run; not referenced by the statements in this cell.
LOAD_DATE = date.today().isoformat()

# Create the bronze table if it does not exist yet. The schema is inferred
# from the cleaned source view by the CTAS.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {TARGET_TABLE} 
        USING DELTA
        COMMENT 'Jira Data - Weekly Load'
        TBLPROPERTIES (
            delta.autoOptimize.optimizeWrite = true,
            delta.autoOptimize.autoCompact = true
        )
        AS SELECT * FROM raw_jira_data_clean
""")

# MERGE (UPSERT) keyed on the Jira issue key only.
# The match condition previously also required t.Updated = s.Updated. A ticket
# whose Updated value had changed therefore never matched its existing row and
# was inserted again, leaving several rows per key. Matching on the key alone
# lets the existing row be overwritten with the latest extract.
spark.sql(f"""
    MERGE INTO {TARGET_TABLE} AS t
    USING (
        SELECT * FROM raw_jira_data_clean
    ) s
    ON t.key = s.key
    WHEN MATCHED THEN 
      UPDATE SET *
    
    WHEN NOT MATCHED THEN 
      INSERT *
          """)

# Report the total row count of the target table after the merge and show it.
print(f"Merged {spark.table(TARGET_TABLE).count()} rows")
display(spark.table(TARGET_TABLE))

Merged 7195 rows


Unnamed: 0,Issue_Type,Summary,Assignee,Status,Sprint,Est__Story_Points,Key,parent,Start_date,End_date,Updated,Labels,Assignee_accountId,Extracted_At
0,Story,SR0434551 - Request for change in logic for Connecting to agent time & Addition of extra measure & dimension [Case Analysis],Lila (Anh) Nguyen,Done,DDE PI18 Iteration 5,,PI4V2-24978,,,1/14/2026,1/14/2026 9:27:14,Case_Analysis_Explore;team#OFSERVICE,712020:33d80db1-1619-4469-8211-c2353f948a44,2026-01-15 15:04:47.236320
1,Story,SR0437985 - Remove backend filter for email message derived dimension [Case Analysis Explore],Lila (Anh) Nguyen,Done,DDE PI18 Iteration 5,,PI4V2-25045,,1/6/2026,1/14/2026,1/14/2026 9:25:21,Case_Analysis_Explore;team#OFSERVICE,712020:33d80db1-1619-4469-8211-c2353f948a44,2026-01-15 15:04:47.236320
2,Story,[OF Logic Change] Update New Office Code MDZBB (SKCW / SKLP),Phoebe (Phuc) Nguyen,Done,DDE PI18 Iteration 5,2.0,PI4V2-24897,,,1/14/2026,1/14/2026 9:24:00,OF_Logic_Change;Service_KPI_Chat&Web;Service_KPI_Landing_Page;team#OFSERVICE,712020:d790d2a3-f148-4613-89c3-f6fbc293b3a2,2026-01-15 15:04:47.236320
3,Story,Hotfix/New Enhancements/Packages to Space Yield Spoke (PI 18 Adhoc 2),An Banh,Done,DDE PI18 Iteration 5,23.0,PI4V2-24785,PI4V2-10651,1/13/2026,1/14/2026,1/14/2026 7:43:56,HUB;team#SPOKE5,712020:05c60d6a-1348-4bc0-b47b-e61b5a07fdef,2026-01-15 15:04:47.236320
4,Story,[COA/ COA Cost Detail- BAU Enhancement (SR0438351)] Logic Change for CM and Actual CM,Feliks (Phuoc) Vo,Done,DDE PI18 Iteration 5,,PI4V2-25039,PI4V2-12245,,1/14/2026,1/14/2026 7:35:46,team#SPOKE5,712020:43d4c234-a603-44ac-9e97-50843c9fc724,2026-01-15 15:04:47.236320
5,Story,[GTAS PIER US - BAU Hotfix] Update View: Date Granularity Month from mrkmon_desc to yrmon_desc,Rosezy (Tran) Huynh,Done,DDE PI18 Iteration 5,,PI4V2-25165,PI4V2-19346,,1/14/2026,1/14/2026 7:32:48,team#SPOKE5,712020:227fff45-6984-44ee-8b99-5ddfc331b33d,2026-01-15 15:04:47.236320
6,Story,"[GTAS PIERS US Dashboard - BAU Enhancement (SR0434501)] Add new tile ""Top 13 Carriers Market Volume Trend""",Rosezy (Tran) Huynh,Done,DDE PI18 Iteration 5,,PI4V2-25092,PI4V2-19346,,1/14/2026,1/14/2026 7:32:44,team#SPOKE5,712020:227fff45-6984-44ee-8b99-5ddfc331b33d,2026-01-15 15:04:47.236320
7,Story,"[Space Bottleneck - BAU Enhancement (SR0398690)] Add [Is Skip Calling or Deleted Port (Yes/No)] & Update to Always Filter as ""No""",Rosezy (Tran) Huynh,Done,DDE PI18 Iteration 5,2.0,PI4V2-25042,PI4V2-12399,,1/14/2026,1/14/2026 7:32:26,team#SPOKE5,712020:227fff45-6984-44ee-8b99-5ddfc331b33d,2026-01-15 15:04:47.236320
8,Story,[Space Bottleneck - BAU Enhancement (SR0398690-2)] Add [Is Virtual Port (Y/N)] &,Rosezy (Tran) Huynh,Done,DDE PI18 Iteration 5,1.0,PI4V2-23476,PI4V2-12399,,1/14/2026,1/14/2026 7:31:49,team#SPOKE5,712020:227fff45-6984-44ee-8b99-5ddfc331b33d,2026-01-15 15:04:47.236320
9,Story,[Datamyne US - BAU Support] Explore Decommission,Jake (Phuong) Tran,Done,DDE PI18 Iteration 5,0.5,PI4V2-25110,PI4V2-17098,,1/14/2026,1/14/2026 7:29:33,team#SPOKE5,712020:1b5c155a-b927-4546-be1c-98a5bba438e4,2026-01-15 15:04:47.236320


## Step 4: Loading data to SCR_JIRA_TRNS
Split the semicolon-separated Sprint column of the SCR_JIRA table into one row per sprint and load the result into SCR_JIRA_TRNS.


In [11]:
TARGET_TABLE = "looker_management_prod.bronze.scr_jira_trns"

# Rebuild the transformed table from the bronze SCR_JIRA table in three steps:
#   01: rename and cast the source columns,
#   02: replace NULLs with explicit defaults,
#   03: split the ';'-separated sprint list into one row per sprint.
#
# Date handling: TRY_CAST is tried first. When it yields NULL, the value is
# parsed with the pattern of the sheet's formatted date strings
# (e.g. '1/14/2026' and '1/14/2026 9:27:14'), which a plain cast to
# DATE/TIMESTAMP does not accept.
#
# Defaults are typed literals (DATE / TIMESTAMP / FLOAT) so the resulting
# column types do not depend on the session's implicit type-coercion rules.
spark.sql(f"""
CREATE OR REPLACE TABLE {TARGET_TABLE} 
USING DELTA
COMMENT 'Transformed Jira Data'
TBLPROPERTIES (
    delta.autoOptimize.optimizeWrite = true,
    delta.autoOptimize.autoCompact = true
)
AS
WITH JIRA_TRNSF_01 AS (
    SELECT 
        CAST(Issue_Type AS STRING) AS TCK_TP,
        CAST(Summary AS STRING) AS TCK_NM,
        CAST(Assignee AS STRING) AS ASN_NM,
        CAST(Status AS STRING) AS TCK_STS,
        CAST(Sprint AS STRING) AS PI_ID,
        CAST(Est__Story_Points AS FLOAT) AS STR_PNT,
        CAST(`Key` AS STRING) AS TCK_ID,
        CAST(parent AS STRING) AS PRN_ID,
        COALESCE(
            TRY_CAST(Start_date AS DATE),
            CAST(TRY_TO_TIMESTAMP(CAST(Start_date AS STRING), 'M/d/yyyy') AS DATE)
        ) AS STR_DT,
        COALESCE(
            TRY_CAST(End_date AS TIMESTAMP),
            TRY_TO_TIMESTAMP(CAST(End_date AS STRING), 'M/d/yyyy')
        ) AS END_DT,
        COALESCE(
            TRY_CAST(Updated AS TIMESTAMP),
            TRY_TO_TIMESTAMP(CAST(Updated AS STRING), 'M/d/yyyy H:mm:ss')
        ) AS UPD_DT,
        CAST(Assignee_accountId AS STRING) AS DEV_ID
    FROM looker_management_prod.bronze.scr_jira
),
JIRA_TRNSF_02 AS (
    SELECT
        TCK_ID,
        COALESCE(PRN_ID,'Undefined')  AS PRN_ID,
        COALESCE(DEV_ID,'Undefined')  AS DEV_ID,
        COALESCE(PI_ID,'Undefined')   AS PI_ID,
        COALESCE(ASN_NM,'Undefined')  AS ASN_NM,
        COALESCE(TCK_NM,'Undefined')  AS TCK_NM,
        COALESCE(TCK_TP,'Undefined')  AS TCK_TP,
        COALESCE(TCK_STS,'Undefined') AS TCK_STS,
        COALESCE(STR_PNT, CAST(0 AS FLOAT))                AS STR_PNT,
        COALESCE(STR_DT, DATE'1900-01-01')                 AS STR_DT,
        COALESCE(END_DT, TIMESTAMP'1900-01-01 00:00:00')   AS END_DT,
        COALESCE(UPD_DT, TIMESTAMP'1900-01-01 00:00:00')   AS UPD_DT
    FROM JIRA_TRNSF_01
),
JIRA_TRNSF_03 AS (
    SELECT 
        TCK_ID,
        PRN_ID,
        DEV_ID,
        PI_ID_EXPLODED AS PI_ID,
        ASN_NM,
        TCK_NM,
        TCK_TP,
        TCK_STS,
        STR_PNT,
        STR_DT,
        END_DT,
        UPD_DT
    FROM JIRA_TRNSF_02
    LATERAL VIEW explode(split(PI_ID, ';')) AS PI_ID_EXPLODED
)
SELECT
    TCK_ID,
    PRN_ID,
    DEV_ID,
    PI_ID,
    ASN_NM,
    TCK_NM,
    TCK_TP,
    TCK_STS,
    STR_PNT,
    STR_DT,
    END_DT,
    UPD_DT
FROM JIRA_TRNSF_03
WHERE TCK_ID IS NOT NULL
""")

Unnamed: 0,num_affected_rows,num_inserted_rows
