In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# PhonePe Pulse API Ingestion – Transactions (Aggregated)

This notebook performs API-based ingestion of aggregated transaction data from the official PhonePe Pulse repository.

Objective:
- Fetch quarterly transaction JSON files
- Parse nested transaction structures
- Convert them into structured tabular format
- Export clean CSV for warehouse loading

This notebook is part of the PhonePe Fintech Performance Analytics project.


In [2]:
# Basic environment check
import os
import sys

print("Python version:", sys.version)
print("Current working directory:", os.getcwd())


Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Current working directory: /kaggle/working


## 1. Import Required Libraries

In this step, we import the necessary Python libraries required for:

- Making API calls
- Handling JSON data
- Data transformation
- DataFrame creation
- File export

These libraries support the ETL ingestion pipeline.


In [3]:
import requests
import json
import os
from time import sleep


## 2. Define API Source and Base URL

Here we define the base GitHub raw URL from which quarterly JSON files will be fetched.

Data Source:
Official PhonePe Pulse public repository.

This ensures traceability and reproducibility of the ingestion process.


In [4]:
# -----------------------------
# Data Source
# -----------------------------

BASE_URL =  "https://raw.githubusercontent.com/PhonePe/pulse/master/data/aggregated/transaction/country/india/state"




START_YEAR = 2018
END_YEAR = 2024
QUARTERS = [1, 2, 3, 4]

OUTPUT_DIR = "/kaggle/working/transactions_raw"


## 2. Create raw data directory


In [5]:
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Raw transactions directory created at:")
print(OUTPUT_DIR)


Raw transactions directory created at:
/kaggle/working/transactions_raw


In [6]:
# ----------------------------------------
# Fetch list of states using GitHub API
# ----------------------------------------

github_api_url = (
    "https://api.github.com/repos/PhonePe/pulse/contents/"
    "data/aggregated/transaction/country/india/state"
)

response = requests.get(github_api_url)

if response.status_code == 200:
    contents = response.json()
    print("GitHub directory fetched successfully")
else:
    raise Exception("Failed to fetch state directory from GitHub")


GitHub directory fetched successfully


In [7]:
# Extract folder names (states)
states = [
    item["name"].lower()
    for item in contents
    if item["type"] == "dir"
]

print("Total states found:", len(states))
print("Sample states:", states[:5])


Total states found: 36
Sample states: ['andaman-&-nicobar-islands', 'andhra-pradesh', 'arunachal-pradesh', 'assam', 'bihar']


In [8]:
test_url = (
    "https://raw.githubusercontent.com/PhonePe/pulse/master/"
    "data/aggregated/transaction/country/india/state/"
    "maharashtra/2018/1.json"
)

r = requests.get(test_url)
print(r.status_code)
print(r.json().keys() if r.status_code == 200 else r.text[:200])


200
dict_keys(['success', 'code', 'data', 'responseTimestamp'])


In [9]:
from urllib.parse import quote

## 3. Define Time Coverage (Year × Quarter)

We define the time range for ingestion.

Transactions dataset covers:
2018 Q1 to 2024 Q4.

We will iterate through each year and quarter combination to dynamically construct file URLs.


## 4. Fetch JSON Data from API

In this step, we programmatically fetch JSON files using HTTP requests.

For each year and quarter:
- Construct file URL
- Send request
- Validate response status
- Store JSON data for further parsing

This ensures complete coverage across all quarters.


In [10]:
failed_requests = []
success_count = 0

for state in states:
    for year in range(START_YEAR, END_YEAR + 1):
        for quarter in [1, 2, 3, 4]:

            url = f"{BASE_URL}/{state}/{year}/{quarter}.json"
            response = requests.get(url)

            if response.status_code == 200:
                data = response.json()

                file_name = f"{state}_{year}_Q{quarter}.json"
                file_path = os.path.join(OUTPUT_DIR, file_name)

                with open(file_path, "w") as f:
                    json.dump(data, f)

                success_count += 1

            else:
                failed_requests.append({
                    "state": state,
                    "year": year,
                    "quarter": quarter,
                    "status": response.status_code
                })

            sleep(0.1)  # polite rate limit


In [11]:
print("Total failed requests:", len(failed_requests))
failed_requests[:5]

Total failed requests: 0


[]

In [12]:
import os

files = os.listdir(OUTPUT_DIR)
print("Total JSON files saved:", len(files))
print("Sample files:", files[:5])


Total JSON files saved: 1008
Sample files: ['telangana_2021_Q1.json', 'lakshadweep_2024_Q2.json', 'jharkhand_2024_Q3.json', 'uttarakhand_2019_Q2.json', 'rajasthan_2024_Q1.json']


In [13]:
import re

files = os.listdir(OUTPUT_DIR)

years = sorted(
    set(int(re.search(r'_(\d{4})_', f).group(1)) for f in files)
)

print("Years present:", years)


Years present: [2018, 2019, 2020, 2021, 2022, 2023, 2024]


In [14]:
quarters = sorted(
    set(re.search(r'_Q(\d)', f).group(1) for f in files)
)

print("Quarters present:", quarters)


Quarters present: ['1', '2', '3', '4']


In [15]:
states_present = sorted(
    set(f.split('_')[0] for f in files)
)

print("Total states captured:", len(states_present))
print("Sample states:", states_present[:10])


Total states captured: 36
Sample states: ['andaman-&-nicobar-islands', 'andhra-pradesh', 'arunachal-pradesh', 'assam', 'bihar', 'chandigarh', 'chhattisgarh', 'dadra-&-nagar-haveli-&-daman-&-diu', 'delhi', 'goa']


## 5. Inspect Sample JSON Structure

Before flattening, we inspect one sample JSON file to understand:

- Nested structure
- Transaction categories
- Payment instrument level metrics

This helps identify correct keys for parsing.


In [16]:
import json

sample_file = files[0]
with open(os.path.join(OUTPUT_DIR, sample_file), "r") as f:
    data = json.load(f)

print(data.keys())


dict_keys(['success', 'code', 'data', 'responseTimestamp'])


In [17]:
data['data'].keys()

dict_keys(['from', 'to', 'transactionData'])

In [18]:
data['data']['transactionData'][0].keys()


dict_keys(['name', 'paymentInstruments'])

In [19]:
import os
import json

raw_dir = OUTPUT_DIR
sample_file = os.listdir(raw_dir)[0]

with open(os.path.join(raw_dir, sample_file), "r") as f:
    sample_data = json.load(f)

sample_data.keys()


dict_keys(['success', 'code', 'data', 'responseTimestamp'])

In [20]:
sample_data['data'].keys()


dict_keys(['from', 'to', 'transactionData'])

## 6. Extract and Flatten Transaction JSON Structure

The JSON file contains nested transaction data.

We extract:
- State (from file name)
- Year and Quarter
- Transaction type
- Transaction count
- Transaction amount

We loop through transaction categories and payment instruments,
then append structured rows into a list.

This converts nested JSON into flat tabular format.


In [21]:
parsed_rows = []

file_name = sample_file
state, year, q = file_name.replace(".json", "").split("_")
quarter = int(q.replace("Q", ""))

for tx in sample_data['data']['transactionData']:
    tx_type = tx['name']
    
    for instr in tx['paymentInstruments']:
        parsed_rows.append({
            "state": state,
            "year": int(year),
            "quarter": quarter,
            "transaction_type": tx_type,
            "transaction_count": instr['count'],
            "transaction_amount_rupees": instr['amount']
        })

parsed_rows[:5]


[{'state': 'telangana',
  'year': 2021,
  'quarter': 1,
  'transaction_type': 'Merchant payments',
  'transaction_count': 185230619,
  'transaction_amount_rupees': 95927278280.53006},
 {'state': 'telangana',
  'year': 2021,
  'quarter': 1,
  'transaction_type': 'Peer-to-peer payments',
  'transaction_count': 179738195,
  'transaction_amount_rupees': 676276353116.8419},
 {'state': 'telangana',
  'year': 2021,
  'quarter': 1,
  'transaction_type': 'Recharge & bill payments',
  'transaction_count': 46371555,
  'transaction_amount_rupees': 26447601338.53346},
 {'state': 'telangana',
  'year': 2021,
  'quarter': 1,
  'transaction_type': 'Financial Services',
  'transaction_count': 262473,
  'transaction_amount_rupees': 320153410.17969453},
 {'state': 'telangana',
  'year': 2021,
  'quarter': 1,
  'transaction_type': 'Others',
  'transaction_count': 969268,
  'transaction_amount_rupees': 387877900.5044867}]

## 7. Convert Parsed Records into DataFrame

After flattening JSON records, we convert the list of dictionaries into a Pandas DataFrame.

This structured format enables:
- Data validation
- Cleaning
- Aggregation
- Export to CSV


In [22]:
all_rows = []

for file in os.listdir(raw_dir):
    with open(os.path.join(raw_dir, file), "r") as f:
        data = json.load(f)

    state, year, q = file.replace(".json", "").split("_")
    quarter = int(q.replace("Q", ""))

    for tx in data['data']['transactionData']:
        tx_type = tx['name']
        
        for instr in tx['paymentInstruments']:
            all_rows.append({
                "state": state,
                "year": int(year),
                "quarter": quarter,
                "transaction_type": tx_type,
                "transaction_count": instr['count'],
                "transaction_amount_rupees": instr['amount']
            })

df = pd.DataFrame(all_rows)

## 8. Basic Data Validation

We perform initial validation checks:

- Preview first few rows
- Verify column names
- Check data types
- Validate record counts

This ensures ingestion integrity before export.


In [23]:
df.shape

(5034, 6)

In [24]:
df.head()

Unnamed: 0,state,year,quarter,transaction_type,transaction_count,transaction_amount_rupees
0,telangana,2021,1,Merchant payments,185230619,95927280000.0
1,telangana,2021,1,Peer-to-peer payments,179738195,676276400000.0
2,telangana,2021,1,Recharge & bill payments,46371555,26447600000.0
3,telangana,2021,1,Financial Services,262473,320153400.0
4,telangana,2021,1,Others,969268,387877900.0


## 9. Create Derived Metrics (Standardization)

For analytical convenience, we standardize transaction value into Crores.

This improves readability in downstream SQL and BI layers.

No business logic is applied here — only unit normalization.


In [25]:
df['transaction_amount_crore'] = df['transaction_amount_rupees'] / 10_000_000
df['avg_transaction_value'] = (
    df['transaction_amount_rupees'] / df['transaction_count']
)


In [26]:
df[['transaction_amount_crore', 'avg_transaction_value']].describe()

Unnamed: 0,transaction_amount_crore,avg_transaction_value
count,5034.0,5034.0
mean,6863.772245,1363.627994
std,26851.996311,1342.3406
min,3e-06,17.198606
25%,3.993888,513.004282
50%,43.941388,803.358552
75%,1102.822217,1523.217541
max,309566.628841,7767.539936


In [27]:
df.head()

Unnamed: 0,state,year,quarter,transaction_type,transaction_count,transaction_amount_rupees,transaction_amount_crore,avg_transaction_value
0,telangana,2021,1,Merchant payments,185230619,95927280000.0,9592.727828,517.880245
1,telangana,2021,1,Peer-to-peer payments,179738195,676276400000.0,67627.635312,3762.563395
2,telangana,2021,1,Recharge & bill payments,46371555,26447600000.0,2644.760134,570.341049
3,telangana,2021,1,Financial Services,262473,320153400.0,32.015341,1219.7575
4,telangana,2021,1,Others,969268,387877900.0,38.78779,400.176113


In [28]:
state_to_region = {
    # North
    "jammu-and-kashmir": "North",
    "himachal-pradesh": "North",
    "punjab": "North",
    "haryana": "North",
    "delhi": "North",
    "uttarakhand": "North",
    "uttar-pradesh": "North",
    "chandigarh": "North",

    # South
    "andhra-pradesh": "South",
    "telangana": "South",
    "karnataka": "South",
    "tamil-nadu": "South",
    "kerala": "South",
    "puducherry": "South",
    "lakshadweep": "South",

    # West
    "rajasthan": "West",
    "gujarat": "West",
    "maharashtra": "West",
    "goa": "West",
    "dadra-and-nagar-haveli-and-daman-and-diu": "West",

    # East
    "bihar": "East",
    "jharkhand": "East",
    "odisha": "East",
    "west-bengal": "East",

    # Central
    "madhya-pradesh": "Central",
    "chhattisgarh": "Central",

    # North-East
    "assam": "North-East",
    "arunachal-pradesh": "North-East",
    "manipur": "North-East",
    "meghalaya": "North-East",
    "mizoram": "North-East",
    "nagaland": "North-East",
    "tripura": "North-East",
    "sikkim": "North-East",
    "andaman-and-nicobar-islands": "North-East"
}


In [29]:
df['region'] = df['state'].map(state_to_region)

In [30]:
df.isna().sum()


state                          0
year                           0
quarter                        0
transaction_type               0
transaction_count              0
transaction_amount_rupees      0
transaction_amount_crore       0
avg_transaction_value          0
region                       560
dtype: int64

In [31]:
df[['state', 'region']].drop_duplicates().sort_values('region').head(15)


Unnamed: 0,state,region
50,madhya-pradesh,Central
70,chhattisgarh,Central
888,odisha,East
10,jharkhand,East
160,west-bengal,East
100,bihar,East
438,uttar-pradesh,North
15,uttarakhand,North
418,haryana,North
290,himachal-pradesh,North


In [32]:
missing_states = (
    df[df['region'].isna()]['state']
    .value_counts()
)

missing_states


state
dadra-&-nagar-haveli-&-daman-&-diu    140
andaman-&-nicobar-islands             140
jammu-&-kashmir                       140
ladakh                                140
Name: count, dtype: int64

In [33]:
state_to_region.update({
    "jammu-&-kashmir": "North",
    "ladakh": "North",
    "andaman-&-nicobar-islands": "North-East",
    "dadra-&-nagar-haveli-&-daman-&-diu": "West"
})


In [34]:
df['region'] = df['state'].map(state_to_region)


In [35]:
df.isna().sum()


state                        0
year                         0
quarter                      0
transaction_type             0
transaction_count            0
transaction_amount_rupees    0
transaction_amount_crore     0
avg_transaction_value        0
region                       0
dtype: int64

In [36]:
df = df.sort_values(by=['state', 'year', 'quarter', 'transaction_type']).reset_index(drop=True)

In [37]:
df[['state', 'region']].drop_duplicates().head(36)

Unnamed: 0,state,region
0,andaman-&-nicobar-islands,North-East
140,andhra-pradesh,South
280,arunachal-pradesh,North-East
420,assam,North-East
560,bihar,East
700,chandigarh,North
840,chhattisgarh,Central
980,dadra-&-nagar-haveli-&-daman-&-diu,West
1120,delhi,North
1260,goa,West


## 10. Export Structured Dataset

The cleaned and structured dataset is exported as CSV.

This file will serve as the source input for:

- SQL warehouse loading
- Star schema modeling
- Analytical view creation

This completes the ingestion phase for aggregated transactions.


In [38]:
final_path = "/kaggle/working/phonepe_transactions_aggregated_state_level_2018_2024.csv"
df.to_csv(final_path, index=False)

print("Saved:", final_path)
print("Shape:", df.shape)


Saved: /kaggle/working/phonepe_transactions_aggregated_state_level_2018_2024.csv
Shape: (5034, 9)


## Conclusion

This notebook successfully:

- Extracted quarterly aggregated transaction data
- Flattened nested JSON structures
- Validated record integrity
- Generated warehouse-ready CSV output

Next Step:
Load the structured dataset into MySQL for analytical modeling.
