### Crypto Data API pipeline
- Get data from CoinGecko API
- Transform: keep only the fields we need from the full JSON response
- Load into a Postgres database that listens on a port other than 5432: in `postgresql.conf`, change `port = 5432` to another value (e.g. `port = 8000`) and restart the server


#### 1. Extract data from API

In [26]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load settings through python-dotenv, then read the two values this notebook uses.
# Avoid printing DB_URL: a connection string typically embeds credentials.
load_dotenv()
API_KEY = os.environ.get("API_KEY")  # sent as the x-cg-demo-api-key request header
DB_URL = os.environ.get("DB_URL")    # passed to create_engine in the load step

In [18]:

# CoinGecko coin IDs to fetch (substituted into the /coins/{id} path below).
coin_list = ['cardano', 'polkadot', 'chainlink', 'litecoin', 'uniswap', 'stellar', 'aptos', 'ripple', 'avalanche-2']

# Seconds to wait for the API before giving up on a coin; without a timeout
# a stalled connection would block the whole cell indefinitely.
REQUEST_TIMEOUT = 10

# The headers are the same for every request, so build them once.
headers = {
    "accept": "application/json",
    "x-cg-demo-api-key": API_KEY
}

# One list per output column; a failed coin is skipped entirely so the
# lists always stay the same length.
name_list = []
price_list = []
cap_list = []
volume_list = []
time_list = []

for coin in coin_list:  # fetch each coin and keep only the fields we need
    url = f"https://api.coingecko.com/api/v3/coins/{coin}?localization=false&tickers=false&community_data=false&developer_data=false&sparkline=false&dex_pair_format=symbol"

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Connection errors / timeouts: report and move on to the next coin.
        print(f"Requests error: {coin}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Requests error: {response.status_code}, {response.text}")
        continue

    data = response.json()
    market = data["market_data"]

    name_list.append(data["name"])
    price_list.append(market['current_price']['usd'])
    cap_list.append(market['market_cap']['usd'])
    volume_list.append(market['total_volume']['usd'])
    time_list.append(market['last_updated'])

# Assemble the column-oriented dict once, after the loop, so it is always
# defined (with empty lists if every request failed) for the cells below.
coin_data = {
    'name': name_list,
    'current_price': price_list,
    'market_cap': cap_list,
    'total_volume': volume_list,
    'time': time_list
}

coin_data

{'name': ['Cardano',
  'Polkadot',
  'Chainlink',
  'Litecoin',
  'Uniswap',
  'Stellar',
  'Aptos',
  'XRP',
  'Avalanche'],
 'current_price': [0.754237,
  4.57,
  15.79,
  96.95,
  7.08,
  0.287121,
  5.38,
  2.3,
  23.36],
 'market_cap': [27215574896,
  6961805141,
  10385077811,
  7360464834,
  4253447103,
  8942277430,
  3410742190,
  134890566204,
  9840049994],
 'total_volume': [646237408,
  238531629,
  461596206,
  481732607,
  931422459,
  199953721,
  203865205,
  2331388602,
  516156517],
 'time': ['2025-05-29T12:23:29.906Z',
  '2025-05-29T12:23:30.495Z',
  '2025-05-29T12:23:26.248Z',
  '2025-05-29T12:23:30.767Z',
  '2025-05-29T12:23:33.053Z',
  '2025-05-29T12:23:29.745Z',
  '2025-05-29T12:23:23.993Z',
  '2025-05-29T12:23:30.155Z',
  '2025-05-29T12:23:29.193Z']}

In [19]:
# Build the frame from the column-oriented dict: one column per key, one row per coin.
df = pd.DataFrame(data=coin_data)
df.head()

Unnamed: 0,name,current_price,market_cap,total_volume,time
0,Cardano,0.754237,27215574896,646237408,2025-05-29T12:23:29.906Z
1,Polkadot,4.57,6961805141,238531629,2025-05-29T12:23:30.495Z
2,Chainlink,15.79,10385077811,461596206,2025-05-29T12:23:26.248Z
3,Litecoin,96.95,7360464834,481732607,2025-05-29T12:23:30.767Z
4,Uniswap,7.08,4253447103,931422459,2025-05-29T12:23:33.053Z


In [21]:
# Parse the ISO-8601 timestamp strings into timezone-aware (UTC) datetimes.
time_utc = pd.to_datetime(df['time'], utc=True)
df['time'] = time_utc
df.head()

Unnamed: 0,name,current_price,market_cap,total_volume,time
0,Cardano,0.754237,27215574896,646237408,2025-05-29 12:23:29.906000+00:00
1,Polkadot,4.57,6961805141,238531629,2025-05-29 12:23:30.495000+00:00
2,Chainlink,15.79,10385077811,461596206,2025-05-29 12:23:26.248000+00:00
3,Litecoin,96.95,7360464834,481732607,2025-05-29 12:23:30.767000+00:00
4,Uniswap,7.08,4253447103,931422459,2025-05-29 12:23:33.053000+00:00


In [None]:
# Promote `time` from a column to the index so the frame is ready for time-series work.
df = df.set_index('time')
df.head()

Unnamed: 0_level_0,name,current_price,market_cap,total_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-05-29 12:23:29.906000+00:00,Cardano,0.754237,27215574896,646237408
2025-05-29 12:23:30.495000+00:00,Polkadot,4.57,6961805141,238531629
2025-05-29 12:23:26.248000+00:00,Chainlink,15.79,10385077811,461596206
2025-05-29 12:23:30.767000+00:00,Litecoin,96.95,7360464834,481732607
2025-05-29 12:23:33.053000+00:00,Uniswap,7.08,4253447103,931422459


In [24]:
# Inspect the column dtypes; `time` is not listed because it is now the index, not a column.
df.dtypes

name              object
current_price    float64
market_cap         int64
total_volume       int64
dtype: object

In [27]:
# Load step: write the frame to the mock table in the public schema.
# - index=True: `time` lives in the index after the transform step, so the
#   index must be written or the timestamps never reach the database.
# - if_exists='append': the first run creates the table and later runs add
#   rows, instead of failing because the table already exists.
# NOTE(review): a table created by an earlier run without the `time` column
# will presumably reject the append; drop it once before re-running.
engine = create_engine(DB_URL)
try:
    df.to_sql(
        name='mock_crypto_data',
        con=engine,
        schema='public',
        if_exists='append',
        index=True,
    )
    print("Data loaded successfully")
except Exception as e:
    # Report the failure rather than raising, keeping the cell's original
    # best-effort behavior.
    print(f"Error loading data: {e}")
finally:
    # Release pooled connections held by the engine.
    engine.dispose()


Data loaded successfully


### Test ETL complete.
 Airflow can now be initialized using this info, and an hourly DAG for this ETL pipeline can be set up.
 Tasks are:
- extract_data: extracting data from API by looping through the list of coin IDs
- transform_data: cleans the data by converting the time column to a UTC datetime (`pd.to_datetime`), setting it as the index for time-series analysis, and checking the column dtypes
- load_to_db: loads the transformed dataframe into a Postgres DB that listens on a port other than 5432. Change the port manually in the `postgresql.conf` file.
