### Crypto Data API pipeline
- Get data from CoinGecko API
- Transform and get the data we need from the whole JSON set
- Load into a Postgres database that is not listening on the default port 5432: in the `postgresql.conf` file, change `port = 5432` to another port (e.g. `port = 8000`)


#### 1. Extract data from API

In [8]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Read the local .env file into the process environment, then pick up the two
# settings this notebook needs. Either value is None when its variable is unset.
load_dotenv()
API_KEY = os.environ.get("API_KEY")  # sent as the x-cg-demo-api-key request header
DB_URL = os.environ.get("DB_URL")  # handed to create_engine in the load step

In [2]:

# CoinGecko coin IDs to fetch; each one is substituted into the /coins/{id} URL.
coin_list = ['cardano', 'polkadot', 'chainlink', 'litecoin', 'uniswap', 'stellar', 'aptos', 'ripple', 'avalanche-2']

# Parallel lists, one entry per successfully fetched coin.
name_list = []
price_list = []
cap_list = []
volume_list = []
time_list = []

# The headers are identical for every request, so build them once.
headers = {
    "accept": "application/json",
    "x-cg-demo-api-key": API_KEY
}

for coin in coin_list: # loop thru the list of coins and extract its data
    url = f"https://api.coingecko.com/api/v3/coins/{coin}?localization=false&tickers=false&community_data=false&developer_data=false&sparkline=false&dex_pair_format=symbol"

    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Network failures are reported and skipped, like non-200 responses.
        print(f"Requests error: {coin}, {exc}")
        continue

    if response.status_code != 200:
        print(f"Requests error: {response.status_code}, {response.text}")
        continue

    data = response.json()
    market = data['market_data']

    # Read every field before appending anything, so a missing key cannot
    # leave the parallel lists with different lengths.
    name = data["name"]
    price = market['current_price']['usd']
    cap = market['market_cap']['usd']
    volume = market['total_volume']['usd']
    updated = market['last_updated']

    name_list.append(name)
    price_list.append(price)
    cap_list.append(cap)
    volume_list.append(volume)
    time_list.append(updated)

# Built once after the loop so that coin_data exists (possibly with empty
# lists) even when every request failed.
coin_data = {
    'name': name_list,
    'current_price': price_list,
    'market_cap': cap_list,
    'total_volume': volume_list,
    'time': time_list
}

coin_data

{'name': ['Cardano',
  'Polkadot',
  'Chainlink',
  'Litecoin',
  'Uniswap',
  'Stellar',
  'Aptos',
  'XRP',
  'Avalanche'],
 'current_price': [0.694879,
  4.12,
  14.05,
  87.48,
  6.36,
  0.268855,
  4.89,
  2.17,
  21.01],
 'market_cap': [25091666417,
  6267094976,
  9239642406,
  6643866008,
  3819015617,
  8378514234,
  3086652019,
  127379554855,
  8851268198],
 'total_volume': [882332524,
  298362880,
  695995871,
  596253828,
  666387404,
  267512181,
  289965279,
  3318019497,
  647406920],
 'time': ['2025-05-30T17:50:51.506Z',
  '2025-05-30T17:51:00.737Z',
  '2025-05-30T17:50:57.504Z',
  '2025-05-30T17:51:01.177Z',
  '2025-05-30T17:50:54.297Z',
  '2025-05-30T17:51:00.936Z',
  '2025-05-30T17:51:03.663Z',
  '2025-05-30T17:51:02.622Z',
  '2025-05-30T17:51:00.098Z']}

In [3]:
# Build a DataFrame from the column-oriented dict produced by the extract step
# (one row per coin) and preview the first five rows.
df = pd.DataFrame(data=coin_data)
df.head(n=5)

Unnamed: 0,name,current_price,market_cap,total_volume,time
0,Cardano,0.694879,25091666417,882332524,2025-05-30T17:50:51.506Z
1,Polkadot,4.12,6267094976,298362880,2025-05-30T17:51:00.737Z
2,Chainlink,14.05,9239642406,695995871,2025-05-30T17:50:57.504Z
3,Litecoin,87.48,6643866008,596253828,2025-05-30T17:51:01.177Z
4,Uniswap,6.36,3819015617,666387404,2025-05-30T17:50:54.297Z


In [4]:
# Parse the ISO-8601 strings in 'time' into timezone-aware (UTC) timestamps.
df['time'] = df['time'].pipe(pd.to_datetime, utc=True)
df.head()

Unnamed: 0,name,current_price,market_cap,total_volume,time
0,Cardano,0.694879,25091666417,882332524,2025-05-30 17:50:51.506000+00:00
1,Polkadot,4.12,6267094976,298362880,2025-05-30 17:51:00.737000+00:00
2,Chainlink,14.05,9239642406,695995871,2025-05-30 17:50:57.504000+00:00
3,Litecoin,87.48,6643866008,596253828,2025-05-30 17:51:01.177000+00:00
4,Uniswap,6.36,3819015617,666387404,2025-05-30 17:50:54.297000+00:00


In [5]:
# Optional step, currently disabled: make 'time' the index for time-series analysis.
# The call below is commented out, so 'time' stays a regular column and the
# frame keeps its default integer index.
# NOTE(review): the disabled line originally read `#f.set_index(...)`, presumably
# a typo for `df` - confirm before re-enabling it.
# df.set_index('time', inplace=True)
df.head()

Unnamed: 0,name,current_price,market_cap,total_volume,time
0,Cardano,0.694879,25091666417,882332524,2025-05-30 17:50:51.506000+00:00
1,Polkadot,4.12,6267094976,298362880,2025-05-30 17:51:00.737000+00:00
2,Chainlink,14.05,9239642406,695995871,2025-05-30 17:50:57.504000+00:00
3,Litecoin,87.48,6643866008,596253828,2025-05-30 17:51:01.177000+00:00
4,Uniswap,6.36,3819015617,666387404,2025-05-30 17:50:54.297000+00:00


In [6]:
# Number the rows from 1 instead of 0, then list each column's dtype.
# The index is assigned outright rather than incremented (`df.index + 1`), so
# re-running this cell does not shift the row labels a second time.
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)
df.dtypes

name                          object
current_price                float64
market_cap                     int64
total_volume                   int64
time             datetime64[ns, UTC]
dtype: object

In [7]:
# The dataset is clean; load it into a mock table in the public schema.
engine = create_engine(DB_URL)
try:
    # to_sql is handed the engine itself, so no separate raw connection is
    # opened here (one that is never closed would leak).
    # NOTE: with to_sql's default if_exists='fail', running this against an
    # existing 'mock_crypto_data' table raises ValueError, which is reported below.
    df.to_sql(name='mock_crypto_data', con=engine, schema='public')
    print("Data loaded successfully")
except Exception as e:
    print(f"Loading data: {e}")
finally:
    # Close the engine's pooled connections whether or not the load succeeded.
    engine.dispose()


Data loaded successfully


### Test ETL complete.
 Airflow can now be initialized using this info, and an hourly DAG for this ETL pipeline can be set up.
 Tasks are:
- extract_data: extracting data from API by looping through the list of coin IDs
- transform_data: cleans data by converting the time column to timezone-aware datetimes with pd.to_datetime for time series analysis, optionally setting time as the index (this step is commented out in the test run above), and checking the column datatypes
- load_to_db: loads the transformed dataframe into a Postgres DB whose port != 5432. Change this in the postgresql.conf file manually.
