# Data download

**Inputs:**

- Configuration file

**Steps:**

- Load configuration file
- Download data from source
- Transform into desired format
- Store data

**Outputs:**

- Raw data as CSV files in `data/processed`, also logged as the `raw_data` W&B artifact

## Setup

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to imported project code take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from src.utils import load_config

# The configuration file is looked up in the current working directory.
config_path = Path.cwd().joinpath("config.yaml")
config = load_config(config_path)

In [3]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; a later cell
# reads WANDB_API_KEY from os.environ. The call's return value is displayed
# as the cell output.
load_dotenv()

True

In [4]:
# Data lives in the `data` folder inside the parent of the working directory;
# print whether that folder is present.
data_dir = Path.cwd().parent.joinpath("data")
print(data_dir.exists())

True


## Step 1: Load and transform data

In [5]:
from src.data.io import load_dataframes

# Load the transaction data from the raw folder, ordered by `tx_datetime`.
# NOTE(review): presumably load_dataframes combines every file in the
# directory into one frame; confirm in src.data.io.
tx_df = load_dataframes(
    directory=data_dir.joinpath("raw"),
    sort_by="tx_datetime",
)

# Report (rows, columns) of the loaded frame.
print(tx_df.shape)

(1754155, 9)


In [6]:
import pandas as pd

# Customer profiles come from a single CSV in the raw data folder.
cust_df = pd.read_csv(data_dir.joinpath("raw", "customer_profiles_table.csv"))

# Report (rows, columns) of the customer table.
print(cust_df.shape)

(5000, 8)


In [7]:
# Terminal profiles come from a single CSV in the raw data folder.
term_df = pd.read_csv(data_dir.joinpath("raw", "terminal_profiles_table.csv"))

# Report (rows, columns) of the terminal table.
print(term_df.shape)

(10000, 3)


In [8]:
# Output folder for the exported tables; create it (and any missing parents)
# and do nothing if it is already there.
proc_dir = data_dir.joinpath("processed")
proc_dir.mkdir(exist_ok=True, parents=True)

In [9]:
# Write each table to the processed folder as CSV, omitting the index column.
for _filename, _frame in (
    ("transactions.csv", tx_df),
    ("customers.csv", cust_df),
    ("terminals.csv", term_df),
):
    _frame.to_csv(proc_dir / _filename, index=False)

## Step 2: Upload data to artifact store

In [10]:
import wandb

In [11]:
import os

# Log in to the hosted W&B API with the key taken from the environment
# (raises KeyError if WANDB_API_KEY is not set). The call stays the last
# expression so its result is shown as the cell output.
wandb.login(
    host="https://api.wandb.ai",
    key=os.environ["WANDB_API_KEY"],
    relogin=True,
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/fpe/.netrc


True

In [12]:
# Start a W&B run for this notebook; the artifact below is logged to it.
run = wandb.init(
    job_type="data_download",
    project="fraud-detection",
)

[34m[1mwandb[0m: Currently logged in as: [33mfelixpeters[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
# Bundle the exported CSV folder into a dataset artifact named `raw_data`.
artifact = wandb.Artifact(
    name="raw_data",
    type="dataset",
)
artifact.add_dir(str(proc_dir))

[34m[1mwandb[0m: Adding directory to artifact (/Users/fpe/code/ml/fraud-detection/data/processed)... Done. 0.3s


In [14]:
# Attach the artifact to the active run; the returned artifact object is
# displayed as the cell output.
run.log_artifact(artifact)

<Artifact raw_data>

In [15]:
# Close the W&B run now that the artifact has been logged.
run.finish()

