# Cryptocurrency price history

## Get started

> **Note**: this notebook uses a Python 3 kernel.

This notebook assumes the data has already been downloaded and stored in `../data/raw`.

If it has not, run the following command first:

`python ../src/data/get_dataset.py ../data/raw`

## 1. Read data

In [4]:
from pathlib import Path
import pandas as pd
import pickle as pkl

In [2]:
def read_data():
    """Read the raw CSV files into DataFrames grouped by file kind, and cache them.

    Every file matching ``../data/raw/cryptocurrencypricehistory/*.csv`` is loaded.
    The file stem is split on its last underscore into ``(name, kind)``, e.g.
    ``bitcoin_cash_price`` -> ``("bitcoin_cash", "price")``. The name is stored in
    a new ``Crypto`` column (pandas string dtype) so each record stays identifiable.

    When at least one file was read, the result is pickled to
    ``../data/interim/crypto.pkl`` (the directory is created if missing).

    Returns:
        dict: maps each kind to a list of DataFrames, ordered by file path.

    Raises:
        ValueError: if a CSV stem contains no underscore (it cannot be split).
    """
    # Sort the matches: glob order depends on the filesystem, and later cells
    # address the tables by position (e.g. data['price'][0]).
    csv_files = sorted(Path().glob("../data/raw/cryptocurrencypricehistory/*.csv"))

    data = dict()
    for file in csv_files:
        df = pd.read_csv(file)
        filename, kind = file.stem.rsplit("_", 1)  # based on the .csv filename format

        df["Crypto"] = pd.Series([filename] * df.shape[0], dtype=pd.StringDtype())
        data.setdefault(kind, list()).append(df)

    # Write the cache once, after all files are read, and close the handle.
    # Nothing is written for an empty result, so an empty cache never shadows the raw data.
    if data:
        cache_file = Path('../data/interim/crypto.pkl')
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        with cache_file.open('wb') as cache_fh:
            pkl.dump(data, cache_fh, protocol=pkl.HIGHEST_PROTOCOL)

    return data

# Reuse the cached pickle when it exists; otherwise parse the raw CSV files.
# NOTE: unpickling can execute arbitrary code, so only load cache files written by this project.
if Path("../data/interim/crypto.pkl").is_file():
    with open('../data/interim/crypto.pkl', 'rb') as cache_fh:  # close the handle after loading
        data = pkl.load(cache_fh)
else:
    data = read_data()

In [3]:
# First feedback on the data structure: dtypes and non-null counts, then the leading rows.
first_price = data['price'][0]
first_price.info()
first_price.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        45 non-null     object 
 1   Open        45 non-null     float64
 2   High        45 non-null     float64
 3   Low         45 non-null     float64
 4   Close       45 non-null     float64
 5   Volume      45 non-null     object 
 6   Market Cap  45 non-null     object 
 7   Crypto      45 non-null     string 
dtypes: float64(4), object(3), string(1)
memory usage: 2.9+ KB


Unnamed: 0,Date,Open,High,Low,Close,Volume,Market Cap,Crypto
0,"Sep 05, 2017",514.9,550.95,458.78,541.71,338978000,8527100000,bitcoin_cash
1,"Sep 04, 2017",608.26,608.26,500.75,517.24,328957000,10072200000,bitcoin_cash
2,"Sep 03, 2017",578.27,617.41,563.59,607.43,344862000,9574520000,bitcoin_cash
3,"Sep 02, 2017",621.96,642.05,560.58,575.9,350478000,10297000000,bitcoin_cash
4,"Sep 01, 2017",588.4,645.52,586.73,622.17,393839000,9740460000,bitcoin_cash


## 2. Clean Data

In [None]:
pd.set_option("mode.chained_assignment", None)  # silence the chained-assignment warning (default is 'warn')

def parse_num(x):
    """Remove thousands separators (commas) from a string.

    Args:
        x: any value. Strings, including str subclasses such as ``numpy.str_``,
            have their commas stripped.

    Returns:
        The string without commas, or ``x`` itself when it is not a string.
    """
    if isinstance(x, str):
        return x.replace(',', '')
    return x

def clean(df):
    """Return a cleaned copy of a price table; the input frame is left untouched.

    Steps:
      1. Parse ``Date`` (format ``"%b %d, %Y"``, e.g. ``"Sep 05, 2017"``) to datetime.
      2. Drop rows whose ``Market Cap`` or ``Volume`` is the placeholder ``'-'``.
      3. Strip thousands separators from ``Volume`` and ``Market Cap`` and cast
         both columns to ``int64``.

    Args:
        df: DataFrame with ``Date``, ``Volume`` and ``Market Cap`` columns.

    Returns:
        A new DataFrame. The original row labels of the kept rows are preserved,
        so the index may have gaps.
    """
    # assign() returns a new frame, so the caller's object is never modified and
    # no column is ever set on a slice (the cause of SettingWithCopyWarning).
    parsed = df.assign(Date=pd.to_datetime(df["Date"], format="%b %d, %Y"))
    has_values = (parsed["Market Cap"] != '-') & (parsed["Volume"] != '-')
    kept = parsed.loc[has_values]
    return kept.assign(**{
        "Volume": kept["Volume"].apply(parse_num).astype('int64'),
        "Market Cap": kept["Market Cap"].apply(parse_num).astype('int64'),
    })

# Clean every price table, then re-check the dtypes of the first one.
data['price'] = [clean(price_df) for price_df in data['price']]
data['price'][0].info()


### 2.2 Generate Profile Report
So far we have cleaned the data only by inspecting the format of the DataFrames.
To dig deeper, we now generate a complete profiling report for each table, which gives much more information.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

# Root directory for generated reports and figures. Other cells build paths by
# plain string concatenation (REP_PATH + ...), so keep the trailing slash.
REP_PATH = "../reports/"

In [None]:
def report(df, output, sub_dir=""):
    """Profile ``df`` with pandas-profiling and write the report as HTML and JSON.

    Args:
        df: DataFrame to profile.
        output: name used in the report title, as the output directory name and
            as the stem of both written files.
        sub_dir: optional path segment appended to ``REP_PATH`` ahead of ``output``.
    """
    title = f"Pandas Profiling Report {output}"
    profile = ProfileReport(df, title=title, explorative=False, progress_bar=False)
    profile.to_widgets()

    # Files are written to <REP_PATH><sub_dir>/<output>/<output>.html and .json
    target_dir = Path(REP_PATH + sub_dir) / output
    target_dir.mkdir(parents=True, exist_ok=True)
    for extension in (".html", ".json"):
        profile.to_file(target_dir / (output + extension))

# Profile every price table. A plain loop is used because report() is called
# only for its side effects; a comprehension would build and display a list of None.
# .iloc[0] reads the first row by position: the row labelled 0 may no longer
# exist after clean() has dropped rows.
for price_df in data['price']:
    report(price_df, price_df['Crypto'].iloc[0], "price")


In [None]:
# The tables have the same format and relate to the same info, so stack them row-wise.
# ignore_index=True gives the combined frame unique row labels; otherwise each
# table's labels repeat, which can break hue-based seaborn plots.
dff = pd.concat(data['price'], axis=0, ignore_index=True)

In [None]:
# Pairwise relationships between the columns, coloured by currency.
sns_plot = sns.pairplot(dff, hue="Crypto")
# savefig does not create missing directories, so make sure the target exists.
Path(REP_PATH + "figures/").mkdir(parents=True, exist_ok=True)
sns_plot.savefig(REP_PATH + "figures/" + "crypto.png")

In [None]:
# Closing price over time, one line per currency on shared axes.
fig, ax = plt.subplots()
# Group by the bare column name: with a one-element list, pandas 2.x yields
# 1-tuples as keys, which would show up as "('bitcoin',)" in the legend.
for key, grp in dff.groupby('Crypto'):
    grp.plot(ax=ax, kind='line', x='Date', y='Close', label=key)
ax.set(title="Closing price by cryptocurrency", ylabel="Close")

# savefig does not create missing directories, so make sure the target exists.
Path(REP_PATH + "figures/").mkdir(parents=True, exist_ok=True)
fig.savefig(REP_PATH + "figures/" + "crypto2.png")

In [None]:
# Take the first table of the 'dataset' kind and inspect it before cleaning:
# column dtypes and non-null counts, then the leading rows.
df_bitcoin = data['dataset'][0]
df_bitcoin.info()
df_bitcoin.head()

In [None]:
# Parse the timestamps and give the price column a generic name.
# assign() returns a new frame, so nothing is modified in place.
df_bitcoin = (
    df_bitcoin
    .assign(Date=pd.to_datetime(df_bitcoin["Date"], format="%Y-%m-%d %H:%M:%S"))
    .rename(columns={'btc_market_price': 'Price'})
)
# Show the per-column NaN counts before filling them; a bare expression in the
# middle of a cell is evaluated but never displayed.
print(df_bitcoin.isna().sum())
df_bitcoin = df_bitcoin.fillna(0)

# Store the cleaned table back into the shared structure and preview it.
data['dataset'][0] = df_bitcoin
data['dataset'][0].head()

In [None]:
# Take the second table of the 'dataset' kind and inspect it before cleaning:
# column dtypes and non-null counts, then the leading rows.
df_ethereum = data['dataset'][1]
df_ethereum.info()
df_ethereum.head()

In [None]:
# Rename the date and price columns to the generic names 'Date' and 'Price'.
df_ethereum = df_ethereum.rename(columns={'Date(UTC)': 'Date', 'eth_etherprice': 'Price'})
df_ethereum = df_ethereum.assign(Date=pd.to_datetime(df_ethereum["Date"], format="%m/%d/%Y"))
# Show the per-column NaN counts before filling them; a bare expression in the
# middle of a cell is evaluated but never displayed.
print(df_ethereum.isna().sum())
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df_ethereum = df_ethereum.drop(columns='eth_chaindatasize', errors='ignore')
df_ethereum = df_ethereum.fillna(0)

# Store the cleaned table back into the shared structure and re-check its dtypes.
data['dataset'][1] = df_ethereum
data['dataset'][1].info()

In [None]:
# Profile every 'dataset' table. A plain loop is used because report() is called
# only for its side effects; a comprehension would build and display a list of None.
# .iloc[0] reads the first row by position rather than by the label 0.
for dataset_df in data['dataset']:
    report(dataset_df, dataset_df['Crypto'].iloc[0], "dataset")

In [None]:
# TODO: this needs to change
dff = pd.concat(data['dataset'])  # stacks row-wise; axis=0 is concat's default
dff.info()
dff.head()

In [None]:
# Price over time, one line per currency on shared axes.
fig, ax = plt.subplots()
# Group by the bare column name: with a one-element list, pandas 2.x yields
# 1-tuples as keys, which would show up as "('bitcoin',)" in the legend.
for key, grp in dff.groupby('Crypto'):
    grp.plot(ax=ax, kind='line', x='Date', y='Price', label=key)
ax.set(title="Price comparison by cryptocurrency", ylabel="Price")

# savefig does not create missing directories, so make sure the target exists.
Path(REP_PATH + "figures/").mkdir(parents=True, exist_ok=True)
fig.savefig(REP_PATH + "figures/" + "price_comparison_btc_eth.png")

In [None]:
# Save the cleaned data. The target directory is created if missing, and the
# with-block closes the file handle once the dump has finished.
processed_file = Path('../data/processed/crypto.pkl')
processed_file.parent.mkdir(parents=True, exist_ok=True)
with processed_file.open('wb') as processed_fh:
    pkl.dump(data, processed_fh, protocol=pkl.HIGHEST_PROTOCOL)