# source: [Ethereum Historical data](https://www.kaggle.com/datasets/themrityunjaypathak/ethereum-historical-data)

### Lib imports 

In [71]:
import pandas as pd 
import re 
import numpy as np



In [74]:
# Location of the Ethereum price CSV (GitHub raw file); pandas reads it directly over HTTP.
# A plain string literal is enough here: the URL contains no placeholders to interpolate.
url = "https://raw.githubusercontent.com/dsirepos/data/main/ethereum_price.csv"
df = pd.read_csv(url)

In [75]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,"Mar 08, 2023",1553.49,1561.79,1569.7,1548.98,498.57K,-0.53%
1,"Mar 07, 2023",1561.78,1565.84,1580.95,1536.31,460.10K,-0.26%
2,"Mar 06, 2023",1565.84,1564.36,1581.13,1555.43,322.16K,0.09%
3,"Mar 05, 2023",1564.37,1566.73,1587.95,1556.84,313.01K,-0.15%
4,"Mar 04, 2023",1566.73,1569.45,1577.02,1550.1,247.02K,-0.14%


In [76]:
# Summary of the frame: row count, column names, non-null counts and dtypes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2555 entries, 0 to 2554
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      2555 non-null   object
 1   Price     2555 non-null   object
 2   Open      2555 non-null   object
 3   High      2555 non-null   object
 4   Low       2555 non-null   object
 5   Vol.      2555 non-null   object
 6   Change %  2555 non-null   object
dtypes: object(7)
memory usage: 139.9+ KB


## Data Pre-processing 

In [77]:
# Column name fix: lower-case, trim and snake_case every header, then give
# 'vol.' and 'change_%' readable names.

df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')

readable_names = {'vol.' : 'volume', "change_%" : "percentage_change"}
df = df.rename(columns=readable_names)
df.columns

Index(['date', 'price', 'open', 'high', 'low', 'volume', 'percentage_change'], dtype='object')

In [78]:
# dtype change: parse the 'date' strings into pandas datetime values

df['date'] = pd.to_datetime(df['date'])

df.head(n=5)

Unnamed: 0,date,price,open,high,low,volume,percentage_change
0,2023-03-08,1553.49,1561.79,1569.7,1548.98,498.57K,-0.53%
1,2023-03-07,1561.78,1565.84,1580.95,1536.31,460.10K,-0.26%
2,2023-03-06,1565.84,1564.36,1581.13,1555.43,322.16K,0.09%
3,2023-03-05,1564.37,1566.73,1587.95,1556.84,313.01K,-0.15%
4,2023-03-04,1566.73,1569.45,1577.02,1550.1,247.02K,-0.14%


In [79]:
# Remove thousands separators (',') from the four price columns.
# regex=False pins literal matching, so the result does not depend on the
# pandas version's default for `regex`.
# NOTE: the columns still hold strings after this step; no numeric cast happens here.
for price_col in ('price', 'open', 'high', 'low'):
    df[price_col] = df[price_col].str.replace(',', '', regex=False)

# Remove the percent sign, e.g. '-0.53%' -> '-0.53' (still a string).
df['percentage_change'] = df['percentage_change'].str.replace('%', '', regex=False)

def normalize_volume(vol):
    """Convert an abbreviated volume such as '498.57K' into a plain float.

    Parameters
    ----------
    vol : str or number
        A number with an optional 'K' (x1,000), 'M' (x1,000,000) or
        'B' (x1,000,000,000) suffix, or '-' as a placeholder for a missing
        value. Values that are already numeric are passed through, so
        applying the function to an already-converted column is harmless.

    Returns
    -------
    float
        The expanded value, or NaN for '-', an empty string, None or NaN.

    Raises
    ------
    ValueError
        If the numeric part of a string cannot be parsed by ``float``.
    """
    suffix_multipliers = {'K': 1000, 'M': 1_000_000, 'B': 1_000_000_000}

    # Non-string input: either a missing marker (None / NaN / pd.NA) or a
    # number produced by an earlier run of this conversion.
    if not isinstance(vol, str):
        return np.nan if pd.isna(vol) else float(vol)

    text = vol.strip()
    # Guard before indexing text[-1]: '' would raise IndexError, and '-' is
    # the placeholder for a missing value.
    if text in ('', '-'):
        return np.nan  # np.NaN was removed in NumPy 2.0; np.nan works everywhere

    multiplier = suffix_multipliers.get(text[-1])
    if multiplier is None:
        # No recognised suffix: the whole string is the number.
        return float(text)
    return float(text[:-1]) * multiplier


# Expand every abbreviated volume ('498.57K', ...) element-wise, then preview the result
df['volume'] = df['volume'].map(normalize_volume)

df.head(n=5)

Unnamed: 0,date,price,open,high,low,volume,percentage_change
0,2023-03-08,1553.49,1561.79,1569.7,1548.98,498570.0,-0.53
1,2023-03-07,1561.78,1565.84,1580.95,1536.31,460100.0,-0.26
2,2023-03-06,1565.84,1564.36,1581.13,1555.43,322160.0,0.09
3,2023-03-05,1564.37,1566.73,1587.95,1556.84,313010.0,-0.15
4,2023-03-04,1566.73,1569.45,1577.02,1550.1,247020.0,-0.14


In [69]:
# Count missing values per column ('-' volumes become NaN in normalize_volume).
# NOTE(review): the saved output of this cell carries an earlier execution count
# (In [69]) and lists a 'volume_normalized' column that no visible cell creates;
# restart the kernel and run all cells to refresh it.
df.isna().sum(axis=0)

date                 0
price                0
open                 0
high                 0
low                  0
volume               0
percentage_change    0
volume_normalized    8
dtype: int64

### Missing data check 