# Demo notebook cleaning scraped data

Data was scraped from the website https://www.coingecko.com. Because of the scraping, all values are strings and contain some unwanted characters. This notebook demonstrates some common cleaning steps.

In [56]:
import pandas as pd
import numpy as np

In [69]:
# Load the scraped coin table: skip the file's first row and apply our own
# column labels instead. 'x' and 'x3' are throwaway columns.
colnames = [
    'x', 'Coin', 'Price', '1h', '24h', '7d',
    '24Volume', 'Mkt Cap', 'FDV', 'MarktcapFDV', 'x3',
]
df = pd.read_csv('../Coin.csv', skiprows=1, names=colnames)
# Move whatever pandas used as the index into ordinary columns
df = df.reset_index()

In [70]:
# Check the file: preview the first rows to confirm it loaded as expected
df.head()

Unnamed: 0,level_0,level_1,x,Coin,Price,1h,24h,7d,24Volume,Mkt Cap,FDV,MarktcapFDV,x3
0,0,,1,Bitcoin BTC,"$28,919.33",0.0%,-1.5%,-3.3%,"$23,383,926,918","$559,386,921,097","$607,040,903,953",0.92,
1,1,,2,Ethereum ETH,"$1,954.65",0.2%,-2.4%,1.8%,"$15,145,886,450","$235,267,461,213","$235,267,461,213",1.0,
2,2,,3,Tether USDT,$1.00,-0.1%,-0.4%,-0.1%,"$39,351,843,271","$81,374,101,359","$81,374,101,359",1.0,
3,3,,4,BNB BNB,$325.55,0.1%,-1.2%,1.6%,"$926,550,242","$51,369,472,670","$65,071,483,944",0.79,
4,4,,5,USD Coin USDC,$0.999955,-0.0%,-0.5%,0.0%,"$5,052,260,808","$31,248,083,356","$31,248,083,356",1.0,


In [71]:
# Remove columns: keep only Coin .. MarktcapFDV, dropping the index leftovers
# and the placeholder columns 'x' and 'x3'.
# Selecting by label instead of by position (iloc[:, 3:-1]) keeps this cell
# idempotent: re-running it does not strip further columns.
df = df.loc[:, 'Coin':'MarktcapFDV']

In [72]:
# Check the datatypes: the scraped values are expected to show up as object (string) columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Coin         100 non-null    object
 1   Price        100 non-null    object
 2   1h           100 non-null    object
 3   24h          100 non-null    object
 4   7d           100 non-null    object
 5   24Volume     100 non-null    object
 6   Mkt Cap      100 non-null    object
 7   FDV          100 non-null    object
 8   MarktcapFDV  100 non-null    object
dtypes: object(9)
memory usage: 7.2+ KB


In [73]:
# If we try to change the objects to floats right away, we get an error. We first need to remove the $, % and , signs
# df.iloc[:,1:].astype(float)

In [74]:
# Use a regular expression to strip the $, % and , characters from every cell
# (regex101.com is handy for testing patterns).
# The pattern is a raw string with the characters left unescaped: inside [...]
# they are already literal, and '\$' / '\%' in a normal string are invalid
# escape sequences that newer Python versions warn about.
df = df.replace(r'[$%,]', '', regex=True)

In [75]:
# Preview the result: the $, % and , characters should be gone
df.head()

Unnamed: 0,Coin,Price,1h,24h,7d,24Volume,Mkt Cap,FDV,MarktcapFDV
0,Bitcoin BTC,28919.33,0.0,-1.5,-3.3,23383926918,559386921097,607040903953,0.92
1,Ethereum ETH,1954.65,0.2,-2.4,1.8,15145886450,235267461213,235267461213,1.0
2,Tether USDT,1.0,-0.1,-0.4,-0.1,39351843271,81374101359,81374101359,1.0
3,BNB BNB,325.55,0.1,-1.2,1.6,926550242,51369472670,65071483944,0.79
4,USD Coin USDC,0.999955,-0.0,-0.5,0.0,5052260808,31248083356,31248083356,1.0


In [77]:
# The values still contain - signs, and casting still fails at this point:
# df.iloc[:,1:].astype(float) # gives error
# NOTE(review): a minus in front of a number (e.g. '-1.5') parses fine with
# float(); the values that actually break the cast turn out to be the bare '-'
# placeholders found in the FDV column further down.


# Complicated way to approach this:
# build a boolean mask (True/False per row) marking where 1h contains a minus
m_neg = df['1h'].str.contains('-', regex=False)
m_neg

0     False
1     False
2      True
3     False
4      True
      ...  
95     True
96     True
97    False
98    False
99    False
Name: 1h, Length: 100, dtype: bool

In [78]:
# Remove the - sign so only the magnitude is left as text (the sign itself is
# remembered in the m_neg mask).
# regex=False makes this a plain literal replacement regardless of the
# pandas version's default for `regex`.
df['1h'] = df['1h'].str.replace('-', '', regex=False)
df.head()

Unnamed: 0,Coin,Price,1h,24h,7d,24Volume,Mkt Cap,FDV,MarktcapFDV
0,Bitcoin BTC,28919.33,0.0,-1.5,-3.3,23383926918,559386921097,607040903953,0.92
1,Ethereum ETH,1954.65,0.2,-2.4,1.8,15145886450,235267461213,235267461213,1.0
2,Tether USDT,1.0,0.1,-0.4,-0.1,39351843271,81374101359,81374101359,1.0
3,BNB BNB,325.55,0.1,-1.2,1.6,926550242,51369472670,65071483944,0.79
4,USD Coin USDC,0.999955,0.0,-0.5,0.0,5052260808,31248083356,31248083356,1.0


In [80]:
# Type cast the text to float. abs() is a no-op on the first run (the minus
# signs were already stripped) but makes the cell safe to re-run: without it a
# second execution would flip the already-negative rows back to positive.
df['1h'] = df['1h'].astype(float).abs()
# On the True locations of the mask, multiply with -1 to restore the sign
df.loc[m_neg, "1h"] *= -1
df.head()

Unnamed: 0,Coin,Price,1h,24h,7d,24Volume,Mkt Cap,FDV,MarktcapFDV
0,Bitcoin BTC,28919.33,0.0,-1.5,-3.3,23383926918,559386921097,607040903953,0.92
1,Ethereum ETH,1954.65,0.2,-2.4,1.8,15145886450,235267461213,235267461213,1.0
2,Tether USDT,1.0,-0.1,-0.4,-0.1,39351843271,81374101359,81374101359,1.0
3,BNB BNB,325.55,0.1,-1.2,1.6,926550242,51369472670,65071483944,0.79
4,USD Coin USDC,0.999955,-0.0,-0.5,0.0,5052260808,31248083356,31248083356,1.0


In [81]:
# Simple solution: cast every column except Coin to float, column by column.
# errors='ignore' suppresses the exception, but a column that cannot be parsed
# is returned unchanged (still object) -- so always verify the dtypes afterwards.
# The result is assigned by column label, which replaces the columns. Assigning
# through df.iloc[:, 1:] = ... writes into the existing object columns on
# pandas >= 2.0 and would leave the text columns as object.
numeric_cols = list(df.columns[1:])
df[numeric_cols] = df[numeric_cols].apply(lambda col: col.astype(float, errors='ignore'))
df.head()

Unnamed: 0,Coin,Price,1h,24h,7d,24Volume,Mkt Cap,FDV,MarktcapFDV
0,Bitcoin BTC,28919.33,0.0,-1.5,-3.3,23383930000.0,559386900000.0,607040903953,0.92
1,Ethereum ETH,1954.65,0.2,-2.4,1.8,15145890000.0,235267500000.0,235267461213,1.0
2,Tether USDT,1.0,-0.1,-0.4,-0.1,39351840000.0,81374100000.0,81374101359,1.0
3,BNB BNB,325.55,0.1,-1.2,1.6,926550200.0,51369470000.0,65071483944,0.79
4,USD Coin USDC,0.999955,-0.0,-0.5,0.0,5052261000.0,31248080000.0,31248083356,1.0


In [82]:
# Check if the cast succeeded: look for columns that are still object instead of float64
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Coin         100 non-null    object 
 1   Price        100 non-null    float64
 2   1h           100 non-null    float64
 3   24h          100 non-null    float64
 4   7d           100 non-null    float64
 5   24Volume     100 non-null    float64
 6   Mkt Cap      100 non-null    float64
 7   FDV          100 non-null    object 
 8   MarktcapFDV  100 non-null    object 
dtypes: float64(6), object(3)
memory usage: 7.2+ KB


In [83]:
# Something in the column blocks the type casting; count the distinct values to find it
df['FDV'].value_counts()

-               22
607040903953     1
2285509216       1
1071118974       1
1176348369       1
                ..
11357733069      1
2640177849       1
4898742428       1
2950243064       1
515912447        1
Name: FDV, Length: 79, dtype: int64

In [None]:
# It is the - sign: a bare '-' appears 22 times

In [86]:
# Convert FDV and MarktcapFDV to float. pd.to_numeric(errors='coerce') turns
# every cell that is not a valid number (here the bare '-' placeholder) into
# NaN, while still accepting legitimate values such as negatives, which an
# "anything but digits and dots" pattern would wipe out.
# The trailing astype(float) guarantees float64 even if a column has no NaN.
fdv_cols = ['FDV', 'MarktcapFDV']
df[fdv_cols] = df[fdv_cols].apply(pd.to_numeric, errors='coerce').astype(float)

In [87]:
# Verify the result: FDV and MarktcapFDV should now be float64, with fewer non-null entries where '-' became NaN
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Coin         100 non-null    object 
 1   Price        100 non-null    float64
 2   1h           100 non-null    float64
 3   24h          100 non-null    float64
 4   7d           100 non-null    float64
 5   24Volume     100 non-null    float64
 6   Mkt Cap      100 non-null    float64
 7   FDV          78 non-null     float64
 8   MarktcapFDV  78 non-null     float64
dtypes: float64(8), object(1)
memory usage: 7.2+ KB


In [88]:
# Final look at the first ten rows of the cleaned table
df.head(10) 

Unnamed: 0,Coin,Price,1h,24h,7d,24Volume,Mkt Cap,FDV,MarktcapFDV
0,Bitcoin BTC,28919.33,0.0,-1.5,-3.3,23383930000.0,559386900000.0,607040900000.0,0.92
1,Ethereum ETH,1954.65,0.2,-2.4,1.8,15145890000.0,235267500000.0,235267500000.0,1.0
2,Tether USDT,1.0,-0.1,-0.4,-0.1,39351840000.0,81374100000.0,81374100000.0,1.0
3,BNB BNB,325.55,0.1,-1.2,1.6,926550200.0,51369470000.0,65071480000.0,0.79
4,USD Coin USDC,0.999955,-0.0,-0.5,0.0,5052261000.0,31248080000.0,31248080000.0,1.0
5,XRP XRP,0.495314,0.0,0.1,-2.0,1889694000.0,25599990000.0,49467800000.0,0.52
6,Cardano ADA,0.419258,0.0,1.1,3.4,541454100.0,14675470000.0,18844220000.0,0.78
7,Dogecoin DOGE,0.091924,0.1,4.1,10.1,1807889000.0,12807510000.0,,
8,Lido Staked Ether STETH,1950.49,0.1,-2.7,2.0,37365760.0,11833950000.0,11833950000.0,1.0
9,Polygon MATIC,1.09,0.1,-1.9,-1.1,509022900.0,10059490000.0,10911140000.0,0.92
