# Pandas Data Cleaning

Some methods: `.isnull()`, `.drop()`, `.fillna()`, `._get_numeric_data()`, `.astype()`, `.str.replace()`, `.drop_duplicates()`, `.apply()`

---

In [1]:
import pandas as pd

---

### Working with `null` values

__NOTE:__ `NaN` is an alias for `null` in Pandas. `NaN` is used to represent missing or undefined numeric data in Pandas, while `None` should be, theoretically, used to represent missing or undefined non-numeric data. However, while `NaN` may not always be a meaningful representation of missing or undefined non-numeric data, it is a convenient and efficient way to represent missing or undefined data in a unified way across all data types in Pandas.

In [2]:
# https://www.kaggle.com/vagifa/ethereum-frauddetection-dataset


df = pd.read_csv('datasets/transaction_dataset.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,Index,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,...,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name,ERC20 most sent token type,ERC20_most_rec_token_type
0,0,1,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,...,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire
1,1,2,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,...,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token
2,2,3,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,,XENON
3,3,4,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,0,10219.6,15785.09,397555.9,25,9,0,...,100.0,9029.231,3804.076893,0.0,0.0,0.0,1.0,11.0,Raiden,XENON
4,4,5,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,...,0.0,45000.0,13726.65922,0.0,0.0,0.0,6.0,27.0,StatusNetwork,EOS


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 51 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Unnamed: 0                                            9841 non-null   int64  
 1   Index                                                 9841 non-null   int64  
 2   Address                                               9841 non-null   object 
 3   FLAG                                                  9841 non-null   int64  
 4   Avg min between sent tnx                              9841 non-null   float64
 5   Avg min between received tnx                          9841 non-null   float64
 6   Time Diff between first and last (Mins)               9841 non-null   float64
 7   Sent tnx                                              9841 non-null   int64  
 8   Received Tnx                                          9841

In [5]:
df.isnull().head()

Unnamed: 0.1,Unnamed: 0,Index,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,...,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name,ERC20 most sent token type,ERC20_most_rec_token_type
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
df.isna().head()

Unnamed: 0.1,Unnamed: 0,Index,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,...,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name,ERC20 most sent token type,ERC20_most_rec_token_type
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
nan_cols = df.isna().sum()

In [10]:
nan_cols

Unnamed: 0                                                0
Index                                                     0
Address                                                   0
FLAG                                                      0
Avg min between sent tnx                                  0
Avg min between received tnx                              0
Time Diff between first and last (Mins)                   0
Sent tnx                                                  0
Received Tnx                                              0
Number of Created Contracts                               0
Unique Received From Addresses                            0
Unique Sent To Addresses                                  0
min value received                                        0
max value received                                        0
avg val received                                          0
min val sent                                              0
max val sent                            

In [9]:
nan_cols[nan_cols>0]  # valores nulos por columna

 Total ERC20 tnxs                       829
 ERC20 total Ether received             829
 ERC20 total ether sent                 829
 ERC20 total Ether sent contract        829
 ERC20 uniq sent addr                   829
 ERC20 uniq rec addr                    829
 ERC20 uniq sent addr.1                 829
 ERC20 uniq rec contract addr           829
 ERC20 avg time between sent tnx        829
 ERC20 avg time between rec tnx         829
 ERC20 avg time between rec 2 tnx       829
 ERC20 avg time between contract tnx    829
 ERC20 min val rec                      829
 ERC20 max val rec                      829
 ERC20 avg val rec                      829
 ERC20 min val sent                     829
 ERC20 max val sent                     829
 ERC20 avg val sent                     829
 ERC20 min val sent contract            829
 ERC20 max val sent contract            829
 ERC20 avg val sent contract            829
 ERC20 uniq sent token name             829
 ERC20 uniq rec token name      

---

In [12]:
nan_cols = nan_cols[nan_cols>0] / len(df) * 100


nan_cols # valores nulos por columna en porcentaje

 Total ERC20 tnxs                       8.423941
 ERC20 total Ether received             8.423941
 ERC20 total ether sent                 8.423941
 ERC20 total Ether sent contract        8.423941
 ERC20 uniq sent addr                   8.423941
 ERC20 uniq rec addr                    8.423941
 ERC20 uniq sent addr.1                 8.423941
 ERC20 uniq rec contract addr           8.423941
 ERC20 avg time between sent tnx        8.423941
 ERC20 avg time between rec tnx         8.423941
 ERC20 avg time between rec 2 tnx       8.423941
 ERC20 avg time between contract tnx    8.423941
 ERC20 min val rec                      8.423941
 ERC20 max val rec                      8.423941
 ERC20 avg val rec                      8.423941
 ERC20 min val sent                     8.423941
 ERC20 max val sent                     8.423941
 ERC20 avg val sent                     8.423941
 ERC20 min val sent contract            8.423941
 ERC20 max val sent contract            8.423941
 ERC20 avg val sent 

---

#### Droping columns

In [14]:
nan_cols.index

Index([' Total ERC20 tnxs', ' ERC20 total Ether received',
       ' ERC20 total ether sent', ' ERC20 total Ether sent contract',
       ' ERC20 uniq sent addr', ' ERC20 uniq rec addr',
       ' ERC20 uniq sent addr.1', ' ERC20 uniq rec contract addr',
       ' ERC20 avg time between sent tnx', ' ERC20 avg time between rec tnx',
       ' ERC20 avg time between rec 2 tnx',
       ' ERC20 avg time between contract tnx', ' ERC20 min val rec',
       ' ERC20 max val rec', ' ERC20 avg val rec', ' ERC20 min val sent',
       ' ERC20 max val sent', ' ERC20 avg val sent',
       ' ERC20 min val sent contract', ' ERC20 max val sent contract',
       ' ERC20 avg val sent contract', ' ERC20 uniq sent token name',
       ' ERC20 uniq rec token name', ' ERC20 most sent token type',
       ' ERC20_most_rec_token_type'],
      dtype='object')

In [16]:
len(nan_cols.index)

25

In [17]:
df.drop(columns = nan_cols.index, inplace=True)

In [19]:
nan_cols = df.isna().sum()

nan_cols[nan_cols>0]

Series([], dtype: int64)

In [20]:
df.head()

Unnamed: 0.1,Unnamed: 0,Index,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,...,max val sent,avg val sent,min value sent to contract,max val sent to contract,avg value sent to contract,total transactions (including tnx to create contract,total Ether sent,total ether received,total ether sent contracts,total ether balance
0,0,1,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,...,31.22,1.200681,0.0,0.0,0.0,810,865.691093,586.466675,0.0,-279.224419
1,1,2,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,...,1.8,0.032844,0.0,0.0,0.0,102,3.087297,3.085478,0.0,-0.001819
2,2,3,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,...,3.538616,1.794308,0.0,0.0,0.0,12,3.588616,3.589057,0.0,0.000441
3,3,4,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,0,10219.6,15785.09,397555.9,25,9,0,...,450.0,70.001834,0.0,0.0,0.0,34,1750.045862,895.399559,0.0,-854.646303
4,4,5,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,...,9.0,0.022688,0.0,0.0,0.0,4619,104.318883,53.421897,0.0,-50.896986


---

#### Filling nulls

In [22]:
pd.set_option('display.max_columns', None)

df = pd.read_csv('datasets/vehicles.csv')

df.head()

  df = pd.read_csv('datasets/vehicles.csv')


Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0,0.0,Two Seaters,1985,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0.0,0.0,0.0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,(GUZZLER),-1,3050,0,Regular,Regular Gasoline,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0,0.0,19.0,0.0,Two Seaters,1985,-8500,T,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0.0,0.0,0.0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,2100,(FFS),-1,1250,0,Regular,Regular Gasoline,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,Dodge,Charger,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0,0.0,47.0,0.0,Subcompact Cars,1985,500,,SIL,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0.0,0.0,0.0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,Rear-Wheel Drive,2850,,-1,3050,0,Regular,Regular Gasoline,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,Dodge,B150/B250 Wagon 2WD,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 3-spd,12.2222,0.0,16.6667,0.0,Vans,1985,-8500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0.0,0.0,0.0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,"(FFS,TRBO)",-1,2150,0,Premium,Premium Gasoline,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,21.0,0.0,32.0,0.0,Compact Cars,1993,-4000,,,T,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0.0,0.0,0.0


> __But first, let's do some serious Slicing!!!__

In [23]:
nan_cols = df.isna().sum()

nan_cols[nan_cols>0]

cylinders        38
displ            37
drive          1189
eng_dscr      13312
trany            11
guzzler       23896
trans_dscr    13478
tCharger      22688
sCharger      25068
atvType       23543
fuelType2     24516
rangeA        24521
evMotor       25240
mfrCode       23397
c240Dscr      25479
c240bDscr     25479
startStop     24277
phevCity          1
phevHwy           1
phevComb          1
dtype: int64

In [27]:
# dame las filas donde cylinders es nulo, y luego las columnas ['make', 'cylinders', 'displ', 'fuelType']

df.loc[df['cylinders'].isna(), ['make', 'model', 'cylinders', 'displ', 'fuelType']]

Unnamed: 0,make,model,cylinders,displ,fuelType
7138,Nissan,Altra EV,,,Electricity
7139,Toyota,RAV4 EV,,,Electricity
8143,Toyota,RAV4 EV,,,Electricity
8144,Ford,Th!nk,,,Electricity
8146,Ford,Explorer USPS Electric,,,Electricity
8147,Nissan,Hyper-Mini,,,Electricity
9212,Toyota,RAV4 EV,,,Electricity
9213,Ford,Explorer USPS Electric,,,Electricity
10329,Toyota,RAV4 EV,,,Electricity
21413,Subaru,RX Turbo,,,Regular


In [26]:
df[['make', 'cylinders', 'displ', 'fuelType']].head()

Unnamed: 0,make,cylinders,displ,fuelType
0,Alfa Romeo,4.0,2.0,Regular
1,Ferrari,12.0,4.9,Regular
2,Dodge,4.0,2.2,Regular
3,Dodge,8.0,5.2,Regular
4,Subaru,4.0,2.2,Premium


In [33]:
df[['cylinders', 'displ']] = df[['cylinders', 'displ']].fillna(0)  # rellena con un valor

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25479 entries, 0 to 25478
Data columns (total 83 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   barrels08        25479 non-null  float64
 1   barrelsA08       25479 non-null  float64
 2   charge120        25479 non-null  float64
 3   charge240        25479 non-null  float64
 4   city08           25479 non-null  int64  
 5   city08U          25479 non-null  float64
 6   cityA08          25479 non-null  int64  
 7   cityA08U         25479 non-null  float64
 8   cityCD           25479 non-null  float64
 9   cityE            25479 non-null  float64
 10  cityUF           25479 non-null  float64
 11  co2              25479 non-null  int64  
 12  co2A             25479 non-null  int64  
 13  co2TailpipeAGpm  25479 non-null  float64
 14  co2TailpipeGpm   25479 non-null  float64
 15  comb08           25479 non-null  int64  
 16  comb08U          25479 non-null  float64
 17  combA08     

In [35]:
nan_cols = df.isna().sum()

nan_cols[nan_cols>0]

drive          1189
eng_dscr      13312
trany            11
guzzler       23896
trans_dscr    13478
tCharger      22688
sCharger      25068
atvType       23543
fuelType2     24516
rangeA        24521
evMotor       25240
mfrCode       23397
c240Dscr      25479
c240bDscr     25479
startStop     24277
phevCity          1
phevHwy           1
phevComb          1
dtype: int64

In [45]:
# media de cilindrada para coches gasolina, salvo el Subaru

df[(df.fuelType=='Regular') | (df.fuelType=='Premium') & (df.make!='Subaru')].displ.mean()

3.293657729509588

In [48]:
df[(df.fuelType=='Regular') | (df.fuelType=='Premium') & (df.make!='Subaru')]['displ'].mean()

3.293657729509588

In [46]:
# media de cilindrada para coches gasolina

df[(df.fuelType=='Regular') | (df.fuelType=='Premium')].displ.mean()

3.289132085020243

In [44]:
# para filtrar, se usan bitwise operators, and=& , or=|

df.loc[df.displ==0, ['make', 'model', 'cylinders', 'displ', 'fuelType']]

Unnamed: 0,make,model,cylinders,displ,fuelType
7138,Nissan,Altra EV,0.0,0.0,Electricity
7139,Toyota,RAV4 EV,0.0,0.0,Electricity
8143,Toyota,RAV4 EV,0.0,0.0,Electricity
8144,Ford,Th!nk,0.0,0.0,Electricity
8146,Ford,Explorer USPS Electric,0.0,0.0,Electricity
8147,Nissan,Hyper-Mini,0.0,0.0,Electricity
9212,Toyota,RAV4 EV,0.0,0.0,Electricity
9213,Ford,Explorer USPS Electric,0.0,0.0,Electricity
10329,Toyota,RAV4 EV,0.0,0.0,Electricity
21413,Subaru,RX Turbo,0.0,0.0,Regular


In [50]:
df.columns = [e.replace(' ', '_').lower() for e in df.columns]

df.columns

Index(['barrels08', 'barrelsa08', 'charge120', 'charge240', 'city08',
       'city08u', 'citya08', 'citya08u', 'citycd', 'citye', 'cityuf', 'co2',
       'co2a', 'co2tailpipeagpm', 'co2tailpipegpm', 'comb08', 'comb08u',
       'comba08', 'comba08u', 'combe', 'combinedcd', 'combineduf', 'cylinders',
       'displ', 'drive', 'engid', 'eng_dscr', 'fescore', 'fuelcost08',
       'fuelcosta08', 'fueltype', 'fueltype1', 'ghgscore', 'ghgscorea',
       'highway08', 'highway08u', 'highwaya08', 'highwaya08u', 'highwaycd',
       'highwaye', 'highwayuf', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',
       'model', 'mpgdata', 'phevblended', 'pv2', 'pv4', 'range', 'rangecity',
       'rangecitya', 'rangehwy', 'rangehwya', 'trany', 'ucity', 'ucitya',
       'uhighway', 'uhighwaya', 'vclass', 'year', 'yousavespend', 'guzzler',
       'trans_dscr', 'tcharger', 'scharger', 'atvtype', 'fueltype2', 'rangea',
       'evmotor', 'mfrcode', 'c240dscr', 'charge240b', 'c240bdscr',
       'createdon', 'modifiedon

In [51]:
df.id.head()

0        1
1       10
2      100
3     1000
4    10000
Name: id, dtype: int64

In [52]:
df.rename(columns = {'id': 'id%'}, inplace=True)

In [53]:
df.columns

Index(['barrels08', 'barrelsa08', 'charge120', 'charge240', 'city08',
       'city08u', 'citya08', 'citya08u', 'citycd', 'citye', 'cityuf', 'co2',
       'co2a', 'co2tailpipeagpm', 'co2tailpipegpm', 'comb08', 'comb08u',
       'comba08', 'comba08u', 'combe', 'combinedcd', 'combineduf', 'cylinders',
       'displ', 'drive', 'engid', 'eng_dscr', 'fescore', 'fuelcost08',
       'fuelcosta08', 'fueltype', 'fueltype1', 'ghgscore', 'ghgscorea',
       'highway08', 'highway08u', 'highwaya08', 'highwaya08u', 'highwaycd',
       'highwaye', 'highwayuf', 'hlv', 'hpv', 'id%', 'lv2', 'lv4', 'make',
       'model', 'mpgdata', 'phevblended', 'pv2', 'pv4', 'range', 'rangecity',
       'rangecitya', 'rangehwy', 'rangehwya', 'trany', 'ucity', 'ucitya',
       'uhighway', 'uhighwaya', 'vclass', 'year', 'yousavespend', 'guzzler',
       'trans_dscr', 'tcharger', 'scharger', 'atvtype', 'fueltype2', 'rangea',
       'evmotor', 'mfrcode', 'c240dscr', 'charge240b', 'c240bdscr',
       'createdon', 'modifiedo

In [54]:
df.id%

SyntaxError: invalid syntax (4061602312.py, line 1)

In [55]:
df['id%'].head()

0        1
1       10
2      100
3     1000
4    10000
Name: id%, dtype: int64

---

### Get numeric data

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25479 entries, 0 to 25478
Data columns (total 83 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   barrels08        25479 non-null  float64
 1   barrelsa08       25479 non-null  float64
 2   charge120        25479 non-null  float64
 3   charge240        25479 non-null  float64
 4   city08           25479 non-null  int64  
 5   city08u          25479 non-null  float64
 6   citya08          25479 non-null  int64  
 7   citya08u         25479 non-null  float64
 8   citycd           25479 non-null  float64
 9   citye            25479 non-null  float64
 10  cityuf           25479 non-null  float64
 11  co2              25479 non-null  int64  
 12  co2a             25479 non-null  int64  
 13  co2tailpipeagpm  25479 non-null  float64
 14  co2tailpipegpm   25479 non-null  float64
 15  comb08           25479 non-null  int64  
 16  comb08u          25479 non-null  float64
 17  comba08     

In [59]:
df._get_numeric_data().head()

Unnamed: 0,barrels08,barrelsa08,charge120,charge240,city08,city08u,citya08,citya08u,citycd,citye,cityuf,co2,co2a,co2tailpipeagpm,co2tailpipegpm,comb08,comb08u,comba08,comba08u,combe,combinedcd,combineduf,cylinders,displ,engid,fescore,fuelcost08,fuelcosta08,ghgscore,ghgscorea,highway08,highway08u,highwaya08,highwaya08u,highwaycd,highwaye,highwayuf,hlv,hpv,id%,lv2,lv4,phevblended,pv2,pv4,range,rangecity,rangecitya,rangehwy,rangehwya,ucity,ucitya,uhighway,uhighwaya,year,yousavespend,c240dscr,charge240b,c240bdscr,phevcity,phevhwy,phevcomb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,9011,-1,1600,0,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,False,0,0,0,0.0,0.0,0.0,0.0,23.3333,0.0,35.0,0.0,1985,-1250,,0.0,,0.0,0.0,0.0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,22020,-1,3050,0,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,False,0,0,0,0.0,0.0,0.0,0.0,11.0,0.0,19.0,0.0,1985,-8500,,0.0,,0.0,0.0,0.0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,2100,-1,1250,0,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,False,0,0,0,0.0,0.0,0.0,0.0,29.0,0.0,47.0,0.0,1985,500,,0.0,,0.0,0.0,0.0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,2850,-1,3050,0,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,False,0,0,0,0.0,0.0,0.0,0.0,12.2222,0.0,16.6667,0.0,1985,-8500,,0.0,,0.0,0.0,0.0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,66031,-1,2150,0,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,False,0,90,0,0.0,0.0,0.0,0.0,21.0,0.0,32.0,0.0,1993,-4000,,0.0,,0.0,0.0,0.0


In [60]:
import numpy as np

In [62]:
type(np.nan)

float

In [63]:
'null'

'null'

In [66]:
nan_cols = df.isna().sum()

nan_cols[nan_cols>0]

drive          1189
eng_dscr      13312
trany            11
guzzler       23896
trans_dscr    13478
tcharger      22688
scharger      25068
atvtype       23543
fueltype2     24516
rangea        24521
evmotor       25240
mfrcode       23397
c240dscr      25479
c240bdscr     25479
startstop     24277
phevcity          1
phevhwy           1
phevcomb          1
dtype: int64

In [67]:
df._get_numeric_data().columns

Index(['barrels08', 'barrelsa08', 'charge120', 'charge240', 'city08',
       'city08u', 'citya08', 'citya08u', 'citycd', 'citye', 'cityuf', 'co2',
       'co2a', 'co2tailpipeagpm', 'co2tailpipegpm', 'comb08', 'comb08u',
       'comba08', 'comba08u', 'combe', 'combinedcd', 'combineduf', 'cylinders',
       'displ', 'engid', 'fescore', 'fuelcost08', 'fuelcosta08', 'ghgscore',
       'ghgscorea', 'highway08', 'highway08u', 'highwaya08', 'highwaya08u',
       'highwaycd', 'highwaye', 'highwayuf', 'hlv', 'hpv', 'id%', 'lv2', 'lv4',
       'phevblended', 'pv2', 'pv4', 'range', 'rangecity', 'rangecitya',
       'rangehwy', 'rangehwya', 'ucity', 'ucitya', 'uhighway', 'uhighwaya',
       'year', 'yousavespend', 'c240dscr', 'charge240b', 'c240bdscr',
       'phevcity', 'phevhwy', 'phevcomb'],
      dtype='object')

In [68]:
nan_cols[nan_cols>0].index

Index(['drive', 'eng_dscr', 'trany', 'guzzler', 'trans_dscr', 'tcharger',
       'scharger', 'atvtype', 'fueltype2', 'rangea', 'evmotor', 'mfrcode',
       'c240dscr', 'c240bdscr', 'startstop', 'phevcity', 'phevhwy',
       'phevcomb'],
      dtype='object')

In [69]:
for c in df._get_numeric_data().columns:
    
    if c in nan_cols[nan_cols>0].index:
        
        print(c)

c240dscr
c240bdscr
phevcity
phevhwy
phevcomb


In [72]:
df.phevcomb.tail()

25474    0.0
25475    0.0
25476    0.0
25477    0.0
25478    NaN
Name: phevcomb, dtype: float64

In [73]:
df.phevcomb.mean()

0.010715126776042076

In [74]:
df.phevcomb.sum()

273.0

---

### Changing columns data types

In [76]:
df.phevcomb = df.phevcomb.astype('str')

In [77]:
df.phevcomb.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: phevcomb, dtype: object

In [78]:
df.phevcomb[0]

'0.0'

In [83]:
df.phevcomb[25478]   # esto no es nulo, es string

'nan'

In [84]:
type(df.phevcomb[25478])

str

In [86]:
df.fillna('hola').head()

Unnamed: 0,barrels08,barrelsa08,charge120,charge240,city08,city08u,citya08,citya08u,citycd,citye,cityuf,co2,co2a,co2tailpipeagpm,co2tailpipegpm,comb08,comb08u,comba08,comba08u,combe,combinedcd,combineduf,cylinders,displ,drive,engid,eng_dscr,fescore,fuelcost08,fuelcosta08,fueltype,fueltype1,ghgscore,ghgscorea,highway08,highway08u,highwaya08,highwaya08u,highwaycd,highwaye,highwayuf,hlv,hpv,id%,lv2,lv4,make,model,mpgdata,phevblended,pv2,pv4,range,rangecity,rangecitya,rangehwy,rangehwya,trany,ucity,ucitya,uhighway,uhighwaya,vclass,year,yousavespend,guzzler,trans_dscr,tcharger,scharger,atvtype,fueltype2,rangea,evmotor,mfrcode,c240dscr,charge240b,c240bdscr,createdon,modifiedon,startstop,phevcity,phevhwy,phevcomb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0,0.0,Two Seaters,1985,-1250,hola,hola,hola,hola,hola,hola,hola,hola,hola,hola,0.0,hola,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,hola,0.0,0.0,0.0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,(GUZZLER),-1,3050,0,Regular,Regular Gasoline,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0,0.0,19.0,0.0,Two Seaters,1985,-8500,T,hola,hola,hola,hola,hola,hola,hola,hola,hola,0.0,hola,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,hola,0.0,0.0,0.0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,2100,(FFS),-1,1250,0,Regular,Regular Gasoline,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,Dodge,Charger,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0,0.0,47.0,0.0,Subcompact Cars,1985,500,hola,SIL,hola,hola,hola,hola,hola,hola,hola,hola,0.0,hola,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,hola,0.0,0.0,0.0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,Rear-Wheel Drive,2850,hola,-1,3050,0,Regular,Regular Gasoline,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,Dodge,B150/B250 Wagon 2WD,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 3-spd,12.2222,0.0,16.6667,0.0,Vans,1985,-8500,hola,hola,hola,hola,hola,hola,hola,hola,hola,hola,0.0,hola,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,hola,0.0,0.0,0.0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,"(FFS,TRBO)",-1,2150,0,Premium,Premium Gasoline,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,21.0,0.0,32.0,0.0,Compact Cars,1993,-4000,hola,hola,T,hola,hola,hola,hola,hola,hola,hola,0.0,hola,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,hola,0.0,0.0,0.0


---

### Simple string transformation (string replace)

In [87]:
df = pd.read_csv('datasets/club.csv', index_col=0) 

df.head()

Unnamed: 0,Club Name,Competition Name,Squad Size,Average Age Of Players,Market Value Of Club In Millions(£),Average Market Value Of Players In Millions(£),Market Value Of Top 18 Players In Millions(£)
0,Manchester City,Premier League,24,27.1,970.02,40.42,920.7
1,Paris Saint-Germain,Ligue 1,36,26.1,891.18,24.76,801.0
2,Manchester United,Premier League,29,27.9,820.13,28.28,742.5
3,Chelsea FC,Premier League,27,26.9,802.35,29.72,737.1
4,Liverpool FC,Premier League,27,27.2,779.85,28.88,715.95


In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Club Name                                       100 non-null    object 
 1   Competition Name                                100 non-null    object 
 2   Squad Size                                      100 non-null    int64  
 3   Average Age Of Players                          100 non-null    float64
 4   Market Value Of Club In Millions(£)             100 non-null    float64
 5   Average Market Value Of Players In Millions(£)  100 non-null    float64
 6   Market Value Of Top 18 Players In Millions(£)   100 non-null    float64
dtypes: float64(4), int64(1), object(2)
memory usage: 6.2+ KB


In [89]:
df.columns

Index(['Club Name', 'Competition Name', 'Squad Size', 'Average Age Of Players',
       'Market Value Of Club In Millions(£)',
       'Average Market Value Of Players In Millions(£)',
       'Market Value Of Top 18 Players In Millions(£)'],
      dtype='object')

In [92]:
df.columns = [e.replace(' ', '_').replace('(£)', '').lower() for e in df.columns]

In [93]:
df.head()

Unnamed: 0,club_name,competition_name,squad_size,average_age_of_players,market_value_of_club_in_millions,average_market_value_of_players_in_millions,market_value_of_top_18_players_in_millions
0,Manchester City,Premier League,24,27.1,970.02,40.42,920.7
1,Paris Saint-Germain,Ligue 1,36,26.1,891.18,24.76,801.0
2,Manchester United,Premier League,29,27.9,820.13,28.28,742.5
3,Chelsea FC,Premier League,27,26.9,802.35,29.72,737.1
4,Liverpool FC,Premier League,27,27.2,779.85,28.88,715.95


> __But first, the `.unique()` method!!!__

In [94]:
df.club_name.unique()

array(['Manchester City', 'Paris Saint-Germain', 'Manchester United',
       'Chelsea FC', 'Liverpool FC', 'Bayern Munich', 'Real Madrid',
       'Atlético de Madrid', 'Tottenham Hotspur', 'FC Barcelona',
       'Borussia Dortmund', 'Juventus FC', 'Arsenal FC', 'Leicester City',
       'Inter Milan', 'SSC Napoli', 'RB Leipzig', 'AC Milan', 'AS Roma',
       'Everton FC', 'Sevilla FC', 'Atalanta BC', 'Aston Villa',
       'Bayer 04 Leverkusen', 'Wolverhampton Wanderers', 'Real Sociedad',
       'AS Monaco', 'West Ham United', 'Ajax Amsterdam', 'Olympique Lyon',
       'Villarreal CF', 'SS Lazio', 'VfL Wolfsburg', 'SL Benfica',
       'Borussia Mönchengladbach', 'LOSC Lille', 'FC Porto',
       'Leeds United', 'Valencia CF', 'Brighton & Hove Albion',
       'Southampton FC', 'ACF Fiorentina', 'Olympique Marseille',
       'Crystal Palace', 'Newcastle United', 'OGC Nice',
       'Real Betis Balompié', 'US Sassuolo', 'Stade Rennais FC',
       'Athletic Bilbao', 'TSG 1899 Hoffenheim', 'Ein

In [98]:
df.club_name  = df.club_name.str.replace(' ', '_').str.lower()

In [99]:
df.head()

Unnamed: 0,club_name,competition_name,squad_size,average_age_of_players,market_value_of_club_in_millions,average_market_value_of_players_in_millions,market_value_of_top_18_players_in_millions
0,manchester_city,Premier League,24,27.1,970.02,40.42,920.7
1,paris_saint-germain,Ligue 1,36,26.1,891.18,24.76,801.0
2,manchester_united,Premier League,29,27.9,820.13,28.28,742.5
3,chelsea_fc,Premier League,27,26.9,802.35,29.72,737.1
4,liverpool_fc,Premier League,27,27.2,779.85,28.88,715.95


In [101]:
df.club_name.str.contains('paris').head()

0    False
1     True
2    False
3    False
4    False
Name: club_name, dtype: bool

In [102]:
df[df.club_name.str.contains('paris')] # filtro el df segun la string que contiene cierta columna

Unnamed: 0,club_name,competition_name,squad_size,average_age_of_players,market_value_of_club_in_millions,average_market_value_of_players_in_millions,market_value_of_top_18_players_in_millions
1,paris_saint-germain,Ligue 1,36,26.1,891.18,24.76,801.0


In [104]:
df[df.competition_name.str.contains('League')]

Unnamed: 0,club_name,competition_name,squad_size,average_age_of_players,market_value_of_club_in_millions,average_market_value_of_players_in_millions,market_value_of_top_18_players_in_millions
0,manchester_city,Premier League,24,27.1,970.02,40.42,920.7
2,manchester_united,Premier League,29,27.9,820.13,28.28,742.5
3,chelsea_fc,Premier League,27,26.9,802.35,29.72,737.1
4,liverpool_fc,Premier League,27,27.2,779.85,28.88,715.95
8,tottenham_hotspur,Premier League,24,25.6,627.3,26.14,558.0
12,arsenal_fc,Premier League,26,25.4,507.15,19.51,453.6
13,leicester_city,Premier League,28,27.1,493.29,17.62,447.3
19,everton_fc,Premier League,26,28.0,385.88,14.84,360.9
22,aston_villa,Premier League,25,25.6,371.79,14.87,354.42
24,wolverhampton_wanderers,Premier League,24,26.4,345.69,14.4,332.37


---

### Drop duplicates values 

In [107]:
df = df.drop_duplicates()

In [109]:
df[df.duplicated()]

Unnamed: 0,club_name,competition_name,squad_size,average_age_of_players,market_value_of_club_in_millions,average_market_value_of_players_in_millions,market_value_of_top_18_players_in_millions


In [110]:
df.drop_duplicates(subset=['competition_name'])

Unnamed: 0,club_name,competition_name,squad_size,average_age_of_players,market_value_of_club_in_millions,average_market_value_of_players_in_millions,market_value_of_top_18_players_in_millions
0,manchester_city,Premier League,24,27.1,970.02,40.42,920.7
1,paris_saint-germain,Ligue 1,36,26.1,891.18,24.76,801.0
5,bayern_munich,Bundesliga,26,26.1,756.45,29.09,726.21
6,real_madrid,LaLiga,27,27.3,680.4,25.2,610.2
11,juventus_fc,Serie A,25,27.5,542.61,21.7,506.7
28,ajax_amsterdam,Eredivisie,25,25.7,304.65,12.19,282.6
33,sl_benfica,Liga Bwin,29,27.0,252.9,8.72,215.1
56,shakhtar_donetsk,Premier Liga,29,25.8,165.6,5.71,149.85
59,club_brugge_kv,Jupiler Pro League,30,24.3,150.08,5.0,136.71
63,clube_de_regatas_do_flamengo,Série A,32,27.4,133.74,4.18,117.0


In [112]:
df.drop_duplicates(subset=['club_name', 'competition_name']).head()

Unnamed: 0,club_name,competition_name,squad_size,average_age_of_players,market_value_of_club_in_millions,average_market_value_of_players_in_millions,market_value_of_top_18_players_in_millions
0,manchester_city,Premier League,24,27.1,970.02,40.42,920.7
1,paris_saint-germain,Ligue 1,36,26.1,891.18,24.76,801.0
2,manchester_united,Premier League,29,27.9,820.13,28.28,742.5
3,chelsea_fc,Premier League,27,26.9,802.35,29.72,737.1
4,liverpool_fc,Premier League,27,27.2,779.85,28.88,715.95


---

### Apply...your new BFF!!!

In [113]:
df.head()

Unnamed: 0,club_name,competition_name,squad_size,average_age_of_players,market_value_of_club_in_millions,average_market_value_of_players_in_millions,market_value_of_top_18_players_in_millions
0,manchester_city,Premier League,24,27.1,970.02,40.42,920.7
1,paris_saint-germain,Ligue 1,36,26.1,891.18,24.76,801.0
2,manchester_united,Premier League,29,27.9,820.13,28.28,742.5
3,chelsea_fc,Premier League,27,26.9,802.35,29.72,737.1
4,liverpool_fc,Premier League,27,27.2,779.85,28.88,715.95


In [116]:
%%time

# esto aplica una funcion a cada elemento de la columna, como el map

df.market_value_of_club_in_millions  = df.market_value_of_club_in_millions.apply(lambda x: x / df.market_value_of_club_in_millions.max())

CPU times: user 5.73 ms, sys: 831 µs, total: 6.56 ms
Wall time: 5.9 ms


In [117]:
df.head()

Unnamed: 0,club_name,competition_name,squad_size,average_age_of_players,market_value_of_club_in_millions,average_market_value_of_players_in_millions,market_value_of_top_18_players_in_millions
0,manchester_city,Premier League,24,27.1,1.0,40.42,920.7
1,paris_saint-germain,Ligue 1,36,26.1,0.918723,24.76,801.0
2,manchester_united,Premier League,29,27.9,0.845477,28.28,742.5
3,chelsea_fc,Premier League,27,26.9,0.827148,29.72,737.1
4,liverpool_fc,Premier League,27,27.2,0.803952,28.88,715.95


In [127]:
df.market_value_of_club_in_millions.max()

1.0

In [118]:
%%time

df.market_value_of_club_in_millions / df.market_value_of_club_in_millions.max()

CPU times: user 884 µs, sys: 241 µs, total: 1.13 ms
Wall time: 1.12 ms


0     1.000000
1     0.918723
2     0.845477
3     0.827148
4     0.803952
        ...   
95    0.091947
96    0.091947
97    0.091349
98    0.090019
99    0.089627
Name: market_value_of_club_in_millions, Length: 100, dtype: float64

In [119]:
def nombre_funcion(x):
    
    x = x.replace(' ', '_').lower()
    
    return x
    

In [122]:
df.competition_name = df.competition_name.apply(nombre_funcion)

In [123]:
df.head()

Unnamed: 0,club_name,competition_name,squad_size,average_age_of_players,market_value_of_club_in_millions,average_market_value_of_players_in_millions,market_value_of_top_18_players_in_millions
0,manchester_city,premier_league,24,27.1,1.0,40.42,920.7
1,paris_saint-germain,ligue_1,36,26.1,0.918723,24.76,801.0
2,manchester_united,premier_league,29,27.9,0.845477,28.28,742.5
3,chelsea_fc,premier_league,27,26.9,0.827148,29.72,737.1
4,liverpool_fc,premier_league,27,27.2,0.803952,28.88,715.95


In [126]:
df.market_value_of_top_18_players_in_millions + 3

0     923.70
1     804.00
2     745.50
3     740.10
4     718.95
       ...  
95     85.35
96     82.74
97     79.46
98     80.85
99     75.27
Name: market_value_of_top_18_players_in_millions, Length: 100, dtype: float64

In [132]:
df.squad_size.max()

36

In [134]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   club_name                                    100 non-null    object 
 1   competition_name                             100 non-null    object 
 2   squad_size                                   100 non-null    int64  
 3   average_age_of_players                       100 non-null    float64
 4   market_value_of_club_in_millions             100 non-null    float64
 5   average_market_value_of_players_in_millions  100 non-null    float64
 6   market_value_of_top_18_players_in_millions   100 non-null    float64
dtypes: float64(4), int64(1), object(2)
memory usage: 18.5 KB


In [135]:
df.squad_size = df.squad_size.astype('int8')

In [136]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   club_name                                    100 non-null    object 
 1   competition_name                             100 non-null    object 
 2   squad_size                                   100 non-null    int8   
 3   average_age_of_players                       100 non-null    float64
 4   market_value_of_club_in_millions             100 non-null    float64
 5   average_market_value_of_players_in_millions  100 non-null    float64
 6   market_value_of_top_18_players_in_millions   100 non-null    float64
dtypes: float64(4), int8(1), object(2)
memory usage: 17.9 KB


In [140]:
fcols = df.select_dtypes('float').columns
icols = df.select_dtypes('integer').columns

In [141]:
fcols

Index(['average_age_of_players', 'market_value_of_club_in_millions',
       'average_market_value_of_players_in_millions',
       'market_value_of_top_18_players_in_millions'],
      dtype='object')

In [142]:
icols

Index(['squad_size'], dtype='object')

In [143]:
df[fcols] = df[fcols].apply(pd.to_numeric, downcast='float')

In [144]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   club_name                                    100 non-null    object 
 1   competition_name                             100 non-null    object 
 2   squad_size                                   100 non-null    int8   
 3   average_age_of_players                       100 non-null    float32
 4   market_value_of_club_in_millions             100 non-null    float32
 5   average_market_value_of_players_in_millions  100 non-null    float32
 6   market_value_of_top_18_players_in_millions   100 non-null    float32
dtypes: float32(4), int8(1), object(2)
memory usage: 16.3 KB


In [145]:
cols = df.select_dtypes('object').columns

cols

Index(['club_name', 'competition_name'], dtype='object')

In [147]:
df[cols] = df[cols].astype('category')

In [148]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                                       Non-Null Count  Dtype   
---  ------                                       --------------  -----   
 0   club_name                                    100 non-null    category
 1   competition_name                             100 non-null    category
 2   squad_size                                   100 non-null    int8    
 3   average_age_of_players                       100 non-null    float32 
 4   market_value_of_club_in_millions             100 non-null    float32 
 5   average_market_value_of_players_in_millions  100 non-null    float32 
 6   market_value_of_top_18_players_in_millions   100 non-null    float32 
dtypes: category(2), float32(4), int8(1)
memory usage: 15.6 KB


In [151]:
df.club_name.unique()

['manchester_city', 'paris_saint-germain', 'manchester_united', 'chelsea_fc', 'liverpool_fc', ..., 'levante_ud', 'fc_metz', 'clube_atlético_mineiro', 'lokomotiv_moscow', 'genoa_cfc']
Length: 100
Categories (100, object): ['1.fsv_mainz_05', 'ac_milan', 'acf_fiorentina', 'afc_bournemouth', ..., 'watford_fc', 'west_ham_united', 'wolverhampton_wanderers', 'zenit_st._petersburg']

---

__Other resources to consider:__

- https://github.com/ResidentMario/missingno
