# **Working with DataFrames**

## **Intro**

In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Sample records for the demo DataFrame. The three lists are parallel:
# position i in each list describes the same person.
names = ["Olga", "Andrew", "Brian", "Telulah", "Nicole", "Tilda"]
ages = [29, 21, 45, 23, 39, 46]
married = [False, True, True, True, False, True]

In [3]:
df = pd.DataFrame({'name':names, 'age':ages, 'married': married})

In [4]:
df

Unnamed: 0,name,age,married
0,Olga,29,False
1,Andrew,21,True
2,Brian,45,True
3,Telulah,23,True
4,Nicole,39,False
5,Tilda,46,True


In [5]:
ser = pd.Series(names, name='name')

In [6]:
ser

0       Olga
1     Andrew
2      Brian
3    Telulah
4     Nicole
5      Tilda
Name: name, dtype: object

In [7]:
ser[2]

'Brian'

In [8]:
df.iloc[2]

name       Brian
age           45
married     True
Name: 2, dtype: object

In [9]:
ser.ndim

1

In [10]:
df.ndim

2

In [11]:
df.shape

(6, 3)

In [12]:
ser.shape

(6,)

In [13]:
df['name']

0       Olga
1     Andrew
2      Brian
3    Telulah
4     Nicole
5      Tilda
Name: name, dtype: object

In [14]:
# A DataFrame doesn't have a single dtype; instead it has dtypes (one per column)
df.dtypes

name       object
age         int64
married      bool
dtype: object

## **Creating a DataFrame**

### **More ways to DataFrame**
* **dict of tuples:** like dict of lists but with tuples (column-wise)
* **dict of dicts:** outer keys are column names; each inner dict maps index labels to that column's values (column-wise)
* **dict of series:** (column-wise)
* **list of dicts:** (row-wise)

### **dict of dicts**

In [15]:
list(enumerate(names))

[(0, 'Olga'),
 (1, 'Andrew'),
 (2, 'Brian'),
 (3, 'Telulah'),
 (4, 'Nicole'),
 (5, 'Tilda')]

In [16]:
# Map each position (0, 1, 2, ...) to the name at that position.
dict_names = dict(enumerate(names))

In [17]:
# Map each position (0, 1, 2, ...) to the age at that position.
dict_ages = dict(enumerate(ages))

In [18]:
# Map each position (0, 1, 2, ...) to the married flag at that position.
dict_married = dict(enumerate(married))

In [19]:
pd.DataFrame({'name':dict_names,
              'age': dict_ages,
              'married': dict_married})

Unnamed: 0,name,age,married
0,Olga,29,False
1,Andrew,21,True
2,Brian,45,True
3,Telulah,23,True
4,Nicole,39,False
5,Tilda,46,True


### **list of dicts**

In [20]:
# One dict per person (row-wise). The loop variables are deliberately named
# differently from the source lists so they do not shadow names/ages/married
# inside the comprehension.
rowwise = [
    {'name': name, 'age': age, 'married': is_married}
    for name, age, is_married in zip(names, ages, married)
]

In [21]:
pd.DataFrame(rowwise)

Unnamed: 0,name,age,married
0,Olga,29,False
1,Andrew,21,True
2,Brian,45,True
3,Telulah,23,True
4,Nicole,39,False
5,Tilda,46,True


## **`info()` method**

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     6 non-null      object
 1   age      6 non-null      int64 
 2   married  6 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 230.0+ bytes


In [23]:
df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Columns: 3 entries, name to married
dtypes: bool(1), int64(1), object(1)
memory usage: 230.0+ bytes


In [24]:
df.info(memory_usage=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     6 non-null      object
 1   age      6 non-null      int64 
 2   married  6 non-null      bool  
dtypes: bool(1), int64(1), object(1)

In [25]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     6 non-null      object
 1   age      6 non-null      int64 
 2   married  6 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 557.0 bytes


## **Reading in Nutrition Data**

In [26]:
dataurl='https://andybek.com/pandas-nutrition'

In [27]:
nutrition = pd.read_csv(dataurl, index_col=[0])

In [28]:
nutrition.head()

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [29]:
nutrition.info(verbose=False, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8789 entries, 0 to 8788
Columns: 76 entries, name to water
dtypes: int64(2), object(74)
memory usage: 39.9 MB


## **Some cleanup: removing the duplicated index**

In [30]:
nutrition.head()

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [31]:
# NOTE: read_csv was called with index_col=[0], so the first CSV column is
# already used as the row index; the alternatives below are kept for reference.
# Remove the duplicated column:
# nutrition.drop('Unnamed: 0', axis=1, inplace=True)
# Or use the 'Unnamed: 0' column as the index:
# nutrition.set_index('Unnamed: 0')

## **The `sample()` method**

In [32]:
nutrition.sample()

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
3685,"Caribou, raw (Alaska Native), hind quarter meat",100 g,122,3.4g,1.3g,81mg,52.00 mg,0,4.00 mcg,0.00 mcg,...,3.37 g,1.270 g,0.940 g,0.560 g,81.00 mg,0.0 g,1.23 g,0.00 mg,0,72.60 g


In [33]:
nutrition.sample(random_state=12)

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
3713,"Thuringer, pork, beef, summer sausage, cervelat",100 g,362,30g,12g,74mg,1300.00 mg,78.9 mg,2.00 mcg,0.00 mcg,...,30.43 g,11.510 g,12.970 g,1.200 g,74.00 mg,0.0 g,3.63 g,0.00 mg,0.00 mg,45.18 g


In [34]:
nutrition.sample(frac=.01)

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
4774,"Soup, prepared with equal volume water, canned...",100 g,61,1.1g,0.5g,0,336.00 mg,13.2 mg,1.00 mcg,0.00 mcg,...,1.09 g,0.524 g,0.372 g,0.142 g,0.00 mg,0.0 g,1.28 g,0.00 mg,0.00 mg,84.55 g
4258,"Peas, with salt, boiled, cooked, mature seeds,...",100 g,116,0.4g,0.1g,0,238.00 mg,32.8 mg,65.00 mcg,0.00 mcg,...,0.39 g,0.054 g,0.081 g,0.165 g,0.00 mg,0,1.27 g,0,0,69.49 g
3557,"OLIVE GARDEN, cheese ravioli with marinara sauce",100 g,159,5.6g,2.5g,23mg,269.00 mg,0,44.00 mcg,0,...,5.61 g,2.531 g,1.428 g,0.547 g,23.00 mg,0,1.36 g,0,0,65.96 g
7101,"Beef, raw, choice, trimmed to 1/8"" fat, separa...",100 g,237,17g,6.9g,82mg,54.00 mg,86.1 mg,6.00 mcg,0,...,17.10 g,6.910 g,7.410 g,0.610 g,82.00 mg,0.0 g,0.79 g,0.00 mg,0.00 mg,61.49 g
1067,"Barbecue loaf, beef, pork",100 g,173,8.9g,3.2g,37mg,1334.00 mg,0,9.00 mcg,0.00 mcg,...,8.90 g,3.170 g,4.140 g,0.810 g,37.00 mg,0,4.04 g,0,0,64.82 g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,"Vinegar, distilled",100 g,18,0g,,0,2.00 mg,0.0 mg,0.00 mcg,0.00 mcg,...,0.00 g,0.000 g,0.000 g,0.000 g,0.00 mg,0.0 g,0.02 g,0.00 mg,0.00 mg,94.78 g
5143,"Pork, raw, separable lean only, whole, leg (ha...",100 g,136,5.4g,1.9g,68mg,55.00 mg,72.3 mg,9.00 mcg,0.00 mcg,...,5.41 g,1.870 g,2.440 g,0.580 g,68.00 mg,0.0 g,1.05 g,0.00 mg,0.00 mg,72.90 g
6630,"MORNINGSTAR FARMS Tomato & Basil Pizza Burger,...",100 g,161,8.6g,2.5g,10mg,414.00 mg,0,11.00 mcg,0,...,8.60 g,2.500 g,2.100 g,2.900 g,10.00 mg,0,2.00 g,0,0,59.90 g
8405,"Beef, braised, cooked, select, trimmed to 0"" f...",100 g,221,10g,3g,99mg,70.00 mg,101.9 mg,8.00 mcg,0.00 mcg,...,10.24 g,3.000 g,6.400 g,0.690 g,99.00 mg,0.0 g,1.12 g,0.00 mg,0.00 mg,57.54 g


## **Sampling with Replacement or Weights**

In [35]:
# with or without replacement

In [36]:
nutrition.sample(n=3, replace=True)

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
3213,Soy sauce made from soy and wheat (shoyu),100 g,53,0.6g,0.1g,0,5493.00 mg,18.3 mg,14.00 mcg,0.00 mcg,...,0.57 g,0.073 g,0.088 g,0.263 g,0.00 mg,0.0 g,15.21 g,0.00 mg,0.00 mg,71.15 g
1329,"Hominy, yellow, canned",100 g,72,0.9g,0.1g,0,345.00 mg,0,1.00 mcg,0.00 mcg,...,0.88 g,0.123 g,0.231 g,0.399 g,0.00 mg,0,0.86 g,0,0,82.53 g
6525,"Whale, skin and subcutaneous fat (muktuk) (Ala...",100 g,465,46g,6.6g,54mg,0,0,0,0,...,46.10 g,6.560 g,28.120 g,7.970 g,54.00 mg,0,0.10 g,0,0,40.00 g


In [37]:
# Sampling weights keyed by row label: the Series index (7, 17, 29, 5, 6) is
# aligned with the index of the frame being sampled. Per the pandas docs, rows
# whose label is not listed get weight 0, and weights are normalized to sum to 1.
weights = pd.Series(data=[10,10,10,1,2], index=[7,17,29,5,6])

In [38]:
nutrition.sample(n=3, weights=weights)

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
17,"Peppers, raw, jalapeno",100 g,29,0.4g,0.1g,0,3.00 mg,7.5 mg,27.00 mcg,0.00 mcg,...,0.37 g,0.092 g,0.029 g,0.112 g,0.00 mg,0.0 g,0.53 g,0.00 mg,0.00 mg,91.69 g
29,"Nuts, dried, pine nuts",100 g,673,68g,4.9g,0,2.00 mg,55.8 mg,34.00 mcg,0.00 mcg,...,68.37 g,4.899 g,18.764 g,34.071 g,0.00 mg,0.0 g,2.59 g,0.00 mg,0.00 mg,2.28 g
6,"Taro leaves, raw",100 g,42,0.7g,0.2g,0,3.00 mg,12.8 mg,126.00 mcg,0.00 mcg,...,0.74 g,0.151 g,0.060 g,0.307 g,0.00 mg,0.0 g,1.92 g,0.00 mg,0.00 mg,85.66 g


## **DataFrame Axes**

In [39]:
nutrition.head()

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [40]:
nutrition.axes

[Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
             ...
             8779, 8780, 8781, 8782, 8783, 8784, 8785, 8786, 8787, 8788],
            dtype='int64', length=8789),
 Index(['name', 'serving_size', 'calories', 'total_fat', 'saturated_fat',
        'cholesterol', 'sodium', 'choline', 'folate', 'folic_acid', 'niacin',
        'pantothenic_acid', 'riboflavin', 'thiamin', 'vitamin_a',
        'vitamin_a_rae', 'carotene_alpha', 'carotene_beta',
        'cryptoxanthin_beta', 'lutein_zeaxanthin', 'lucopene', 'vitamin_b12',
        'vitamin_b6', 'vitamin_c', 'vitamin_d', 'vitamin_e', 'tocopherol_alpha',
        'vitamin_k', 'calcium', 'copper', 'irom', 'magnesium', 'manganese',
        'phosphorous', 'potassium', 'selenium', 'zink', 'protein', 'alanine',
        'arginine', 'aspartic_acid', 'cystine', 'glutamic_acid', 'glycine',
        'histidine', 'hydroxyproline', 'isoleucine', 'leucine', 'lysine',
        'methionine', 'phenylalanine', 'proline', 'ser

In [41]:
nutrition.index[3]

3

In [42]:
nutrition.axes[1][69]

'polyunsaturated_fatty_acids'

In [43]:
nutrition.columns[69]

'polyunsaturated_fatty_acids'

In [44]:
# the axis params

In [45]:
nutrition.dropna(axis=0)

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g
5,"Cauliflower, raw",100 g,25,0.3g,0.1g,0,30.00 mg,44.3 mg,57.00 mcg,0.00 mcg,...,0.28 g,0.130 g,0.034 g,0.031 g,0.00 mg,0.0 g,0.76 g,0.00 mg,0.00 mg,92.07 g
6,"Taro leaves, raw",100 g,42,0.7g,0.2g,0,3.00 mg,12.8 mg,126.00 mcg,0.00 mcg,...,0.74 g,0.151 g,0.060 g,0.307 g,0.00 mg,0.0 g,1.92 g,0.00 mg,0.00 mg,85.66 g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8784,"Beef, raw, all grades, trimmed to 0"" fat, sepa...",100 g,125,3.5g,1.4g,62mg,54.00 mg,64.5 mg,4.00 mcg,0.00 mcg,...,3.50 g,1.353 g,1.554 g,0.244 g,62.00 mg,0.0 g,1.11 g,0.00 mg,0.00 mg,72.51 g
8785,"Lamb, cooked, separable lean only, composite o...",100 g,206,8.9g,3.9g,109mg,50.00 mg,0,0.00 mcg,0.00 mcg,...,8.86 g,3.860 g,3.480 g,0.520 g,109.00 mg,0,1.60 g,0,0,59.95 g
8786,"Lamb, raw, separable lean and fat, composite o...",100 g,277,23g,12g,78mg,39.00 mg,0,1.00 mcg,0.00 mcg,...,22.74 g,11.570 g,8.720 g,0.980 g,78.00 mg,0,0.92 g,0,0,59.80 g
8787,"Beef, raw, all grades, trimmed to 0"" fat, sepa...",100 g,121,3g,1.1g,60mg,53.00 mg,64.2 mg,4.00 mcg,0.00 mcg,...,3.04 g,1.086 g,1.266 g,0.233 g,60.00 mg,0.0 g,1.10 g,0.00 mg,0.00 mg,73.43 g


## **Changing the Index**

In [46]:
nutrition.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            8779, 8780, 8781, 8782, 8783, 8784, 8785, 8786, 8787, 8788],
           dtype='int64', length=8789)

In [47]:
type(nutrition.index)

pandas.core.indexes.numeric.Int64Index

`RangeIndex` is a special case of `Int64Index`
* both are immutable sequences of numbers
* `RangeIndex` is an optimized alternative

`pd.RangeIndex(start=0, stop=8789, step=1)`

In [48]:
# Swap the integer index for an equivalent RangeIndex. The stop value is taken
# from the frame itself rather than hard-coding the row count, so the cell
# keeps working if the dataset changes size.
nutrition.index = pd.RangeIndex(start=0, stop=len(nutrition), step=1)

In [49]:
type(nutrition.index)

pandas.core.indexes.range.RangeIndex

In [50]:
# You can add another column as index and keep the column in the dataframe as a normal column
nutrition.set_index('folic_acid', drop=False)

Unnamed: 0_level_0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
folic_acid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.00 mcg,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
0.00 mcg,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
0.00 mcg,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
0,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
0.00 mcg,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.00 mcg,"Beef, raw, all grades, trimmed to 0"" fat, sepa...",100 g,125,3.5g,1.4g,62mg,54.00 mg,64.5 mg,4.00 mcg,0.00 mcg,...,3.50 g,1.353 g,1.554 g,0.244 g,62.00 mg,0.0 g,1.11 g,0.00 mg,0.00 mg,72.51 g
0.00 mcg,"Lamb, cooked, separable lean only, composite o...",100 g,206,8.9g,3.9g,109mg,50.00 mg,0,0.00 mcg,0.00 mcg,...,8.86 g,3.860 g,3.480 g,0.520 g,109.00 mg,0,1.60 g,0,0,59.95 g
0.00 mcg,"Lamb, raw, separable lean and fat, composite o...",100 g,277,23g,12g,78mg,39.00 mg,0,1.00 mcg,0.00 mcg,...,22.74 g,11.570 g,8.720 g,0.980 g,78.00 mg,0,0.92 g,0,0,59.80 g
0.00 mcg,"Beef, raw, all grades, trimmed to 0"" fat, sepa...",100 g,121,3g,1.1g,60mg,53.00 mg,64.2 mg,4.00 mcg,0.00 mcg,...,3.04 g,1.086 g,1.266 g,0.233 g,60.00 mg,0.0 g,1.10 g,0.00 mg,0.00 mg,73.43 g


In [51]:
nutrition.head()

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [52]:
# You can also create a multi-level index by using append=True.
# verify_integrity=False skips the check for duplicate index values;
# verify_integrity=True checks the new index for duplicates.
nutrition.set_index('folic_acid', drop=False, append=True, verify_integrity=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
Unnamed: 0_level_1,folic_acid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0.00 mcg,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,0.00 mcg,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,0.00 mcg,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,0,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,0.00 mcg,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8784,0.00 mcg,"Beef, raw, all grades, trimmed to 0"" fat, sepa...",100 g,125,3.5g,1.4g,62mg,54.00 mg,64.5 mg,4.00 mcg,0.00 mcg,...,3.50 g,1.353 g,1.554 g,0.244 g,62.00 mg,0.0 g,1.11 g,0.00 mg,0.00 mg,72.51 g
8785,0.00 mcg,"Lamb, cooked, separable lean only, composite o...",100 g,206,8.9g,3.9g,109mg,50.00 mg,0,0.00 mcg,0.00 mcg,...,8.86 g,3.860 g,3.480 g,0.520 g,109.00 mg,0,1.60 g,0,0,59.95 g
8786,0.00 mcg,"Lamb, raw, separable lean and fat, composite o...",100 g,277,23g,12g,78mg,39.00 mg,0,1.00 mcg,0.00 mcg,...,22.74 g,11.570 g,8.720 g,0.980 g,78.00 mg,0,0.92 g,0,0,59.80 g
8787,0.00 mcg,"Beef, raw, all grades, trimmed to 0"" fat, sepa...",100 g,121,3g,1.1g,60mg,53.00 mg,64.2 mg,4.00 mcg,0.00 mcg,...,3.04 g,1.086 g,1.266 g,0.233 g,60.00 mg,0.0 g,1.10 g,0.00 mg,0.00 mg,73.43 g


## **Extracting from DataFrames by Label `.loc`**

In [53]:
# Use the food name as the row label so the .loc examples below can select by name.
# Rebinding (instead of inplace=True) plus the column guard keeps this cell safe
# to re-run: once 'name' has become the index it is no longer a column, and an
# unguarded set_index('name') would raise KeyError.
if 'name' in nutrition.columns:
    nutrition = nutrition.set_index('name')
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [54]:
nutrition.loc['Eggplant, raw']

serving_size       100 g
calories              25
total_fat           0.2g
saturated_fat        NaN
cholesterol            0
                  ...   
alcohol            0.0 g
ash               0.66 g
caffeine         0.00 mg
theobromine      0.00 mg
water            92.30 g
Name: Eggplant, raw, Length: 75, dtype: object

In [55]:
type(nutrition.loc['Eggplant, raw'])

pandas.core.series.Series

In [56]:
nutrition.loc['Eggplant, raw']['calories']

25

In [57]:
nutrition.loc['Eggplant, raw','calories']

25

In [58]:
nutrition.loc['Eggplant, raw': 'Sherbet, orange','calories':'cholesterol']

Unnamed: 0_level_0,calories,total_fat,saturated_fat,cholesterol
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Eggplant, raw",25,0.2g,,0
"Teff, uncooked",367,2.4g,0.4g,0
"Sherbet, orange",144,2g,1.2g,1mg


In [59]:
nutrition.loc[['Raspberries, raw', 'Eggplant, raw'], ['protein', 'vitamin_b6','water']]

Unnamed: 0_level_0,protein,vitamin_b6,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Raspberries, raw",1.20 g,0.055 mg,85.75 g
"Eggplant, raw",0.98 g,0.084 mg,92.30 g


## **DataFrame Extraction by Position `.iloc`**

In [60]:
nutrition.iloc[3, :]

serving_size      100 g
calories            367
total_fat          2.4g
saturated_fat      0.4g
cholesterol           0
                  ...  
alcohol               0
ash              2.37 g
caffeine              0
theobromine           0
water            8.82 g
Name: Teff, uncooked, Length: 75, dtype: object

In [61]:
nutrition.iloc[3]

serving_size      100 g
calories            367
total_fat          2.4g
saturated_fat      0.4g
cholesterol           0
                  ...  
alcohol               0
ash              2.37 g
caffeine              0
theobromine           0
water            8.82 g
Name: Teff, uncooked, Length: 75, dtype: object

In [62]:
nutrition.iloc[[4,6,9],2]

name
Sherbet, orange         2g
Taro leaves, raw      0.7g
Vegetarian fillets     18g
Name: total_fat, dtype: object

In [63]:
nutrition.iloc[[4,6,9],2:5]

Unnamed: 0_level_0,total_fat,saturated_fat,cholesterol
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Sherbet, orange",2g,1.2g,1mg
"Taro leaves, raw",0.7g,0.2g,0
Vegetarian fillets,18g,2.8g,0


In [64]:
# boolean masks

In [65]:
# Boolean masks need exactly one entry per row / per column, so their lengths
# are derived from nutrition.shape instead of hard-coding 8789 and 75.
n_rows, n_cols = nutrition.shape
new_nutr = nutrition.iloc[
    [i % 2 == 0 for i in range(n_rows)],  # keep rows at even positions
    [i % 2 == 0 for i in range(n_cols)],  # keep columns at even positions
]

In [66]:
nutrition.shape

(8789, 75)

In [67]:
new_nutr.shape

(4395, 38)

## **Single value access with `.at` and `.iat`**

Why use `.at` or `.iat`?
* **Single-Purpose:** unlike `.loc` or `.iloc`, `.at` and `.iat` are only used for accessing single values
* **Faster:** because of lack of overhead, they are much more performant for their isolated use-case

In [68]:
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [69]:
nutrition.loc['Nuts, pecans', 'calories']

691

In [70]:
nutrition.iloc[1,1]

691

In [71]:
nutrition.at['Nuts, pecans', 'calories']

691

In [72]:
nutrition.iat[1,1]

691

In [73]:
%timeit nutrition.at['Nuts, pecans', 'calories']

3.68 µs ± 192 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [74]:
%timeit nutrition.loc['Nuts, pecans', 'calories']

6.21 µs ± 163 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## **The `get_loc()` Method**

In [75]:
# 1-> Get label from position

In [76]:
index_label = nutrition.index[2]

In [77]:
column_label= 'vitamin_k'

In [78]:
nutrition.loc[index_label, column_label]

'3.5 mcg'

In [79]:
nutrition.at[index_label, column_label]

'3.5 mcg'

In [80]:
# 2 -> Get int location from label

In [81]:
nutrition.columns.get_loc('vitamin_k')

26

In [82]:
# Look the integer position up from the label instead of hard-coding 26, so the
# value stays correct if the column order ever changes.
column_loc = nutrition.columns.get_loc(column_label)

In [83]:
index_loc = 2

In [84]:
nutrition.iloc[index_loc, column_loc]

'3.5 mcg'

In [85]:
nutrition.iat[index_loc, column_loc]

'3.5 mcg'

## **Skill Challenge**

In [86]:
# Randomly select 10 food items and assign the resulting dataframe to nutr_mini
nutr_mini = nutrition.sample(n=10, random_state=12, axis=0)

In [87]:
# From nutr_mini extract the total_fat and cholesterol columns for all rows
nutr_mini.loc[:,['total_fat', 'cholesterol']]

Unnamed: 0_level_0,total_fat,cholesterol
name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Thuringer, pork, beef, summer sausage, cervelat",30g,74mg
"Milk, without added vitamin A and vitamin D, instant, nonfat, dry",0.7g,18mg
"Babyfood, apple and peach, juice",0.1g,0
"Blueberries, solids and liquids, heavy syrup, canned",0.3g,0
"Rice, uncooked, enriched, short-grain, white",0.5g,0
"Beef, grilled, cooked, all grades, trimmed to 0"" fat, separable lean and fat, boneless, shoulder top blade steak",11g,95mg
"Child formula, not reconstituted, powder, with iron, PORTAGEN, MEAD JOHNSON",22g,4mg
"PEPPERIDGE FARM, 100% Whole Wheat Hamburger Buns",3.1g,1mg
"Beef macaroni with tomato sauce, reduced fat, frozen entree",2g,10mg
"Cowpeas, with salt, drained, boiled, cooked, young pods with seeds",0.3g,0


In [88]:
col_in = nutr_mini.columns.get_loc('vitamin_b12')

In [89]:
# Extract all the columns from vitamin_b12 to the end, for the first, second, and third rows
nutr_mini.iloc[0:3,col_in:]

Unnamed: 0_level_0,vitamin_b12,vitamin_b6,vitamin_c,vitamin_d,vitamin_e,tocopherol_alpha,vitamin_k,calcium,copper,irom,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Thuringer, pork, beef, summer sausage, cervelat",5.50 mcg,0.260 mg,16.6 mg,44.00 IU,0.22 mg,0.22 mg,1.3 mcg,9.00 mg,0.150 mg,2.04 mg,...,30.43 g,11.510 g,12.970 g,1.200 g,74.00 mg,0.0 g,3.63 g,0.00 mg,0.00 mg,45.18 g
"Milk, without added vitamin A and vitamin D, instant, nonfat, dry",3.99 mcg,0.345 mg,5.6 mg,0.00 IU,0.01 mg,0.01 mg,0.0 mcg,1231.00 mg,0.041 mg,0.31 mg,...,0.72 g,0.470 g,0.190 g,0.030 g,18.00 mg,0.0 g,8.03 g,0.00 mg,0.00 mg,3.96 g
"Babyfood, apple and peach, juice",0.00 mcg,0.022 mg,58.5 mg,0.00 IU,0.15 mg,0.15 mg,0.5 mcg,3.00 mg,0.037 mg,0.56 mg,...,0.10 g,0.018 g,0.002 g,0.031 g,0.00 mg,0.0 g,0.30 g,0.00 mg,0.00 mg,89.00 g


In [91]:
index_row = nutr_mini.index[2]
index_col = 'calories'
nutr_mini.at[index_row,index_col]

43

In [92]:
icol = nutr_mini.columns.get_loc('calories')
irow = 2
nutr_mini.iat[irow, icol]

43

## **More Cleanup: Going Numeric**

In [93]:
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [94]:
# The fat amounts are stored as text ('0.1g', '72g', ...), hence dtype object.
nutrition['total_fat'].head()

name
Cornstarch         0.1g
Nuts, pecans        72g
Eggplant, raw      0.2g
Teff, uncooked     2.4g
Sherbet, orange      2g
Name: total_fat, dtype: object

In [99]:
# Summing an object column of strings concatenates them instead of adding numbers.
nutrition['total_fat'].sum()

'0.1g72g0.2g2.4g2g0.3g0.7g23g24g18g0g0.4g0.1g7.2g15g4.5g16g0.4g0.9g9.2g0.5g1.7g5.9g33g15g0g0g3g0.4g68g1.5g0.2g11g0.8g16g0.2g0.5g50g0.3g0.6g3g16g3.1g11g2.4g0g0.3g0.5g99g0.1g5.3g5.3g0.4g5.3g0.2g0.3g6.4g22g1g14g10g17g14g4.3g22g36g0.7g100g27g2.8g5.8g14g0.7g1.5g11g34g0.2g0.9g4.6g0g0.4g19g0.2g10g28g6.7g6.7g0.3g0.3g44g37g14g50g0.9g0.7g0.1g0.7g8g10g19g9.2g0g0.2g8.6g1.3g14g0.4g0.2g8.2g5.2g3.3g2.1g0.1g80g0.5g1.4g0.4g3.3g0.2g0.7g0.1g9.4g0.3g0.6g29g0.2g1.4g0.2g0.4g1.2g1.8g0g0.9g0.2g1.4g0.8g1.5g9.8g0g13g16g19g7.4g0.2g5.2g29g0.3g9.9g22g14g15g4.1g0.5g3.5g15g20g32g1g81g0.2g1.6g0g17g22g7.1g7.4g0.2g8.7g1.4g34g0.3g6.3g30g8.1g0.2g0.1g3.7g0.6g0.3g3.9g0.3g1.2g29g14g26g1.1g2g13g0g9g22g3.7g100g0.1g2.1g2.1g0.3g0.2g6.8g8.1g8.3g2.4g0.2g23g6.7g1g0.3g0.2g0.1g12g17g0.1g100g3.4g0.2g6.7g1.5g22g0g1g25g34g3.6g100g7.3g9g1.6g0.4g0.3g18g0.4g9.5g1.4g11g3.1g1.5g0g2.4g3.6g0g15g1.2g6.6g0.7g4.2g15g0.1g0.5g0.2g0.1g3.5g3g0.1g0.4g0.1g0.3g100g31g2.8g9.7g2.3g11g1.7g0.1g1.7g0.5g4.7g0.5g0.2g8.6g18g25g0.5g0.5g26g7.7g0.1g0.1g100g1.1g16

In [96]:
"Andy" + "Bek"

'AndyBek'

In [97]:
# max() on strings compares lexicographically, which is why '9g' comes out on top
# even though values such as '100g' exist.
nutrition['total_fat'].max()

'9g'

In [98]:
# verbose=False prints only the summary; almost every column is still object dtype.
nutrition.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
Index: 8789 entries, Cornstarch to Beef, raw, all grades, trimmed to 0" fat, separable lean only, boneless, eye of round steak, round
Columns: 75 entries, serving_size to water
dtypes: int64(2), object(73)
memory usage: 5.4+ MB


## **The `astype()` method**

In [101]:
# Small demo frame for astype(): 'height' deliberately holds numeric-looking strings.
df = pd.DataFrame(
    {
        'age': [12, 13, 14, 15],
        'weight': [41.1, 34.5, 83.2, 90.1],
        'height': ['1.72', '1.74', '1.91', '1.54'],
    }
)

In [102]:
# 'height' is reported as object because its values were given as strings.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     4 non-null      int64  
 1   weight  4 non-null      float64
 2   height  4 non-null      object 
dtypes: float64(1), int64(1), object(1)
memory usage: 224.0+ bytes


In [105]:
# The DataFrame as created, before any dtype conversion
df

Unnamed: 0,age,weight,height
0,12,41.1,1.72
1,13,34.5,1.74
2,14,83.2,1.91
3,15,90.1,1.54


In [107]:
# Convert every column to float.
# IMPORTANT: .astype() returns a new DataFrame instead of modifying df,
# so the result has to be assigned back to keep it.
df = df.astype(float)

In [108]:
# Check the result of the conversion: every column should now be float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     4 non-null      float64
 1   weight  4 non-null      float64
 2   height  4 non-null      float64
dtypes: float64(3)
memory usage: 224.0 bytes


In [112]:
# We can also choose specific columns to change type
# A dict maps individual columns to a new dtype; columns not listed keep theirs.
df = df.astype({'age': 'int16'})

In [113]:
# Only 'age' changed (now int16), which also lowers the reported memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     4 non-null      int16  
 1   weight  4 non-null      float64
 2   height  4 non-null      float64
dtypes: float64(2), int16(1)
memory usage: 200.0 bytes


In [114]:
# First four rows, all columns.
nutrition.iloc[:4]

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g


## **DataFrame `replace()` + A Glimpse at regex**

In [118]:
# Small working sample: the first six rows of the first column only.
dfm = nutrition.iloc[:6,:1]

In [119]:
# dfm is exactly this slice: a single object column (serving_size).
dfm.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, Cornstarch to Cauliflower, raw
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   serving_size  6 non-null      object
dtypes: object(1)
memory usage: 96.0+ bytes


In [122]:
# Exact-match replacement of whole cell values. The result is only displayed,
# not assigned, so dfm itself is left untouched.
dfm.replace(to_replace='100 g', value=100)

Unnamed: 0_level_0,serving_size
name,Unnamed: 1_level_1
Cornstarch,100
"Nuts, pecans",100
"Eggplant, raw",100
"Teff, uncooked",100
"Sherbet, orange",100
"Cauliflower, raw",100


In [125]:
# Regex primer:
# \s  -> any single whitespace character (space, tab, ...)
# \sg -> a whitespace character immediately followed by the letter g

In [126]:
# Remove " g" (whitespace followed by g) wherever it occurs inside string cells.
# The pattern is a raw string so that \s reaches the regex engine untouched
# instead of being parsed as an invalid Python string escape.
# NOTE(review): df holds only numeric columns at this point, so nothing matches
# and the frame comes back unchanged; dfm (with its '100 g' values) looks like
# the intended target -- confirm.
df.replace(r'\sg', '', regex=True)

Unnamed: 0,age,weight,height
0,12,41.1,1.72
1,13,34.5,1.74
2,14,83.2,1.91
3,15,90.1,1.54


## **Part I: Collecting the Units**

In [184]:
# Reminder of the raw values: numbers and units are fused in the same string.
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [130]:
# Cast everything to text, then strip every non-letter character so that only
# the unit part of each cell (e.g. 'g', 'mg', 'mcg') is left.
units = (
    nutrition
    .astype(str)
    .replace(to_replace='[^a-zA-Z]', value='', regex=True)
)

In [131]:
# Each cell should now hold only its unit text (or nothing where the value had no letters).
units.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,g,,g,,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Nuts, pecans",g,,g,g,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Eggplant, raw",g,,g,,,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g
"Teff, uncooked",g,,g,g,,mg,mg,,,mg,...,g,g,g,g,,,g,,,g
"Sherbet, orange",g,,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


In [132]:
# mode() gives the most frequent value per column, i.e. the dominant unit of each column.
units.mode()

Unnamed: 0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,g,,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


In [133]:
# Keep only that summary: the dominant unit for every column.
units = units.mode()

## **The `rename()` method**

In [162]:
# Demo frame used for the rename() examples.
# NOTE(review): the saved output carries a later execution count than the cells
# that follow, so it may not match what a fresh top-to-bottom run displays.
df

Unnamed: 0,age,weight,height,1
0,12.0,,1.72,
1,,,,
2,14.0,83.2,1.91,
3,15.0,90.1,1.54,


In [136]:
# index= relabels rows; labels that are not in the dict (2 and 3) stay as they are.
df.rename(index={0:'Pikachu', 1: 'Andy'})

Unnamed: 0,age,weight,height
Pikachu,12,41.1,1.72
Andy,13,34.5,1.74
2,14,83.2,1.91
3,15,90.1,1.54


In [138]:
# columns= and index= can be combined to rename on both axes in one call.
df.rename(columns={'weight':'weight (kg)'}, index={0:'Pikachu'})

Unnamed: 0,age,weight (kg),height
Pikachu,12,41.1,1.72
1,13,34.5,1.74
2,14,83.2,1.91
3,15,90.1,1.54


In [140]:
# Alternative spelling: a generic mapper plus axis=1 targets the column labels.
df.rename(mapper={'weight':'weight (kg)'}, axis=1)

Unnamed: 0,age,weight (kg),height
0,12,41.1,1.72
1,13,34.5,1.74
2,14,83.2,1.91
3,15,90.1,1.54


In [141]:
# axes[1] is the column Index. It still shows the original names because none
# of the rename() results were assigned back to df.
df.axes[1]

Index(['age', 'weight', 'height'], dtype='object')

## **DataFrame `dropna()`**

In [163]:
# Frame used as the starting point for the dropna() examples.
df

Unnamed: 0,age,weight,height,1
0,12.0,,1.72,
1,,,,
2,14.0,83.2,1.91,
3,15.0,90.1,1.54,


In [164]:
# Inject missing values: the whole row labelled 1, plus the single cell at
# row position 0 / column position 1.
df.loc[1,:] = np.nan
df.iloc[0,1] = np.nan

In [165]:
# Show the frame with the missing values in place.
df

Unnamed: 0,age,weight,height,1
0,12.0,,1.72,
1,,,,
2,14.0,83.2,1.91,
3,15.0,90.1,1.54,


In [166]:
# With no arguments, dropna() removes every row that contains at least one NaN.
df.dropna()

Unnamed: 0,age,weight,height,1


In [167]:
# Drop rows (axis=0) that contain at least one NaN (how='any');
# this spells out what the bare dropna() call does by default.
df.dropna(how='any', axis=0)

Unnamed: 0,age,weight,height,1


In [168]:
# Drop rows (axis=0) only when every entry in the row is NaN (how='all')
df.dropna(how='all', axis=0)

Unnamed: 0,age,weight,height,1
0,12.0,,1.72,
2,14.0,83.2,1.91,
3,15.0,90.1,1.54,


In [169]:
# thresh=3 keeps only the rows that have at least 3 non-NaN values
df.dropna(thresh=3, axis=0)

Unnamed: 0,age,weight,height,1
2,14.0,83.2,1.91,
3,15.0,90.1,1.54,


In [161]:
# With thresh equal to the number of columns, a row survives only if it has no NaN at all.
df.dropna(thresh=df.shape[1], axis=0)

Unnamed: 0,age,weight,height,1


In [170]:
# (rows, columns) of df; the dropna() results so far were only displayed, not assigned.
df.shape

(4, 4)

In [171]:
# axis=1 switches to dropping columns: any column that contains a NaN is removed.
df.dropna(axis=1)

0
1
2
3


In [174]:
# Drop only the columns in which every value is NaN. The result is kept by
# reassigning (the same pattern used for astype() earlier in the notebook)
# rather than mutating df in place.
df = df.dropna(how='all', axis=1)

## **`dropna()` with Subset**

`df.dropna(axis=0, subset=['gender'])` -> drop rows but only looks at gender
* `df.dropna()` -> removes columns or rows with missing values
* `subset` -> limits the NaN check to the given labels on the *other* axis: column labels when dropping rows (`axis=0`), row labels when dropping columns (`axis=1`)

In [175]:
# Frame after removing the all-NaN column: age, weight and height remain.
df

Unnamed: 0,age,weight,height
0,12.0,,1.72
1,,,
2,14.0,83.2,1.91
3,15.0,90.1,1.54


In [176]:
# Add a column with one missing value (third entry) for the subset examples.
df['gender'] = ['M', 'F', np.nan, 'F']

In [177]:
# Show the frame with the new gender column.
df

Unnamed: 0,age,weight,height,gender
0,12.0,,1.72,M
1,,,,F
2,14.0,83.2,1.91,
3,15.0,90.1,1.54,F


In [179]:
# Baseline without subset: every column is checked, so a single NaN anywhere drops the row.
df.dropna() # df.dropna(axis=0, how='any')

Unnamed: 0,age,weight,height,gender
3,15.0,90.1,1.54,F


In [180]:
# The subset parameter: look for NaN only in the listed labels

In [181]:
# Rows are dropped only when 'gender' is NaN; NaN in the other columns is ignored.
df.dropna(axis=0, how='any', subset=['gender'])

Unnamed: 0,age,weight,height,gender
0,12.0,,1.72,M
1,,,,F
3,15.0,90.1,1.54,F


In [182]:
# Same idea, this time looking only at 'age'.
df.dropna(axis=0, how='any', subset=['age'])

Unnamed: 0,age,weight,height,gender
0,12.0,,1.72,M
2,14.0,83.2,1.91,
3,15.0,90.1,1.54,F


In [183]:
# With axis=1, subset lists row labels: drop the columns that have a NaN in row 0 or row 3.
df.dropna(axis=1, how='any', subset=[0,3])

Unnamed: 0,age,height,gender
0,12.0,1.72,M
1,,,F
2,14.0,1.91,
3,15.0,1.54,F


## **Part II: Merging Units with Columns Names**

In [185]:
# The column names do not yet say which unit each column uses.
nutrition.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [186]:
# The dominant unit per column collected in Part I.
units

Unnamed: 0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,g,,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


In [188]:
# Print the unit stored in row 0 for every column; columns without a unit
# print as an empty line.
for col in units.columns:
    print(units.loc[0, col])

g

g
g
mg
mg
mg
mcg
mcg
mg
mg
mg
mg
IU
mcg
mcg
mcg
mcg
mcg

mcg
mg
mg
IU
mg
mg
mcg
mg
mg
mg
mg
mg
mg
mg
mcg
mg
g
g
g
g
g
g
g
g

g
g
g
g
g
g
g
g
g
g
g
g
g
g






g
g
g
g
mg
g
g
mg
mg
g


In [190]:
# Turn empty unit strings into NaN so that dropna() can discard the columns
# that have no unit at all.
units = (
    units
    .replace(to_replace='', value=np.nan)
    .dropna(axis='columns')
)

In [191]:
# Only the columns that actually carry a unit are left.
units

Unnamed: 0,serving_size,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,pantothenic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,g,g,g,mg,mg,mg,mcg,mcg,mg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


In [196]:
# Build the rename mapping: old column name -> "<old name>_<unit>".
mapper = {
    column: "_".join((column, units[column].at[0]))
    for column in units.columns
}

In [197]:
# Inspect the mapping before applying it.
mapper

{'serving_size': 'serving_size_g',
 'total_fat': 'total_fat_g',
 'saturated_fat': 'saturated_fat_g',
 'cholesterol': 'cholesterol_mg',
 'sodium': 'sodium_mg',
 'choline': 'choline_mg',
 'folate': 'folate_mcg',
 'folic_acid': 'folic_acid_mcg',
 'niacin': 'niacin_mg',
 'pantothenic_acid': 'pantothenic_acid_mg',
 'riboflavin': 'riboflavin_mg',
 'thiamin': 'thiamin_mg',
 'vitamin_a': 'vitamin_a_IU',
 'vitamin_a_rae': 'vitamin_a_rae_mcg',
 'carotene_alpha': 'carotene_alpha_mcg',
 'carotene_beta': 'carotene_beta_mcg',
 'cryptoxanthin_beta': 'cryptoxanthin_beta_mcg',
 'lutein_zeaxanthin': 'lutein_zeaxanthin_mcg',
 'vitamin_b12': 'vitamin_b12_mcg',
 'vitamin_b6': 'vitamin_b6_mg',
 'vitamin_c': 'vitamin_c_mg',
 'vitamin_d': 'vitamin_d_IU',
 'vitamin_e': 'vitamin_e_mg',
 'tocopherol_alpha': 'tocopherol_alpha_mg',
 'vitamin_k': 'vitamin_k_mcg',
 'calcium': 'calcium_mg',
 'copper': 'copper_mg',
 'irom': 'irom_mg',
 'magnesium': 'magnesium_mg',
 'manganese': 'manganese_mg',
 'phosphorous': 'phospho

In [199]:
# Append the unit to every column name listed in mapper; columns missing from
# the mapping keep their name. The result is reassigned rather than applied
# in place, matching the reassignment pattern used elsewhere in the notebook.
nutrition = nutrition.rename(columns=mapper)

In [201]:
# Column names now carry their unit; the cell values still contain the unit text.
nutrition.head()

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


## **Part III: Removing Units from Values**

In [203]:
# Strip every letter from the string cells so that only the numeric text is
# left (the units now live in the column names). Reassigned instead of
# mutated in place, consistent with the astype() step that follows.
nutrition = nutrition.replace('[a-zA-Z]', '', regex=True)

In [204]:
# With the letters gone, every column can be converted to float.
nutrition = nutrition.astype(float)

In [205]:
# Confirm the cleanup: all columns should now be float64.
nutrition.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
Index: 8789 entries, Cornstarch to Beef, raw, all grades, trimmed to 0" fat, separable lean only, boneless, eye of round steak, round
Columns: 75 entries, serving_size_g to water_g
dtypes: float64(75)
memory usage: 5.1+ MB
