# Chapter 11

In [1]:
import pandas as pd

# Zipped vehicles CSV, read directly from GitHub.
url = "https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip"
# pyarrow is used both as the CSV parser and as the backing dtype of the columns.
df = pd.read_csv(url, engine="pyarrow", dtype_backend="pyarrow")

# Cylinders column, reused by the cells below.
cyl = df["cylinders"]

In [2]:
# Number of missing entries in the cylinders column.
cyl.isna().sum()

206

In [3]:
# Boolean mask that is True where the cylinder count is missing.
missing = cyl.isna()
# Makes of the rows selected by that mask.
df.loc[missing, "make"]

7138     Nissan
7139     Toyota
8143     Toyota
8144       Ford
8146       Ford
          ...  
34563     Tesla
34564     Tesla
34565     Tesla
34566     Tesla
34567     Tesla
Name: make, Length: 206, dtype: string[pyarrow]

What does interpolation do in cases where the index isn't simply a progression of integers? Does its behavior change?

In [4]:
index = ["fish", "bird", "cat", "rat", "dog", "snake"]
vals = [32, 40, None, 42, 39, 32]
s = pd.Series(vals, index=index, dtype="float[pyarrow]")
s

fish     32.0
bird     40.0
cat      <NA>
rat      42.0
dog      39.0
snake    32.0
dtype: float[pyarrow]

In [5]:
# "linear" is the default method, spelled out here. The gap at "cat" is filled
# from the neighbouring values (40 and 42), not from the string labels.
s.interpolate(method="linear")

fish     32.0
bird     40.0
cat      41.0
rat      42.0
dog      39.0
snake    32.0
dtype: float[pyarrow]

In [6]:
# City mileage column, reused by the cells below.
city_mpg = df["city08"]

# .loc slices by label and includes the end label, so row 446 is part of the result.
city_mpg.loc[:446]

0      19
1       9
2      23
3      10
4      17
       ..
442    15
443    15
444    15
445    15
446    31
Name: city08, Length: 447, dtype: int64[pyarrow]

In [7]:
# The bounds are the median and the 95th percentile of the whole column,
# not of the displayed slice.
(city_mpg
 .loc[:446]
 .clip(lower=city_mpg.quantile(0.5),
       upper=city_mpg.quantile(0.95))
)

0      19
1      17
2      23
3      17
4      17
       ..
442    17
443    17
444    17
445    17
446    27
Name: city08, Length: 447, dtype: int64[pyarrow]

In [8]:
# Manufacturer column, reused by the exercises below.
make = df["make"]
# Whole-value replacement: only entries equal to "Subaru" are swapped.
make.replace(to_replace="Subaru", value="スバル")

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4               スバル
            ...    
41139           スバル
41140           スバル
41141           スバル
41142           スバル
41143           スバル
Name: make, Length: 41144, dtype: string[pyarrow]

## Exercises

In [9]:
#1:
# Compute the mean once up front. Calling city_mpg.mean() inside the lambda
# would recompute it for every element of the series.
mean_mpg = city_mpg.mean()
city_mpg.apply(lambda x: "high" if x >= mean_mpg else "low")

0        high
1         low
2        high
3         low
4         low
         ... 
41139    high
41140    high
41141     low
41142     low
41143     low
Name: city08, Length: 41144, dtype: object

In [10]:
#2:
# Mean computed once, then each (condition, replacement) pair labels the rows it matches.
m = city_mpg.mean()
(city_mpg
 .case_when(caselist=[
     (city_mpg >= m, "high"),
     (city_mpg < m, "low"),
 ])
)

0        high
1         low
2        high
3         low
4         low
         ... 
41139    high
41140    high
41141     low
41142     low
41143     low
Name: city08, Length: 41144, dtype: object

In [11]:
%%timeit
# Times the .apply() approach. The lambda calls city_mpg.mean() for every
# element, so that repeated work is part of the measurement.
city_mpg.apply(lambda x: "high" if x >= city_mpg.mean() else "low")

1.15 s ± 6.96 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%%timeit

# Times the case_when approach. The mean is computed once per timed run and
# then reused by both conditions.
m = city_mpg.mean()
city_mpg.case_when([(city_mpg.ge(m), "high"), (city_mpg.lt(m), "low")])

1.56 ms ± 11.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [13]:
#3:
# Presumably the gap between the two timings in milliseconds
# (1_110 ms = 1.11 s for .apply, 1.53 ms for case_when).
# NOTE(review): the timeit outputs recorded above read 1.15 s and 1.56 ms, so
# these literals look like they come from an earlier run — confirm or refresh.
1_110 - 1.53

1108.47

In [14]:
#4:
# Fill any missing city mileage with the column median.
city_mpg.fillna(value=city_mpg.median())

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64[pyarrow]

In [15]:
#5:
# Limit values to the range between the 10th and 90th percentiles.
(city_mpg
 .clip(lower=city_mpg.quantile(0.1),
       upper=city_mpg.quantile(0.9))
)

0        19
1        13
2        23
3        13
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64[pyarrow]

In [16]:
#6:
# The five most frequent makes, ranked by value_counts.
top5 = make.astype("category").value_counts().head(5).index
# Register "Other" as a category so it can be used as the replacement label.
make_cat = make.astype("category").cat.add_categories("Other")
# Keep the top makes, relabel everything else, then drop categories no longer present.
(make_cat
 .where(make_cat.isin(top5), "Other")
 .cat.remove_unused_categories()
)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: category
Categories (6, object): ['Chevrolet', 'Dodge', 'Ford', 'GMC', 'Toyota', 'Other']

In [17]:
#7:
# The ten most frequent makes, ranked by value_counts.
top10 = make.astype("category").value_counts().head(10).index
# Register "Other" as a category so it can be used as the replacement label.
make_cat = make.astype("category").cat.add_categories("Other")
# Keep the top makes, relabel everything else, then drop categories no longer present.
(make_cat
 .where(make_cat.isin(top10), "Other")
 .cat.remove_unused_categories()
)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: category
Categories (11, object): ['BMW', 'Chevrolet', 'Dodge', 'Ford', ..., 'Nissan', 'Toyota', 'Volkswagen', 'Other']

In [18]:
#8:
def replace_not_in_top(cat_ser, n, other="Other"):
    """Keep the ``n`` most frequent values of a series and lump the rest together.

    Parameters
    ----------
    cat_ser : pandas.Series
        Series to collapse. A categorical series is expected; any other dtype
        is converted with ``astype("category")`` first.
    n : int
        Number of most frequent values to keep, as ranked by ``value_counts``.
    other : str, default "Other"
        Label that replaces every value outside the top ``n``.

    Returns
    -------
    pandas.Series
        Categorical series. Categories that no longer occur (including
        ``other`` when nothing was replaced) are removed.

    Notes
    -----
    Missing values are not part of the top values, so they are replaced by
    ``other`` as well.
    """
    # The .cat accessor only exists on categorical data.
    if not isinstance(cat_ser.dtype, pd.CategoricalDtype):
        cat_ser = cat_ser.astype("category")
    top_n = cat_ser.value_counts().index[:n]
    # add_categories raises ValueError when the label is already a category,
    # so only register it when it is new.
    if other not in cat_ser.cat.categories:
        cat_ser = cat_ser.cat.add_categories(other)
    return cat_ser.where(cat_ser.isin(top_n), other).cat.remove_unused_categories()

In [19]:
#9:
# Ten bins of equal width.
pd.cut(city_mpg, bins=10)

0        (5.856, 20.4]
1        (5.856, 20.4]
2         (20.4, 34.8]
3        (5.856, 20.4]
4        (5.856, 20.4]
             ...      
41139    (5.856, 20.4]
41140    (5.856, 20.4]
41141    (5.856, 20.4]
41142    (5.856, 20.4]
41143    (5.856, 20.4]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.856, 20.4] < (20.4, 34.8] < (34.8, 49.2] < (49.2, 63.6] ... (92.4, 106.8] < (106.8, 121.2] < (121.2, 135.6] < (135.6, 150.0]]

In [20]:
#10:
# Ten quantile-based bins (deciles).
pd.qcut(city_mpg, q=10)

0         (18.0, 20.0]
1        (5.999, 13.0]
2         (21.0, 24.0]
3        (5.999, 13.0]
4         (16.0, 17.0]
             ...      
41139     (18.0, 20.0]
41140     (18.0, 20.0]
41141     (17.0, 18.0]
41142     (17.0, 18.0]
41143     (15.0, 16.0]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.999, 13.0] < (13.0, 14.0] < (14.0, 15.0] < (15.0, 16.0] ... (18.0, 20.0] < (20.0, 21.0] < (21.0, 24.0] < (24.0, 150.0]]