In [None]:
# load pandas
import timeit

import numpy as np
import pandas as pd

In [None]:
url = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip'
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass; the default chunked inference appears to emit a DtypeWarning
# for this dataset (mixed-type columns).
df = pd.read_csv(url, low_memory=False)

# pull out the mileage columns used in the rest of the notebook
city_mpg = df['city08']
highway_mpg = df['highway08']

  df = pd.read_csv(url)


## 6. Manipulation Methods

- the workhorse of pandas
- data processing and cleaning
- most methods discussed here manipulate series values but preserve the index

In [None]:
def gt20(val):
    """Return whether ``val`` is strictly greater than 20."""
    threshold = 20
    return val > threshold

In [None]:
# .apply calls the given function once per element and returns a new Series
# holding the results (here a boolean Series with the same index)
city_mpg.apply(gt20)

0        False
1        False
2         True
3        False
4        False
         ...  
41139    False
41140    False
41141    False
41142    False
41143    False
Name: city08, Length: 41144, dtype: bool

In [None]:
make = df['make']
# number of rows per manufacturer, most frequent first
make.value_counts()

make
Chevrolet                      4003
Ford                           3371
Dodge                          2583
GMC                            2494
Toyota                         2071
                               ... 
Volga Associated Automobile       1
Panos                             1
Mahindra                          1
Excalibur Autos                   1
London Coach Co Inc               1
Name: count, Length: 136, dtype: int64

In [None]:
# labels of the five most frequent makes
top5 = make.value_counts().index[:5]


def generalize_top5(val):
    """Return ``val`` when it is one of the ``top5`` makes, otherwise 'Other'."""
    return val if val in top5 else 'Other'


make.apply(generalize_top5)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [None]:
# .isin returns a boolean Series: True where the make is one of top5
make.isin(top5)

0        False
1        False
2         True
3         True
4        False
         ...  
41139    False
41140    False
41141    False
41142    False
41143    False
Name: make, Length: 41144, dtype: bool

In [None]:
# .where keeps the values where the condition is True and
# substitutes `other` everywhere else
make.where(
    make.isin(top5),
    other='Other'
)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [None]:
# tilde ~ inverts the boolean mask, so here the top5 makes are the ones
# replaced by 'Other' and every other make is kept
make.where(
    ~make.isin(top5),
    other='Other'
)

0        Alfa Romeo
1           Ferrari
2             Other
3             Other
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

Missing Data

In [None]:
cyl = df['cylinders']
# number of rows whose cylinder value is missing
cyl.isna().sum()

206

In [None]:
# boolean mask marking the rows that have no cylinder value
missing = cyl.isna()
# makes of the vehicles selected by that mask
make[missing]

7138     Nissan
7139     Toyota
8143     Toyota
8144       Ford
8146       Ford
          ...  
34563     Tesla
34564     Tesla
34565     Tesla
34566     Tesla
34567     Tesla
Name: make, Length: 206, dtype: object

It seems that cylinder information is missing for electric cars: they have an electric motor rather than a combustion engine, and therefore 0 cylinders.

In [None]:
# fillna returns a new Series in which every missing value is replaced by the
# given one; the rows without a cylinder value appear to be electric cars,
# so 0 is used as the replacement
cyl.fillna(0)

0         4.0
1        12.0
2         4.0
3         8.0
4         4.0
         ... 
41139     4.0
41140     4.0
41141     4.0
41142     4.0
41143     4.0
Name: cylinders, Length: 41144, dtype: float64

Possible options for handling missing values:

1. drop missing values: dropna()
2. forward fill: ffill()
3. backward fill: bfill()
4. fill missing value: fillna(data.mean()) -- in this case filling with mean value
5. interpolate: interpolate() -- handy if the data is ordered


In [None]:
# sort_values orders the Series by value (ascending here);
# each value keeps its original index label
city_mpg.sort_values()

7901       6
34557      6
37161      6
21060      6
35887      6
        ... 
34563    138
34564    140
32599    150
31256    150
33423    150
Name: city08, Length: 41144, dtype: int64

In [None]:
# sort_index orders by index label, which brings the value-sorted Series
# back into its original row order
city_mpg.sort_values().sort_index()

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

In [None]:
# drop_duplicates returns one row per distinct value
# (by default the first occurrence is the one kept)
city_mpg.drop_duplicates()

0         19
1          9
2         23
3         10
4         17
        ... 
34364    127
34409    114
34564    140
34565    115
34566    104
Name: city08, Length: 105, dtype: int64

In [None]:
# replace maps values to new values: every 'Subaru' entry becomes 'Sub',
# all other entries are left as they are
make.replace('Subaru', 'Sub')

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4               Sub
            ...    
41139           Sub
41140           Sub
41141           Sub
41142           Sub
41143           Sub
Name: make, Length: 41144, dtype: object

Exercise 9.14

In [None]:
# ex 9.14

#q1
mean = city_mpg.mean()


def high_low(val):
    """Label ``val`` 'high' when it is at or above the city-mpg mean, else 'low'."""
    return 'high' if val >= mean else 'low'


hl = city_mpg.apply(high_low)
hl

0        high
1         low
2        high
3         low
4         low
         ... 
41139    high
41140    high
41141     low
41142     low
41143     low
Name: city08, Length: 41144, dtype: object

In [None]:
#q2

# np.select takes, per element, the choice paired with the first condition
# that holds and falls back to `default` when none does
conditions = [city_mpg >= mean]
choices = ['high']

hl = np.select(condlist=conditions, choicelist=choices, default='low')
hl

array(['high', 'low', 'high', ..., 'low', 'low', 'low'], dtype='<U4')

In [None]:
#q3
# total seconds for 1000 runs of each approach: element-wise .apply
# versus the vectorised np.select
apply_time = timeit.timeit(lambda: city_mpg.apply(high_low), number=1000)
select_time = timeit.timeit(
    lambda: np.select(conditions, choices, default='low'),
    number=1000,
)

print(apply_time, select_time)

3.2580640999949537 0.12450910004554316


In [None]:
#q4
# missing cylinder values before filling ...
print(cyl.isna().sum())
# ... and after replacing them with the median of the column
cyl.fillna(cyl.median()).isna().sum()

206


0

In [None]:
#q5
# the 10th and 90th percentiles serve as the clipping bounds
low, high = city_mpg.quantile(.1), city_mpg.quantile(.9)

print(low, high)
print(city_mpg.min(), city_mpg.max())

# values outside [low, high] are moved to the nearest bound
clip = city_mpg.clip(lower=low, upper=high)
print(clip.min(), clip.max())

13.0 24.0
6 150
13 24


In [None]:
#q6
# labels of the five most frequent makes
top5 = make.value_counts().index[:5]


def generalize(val):
    """Return ``val`` if it is among the ``top5`` makes, otherwise 'Others'."""
    if val not in top5:
        return 'Others'
    return val


make.apply(generalize)

0        Others
1        Others
2         Dodge
3         Dodge
4        Others
          ...  
41139    Others
41140    Others
41141    Others
41142    Others
41143    Others
Name: make, Length: 41144, dtype: object

In [None]:
#q7
vc = make.value_counts()
# labels of the ten most frequent makes
top10 = vc.index[:10]


def generalize(val):
    """Return ``val`` if it is among the ``top10`` makes, otherwise 'Others'.

    Note: this re-defines (and so shadows) the top-5 ``generalize`` from q6.
    """
    return val if val in top10 else 'Others'


make.apply(generalize)

0        Others
1        Others
2         Dodge
3         Dodge
4        Others
          ...  
41139    Others
41140    Others
41141    Others
41142    Others
41143    Others
Name: make, Length: 41144, dtype: object

In [None]:
#q8

vc = make.value_counts()


def generalize_n(val, n):
    """Return ``val`` if it is one of the ``n`` most frequent makes, else 'Others'.

    Membership is tested directly against the first ``n`` labels of
    ``vc.index``, which are sliced anew on every call.
    """
    return val if val in vc.index[:n] else 'Others'

def generalize_n_set(val, n):
    """Return ``val`` if it is one of the ``n`` most frequent makes, else 'Others'.

    The first ``n`` labels of ``vc.index`` are copied into a ``set`` before
    the membership test; note that the set is rebuilt on every call.
    """
    frequent = set(vc.index[:n])
    return val if val in frequent else 'Others'

# total seconds for 10 full passes over `make` with each variant;
# extra keyword arguments to .apply (n=10) are forwarded to the function
t1 = timeit.timeit(lambda: make.apply(generalize_n, n=10), number=10)
t2 = timeit.timeit(lambda: make.apply(generalize_n_set, n=10), number=10)

print(t1, t2)
# set is faster

1.6970072999829426 0.8374150999588892


In [None]:
#q9
# pd.cut with an integer splits the value range into that many
# equal-width bins and labels each value with its interval
pd.cut(city_mpg, 10)

0        (5.856, 20.4]
1        (5.856, 20.4]
2         (20.4, 34.8]
3        (5.856, 20.4]
4        (5.856, 20.4]
             ...      
41139    (5.856, 20.4]
41140    (5.856, 20.4]
41141    (5.856, 20.4]
41142    (5.856, 20.4]
41143    (5.856, 20.4]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.856, 20.4] < (20.4, 34.8] < (34.8, 49.2] < (49.2, 63.6] ... (92.4, 106.8] < (106.8, 121.2] < (121.2, 135.6] < (135.6, 150.0]]

In [None]:
#q10
# pd.qcut bins by quantile instead, so the 10 bins hold roughly the same
# number of rows and therefore have unequal widths
pd.qcut(city_mpg, 10)

0         (18.0, 20.0]
1        (5.999, 13.0]
2         (21.0, 24.0]
3        (5.999, 13.0]
4         (16.0, 17.0]
             ...      
41139     (18.0, 20.0]
41140     (18.0, 20.0]
41141     (17.0, 18.0]
41142     (17.0, 18.0]
41143     (15.0, 16.0]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.999, 13.0] < (13.0, 14.0] < (14.0, 15.0] < (15.0, 16.0] ... (18.0, 20.0] < (20.0, 21.0] < (21.0, 24.0] < (24.0, 150.0]]