In [2]:
# load pandas
import pandas as pd
import numpy as np

# Zipped CSV of the vehicles dataset, read straight from the URL.
url = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip'
df = pd.read_csv(url)

# Pull the columns used throughout the notebook out as standalone Series.
city_mpg = df.city08
highway_mpg = df.highway08
# `make` drives every cell of the string-manipulation section; define it here
# so those cells do not raise NameError after Restart Kernel -> Run All.
make = df.make

## 8. String Manipulation

- By default, pandas stores string data with the `object` dtype.
- pandas 1.0 introduced a dedicated `'string'` dtype; it represents missing values with `pd.NA` rather than `NaN`.

In [135]:
make  # string column as loaded from the CSV; shown with the default `object` dtype

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

In [137]:
# Convert to the dedicated 'string' dtype. This returns a new Series;
# `make` itself keeps its object dtype.
make.astype('string')

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: string

In [138]:
# For low-cardinality string columns, consider the 'category' dtype.
# You keep access to the same string-manipulation methods, with the added
# benefits of memory savings and better performance: operations only need to
# run on the individual categories rather than on every value in the Series.
make.astype('category')

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo', ..., 'Volvo', 'Wallace Environmental', 'Yugo', 'smart']

- Series with `object`, `'string'`, or `'category'` dtype expose a `.str` accessor that provides string-manipulation methods (for `'category'`, the categories themselves must be strings).

In [140]:
# Lowercase every value through the .str accessor (returns a new Series).
make.str.lower()

0        alfa romeo
1           ferrari
2             dodge
3             dodge
4            subaru
            ...    
41139        subaru
41140        subaru
41141        subaru
41142        subaru
41143        subaru
Name: make, Length: 41144, dtype: object

In [141]:
# Plain Python for comparison: index of the first 'R' in a single string.
'Alfa Romeo'.find('R')

5

In [142]:
# Position of the first uppercase 'A' in each make; -1 marks values without one.
make.str.find(sub='A')

0        0
1       -1
2       -1
3       -1
4       -1
        ..
41139   -1
41140   -1
41141   -1
41142   -1
41143   -1
Name: make, Length: 41144, dtype: int64

In [144]:
# Index of the first lowercase 'f' per value. The search is case-sensitive,
# so 'Ferrari' yields -1.
make.str.find(sub='f')

0        2
1       -1
2       -1
3       -1
4       -1
        ..
41139   -1
41140   -1
41141   -1
41142   -1
41143   -1
Name: make, Length: 41144, dtype: int64

In [145]:
# Boolean mask: True where the make begins with the literal prefix 'Alfa'.
make.str.startswith(pat='Alfa')

0         True
1        False
2        False
3        False
4        False
         ...  
41139    False
41140    False
41141    False
41142    False
41143    False
Name: make, Length: 41144, dtype: bool

In [149]:
# Regex search: capture the first character in each make that is neither an
# ASCII letter nor a space, then count how often each such character appears.
(
    make.str
    .extract(r'([^a-z A-Z])', expand=False)
    .value_counts()
)

make
-    1727
.      46
,       9
Name: count, dtype: int64

In [153]:
age = pd.Series(['0-10', '11-15', '11-15', '61-65', '46-50'])
# Lower bound of each range: split on the dash, keep the first column of the
# expanded frame, and cast it to integers.
(
    age.str.split('-', expand=True)
    .iloc[:, 0]
    .astype(int)
)

0     0
1    11
2    11
3    61
4    46
Name: 0, dtype: int32

In [156]:
# Two different "replace" operations, printed back to back for comparison.
# .str.replace works inside each string: every 'A' becomes 'B'.
print(make.str.replace(pat='A', repl='B'))
# Series.replace matches whole values: no make is exactly 'A', so nothing changes.
print(make.replace(to_replace='A', value='B'))

0        Blfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object
0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object


In [157]:
# Series.replace also accepts a mapping of {existing value: replacement}.
replace_dict = {'Alfa Romeo': 'Haha'}
make.replace(to_replace=replace_dict)

0           Haha
1        Ferrari
2          Dodge
3          Dodge
4         Subaru
          ...   
41139     Subaru
41140     Subaru
41141     Subaru
41142     Subaru
41143     Subaru
Name: make, Length: 41144, dtype: object

Exercise 11.9

In [159]:
# q1: lowercase every value of the string column
make.str.lower()

0        alfa romeo
1           ferrari
2             dodge
3             dodge
4            subaru
            ...    
41139        subaru
41140        subaru
41141        subaru
41142        subaru
41143        subaru
Name: make, Length: 41144, dtype: object

In [164]:
# q2: first character of each value
make.str.get(0)

0        A
1        F
2        D
3        D
4        S
        ..
41139    S
41140    S
41141    S
41142    S
41143    S
Name: make, Length: 41144, dtype: object

In [166]:
# q3: last three characters of each value
make.str.slice(start=-3)

0        meo
1        ari
2        dge
3        dge
4        aru
        ... 
41139    aru
41140    aru
41141    aru
41142    aru
41143    aru
Name: make, Length: 41144, dtype: object

In [None]:
# q4: per-value check via .str.isnumeric()
# NOTE(review): this cell has not been executed (In [None]) and the exercise
# wording is not shown here -- confirm this is the intended answer.
make.str.isnumeric()