In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Handling Missing Data

In [2]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [3]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

# Nan ( Not a Number) - NA (Not availabe )

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

When cleaning up data for analysis, it is often important to do analysis on the missing data itself to identify data collection problems or potential biases in the data caused by missing data

In [5]:
string_data[0] = None

In [6]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

# Filtering Out Missing Data

In [7]:
from numpy import nan as NA

In [8]:
data = pd.Series([1, NA, 3.5, NA, 7])

In [9]:
data.dropna() # equivalent data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

With DataFrame objects, dropna by default drops any row containing a missing value

In [10]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                   [NA, NA, NA], [NA, 6.5, 3.]])

In [11]:
cleaned = data.dropna()

In [12]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [13]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Passing how='all' will only drop rows that are all NA:

In [14]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


To drop columns in the same way, pass axis=1:

In [15]:
data[4] = NA

In [16]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [17]:
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


# keep only rows containing a certain number of observations

A related way to filter out DataFrame rows tends to concern time series data. 
- thresh argument

In [18]:
df = pd.DataFrame(np.random.randn(7, 3))

In [19]:
df.iloc[:4, 1] = NA

In [20]:
df.iloc[:2, 2] = NA

In [21]:
df

Unnamed: 0,0,1,2
0,0.281319,,
1,0.2943,,
2,-0.043846,,-1.036343
3,1.140109,,0.223232
4,-0.335847,0.884438,-0.495921
5,0.264156,-1.656297,0.873744
6,0.245779,-0.247939,0.808261


In [22]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.335847,0.884438,-0.495921
5,0.264156,-1.656297,0.873744
6,0.245779,-0.247939,0.808261


In [23]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.043846,,-1.036343
3,1.140109,,0.223232
4,-0.335847,0.884438,-0.495921
5,0.264156,-1.656297,0.873744
6,0.245779,-0.247939,0.808261


# Filling In Missing Data

In [24]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.281319,0.0,0.0
1,0.2943,0.0,0.0
2,-0.043846,0.0,-1.036343
3,1.140109,0.0,0.223232
4,-0.335847,0.884438,-0.495921
5,0.264156,-1.656297,0.873744
6,0.245779,-0.247939,0.808261


Calling fillna with a dict, you can use a different fill value for each column

In [25]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.281319,0.5,0.0
1,0.2943,0.5,0.0
2,-0.043846,0.5,-1.036343
3,1.140109,0.5,0.223232
4,-0.335847,0.884438,-0.495921
5,0.264156,-1.656297,0.873744
6,0.245779,-0.247939,0.808261


In [26]:
_ = df.fillna(0, inplace=True)

In [27]:
df

Unnamed: 0,0,1,2
0,0.281319,0.0,0.0
1,0.2943,0.0,0.0
2,-0.043846,0.0,-1.036343
3,1.140109,0.0,0.223232
4,-0.335847,0.884438,-0.495921
5,0.264156,-1.656297,0.873744
6,0.245779,-0.247939,0.808261


In [28]:
df = pd.DataFrame(np.random.randn(6, 3))

In [29]:
df.iloc[2:, 1] = NA

In [30]:
df.iloc[4:, 2] = NA

In [31]:
df

Unnamed: 0,0,1,2
0,-0.450082,-0.980078,-0.279596
1,1.588831,0.203643,-0.66211
2,0.654914,,2.363929
3,-0.654482,,-0.437989
4,-0.14654,,
5,-0.469121,,


In [32]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-0.450082,-0.980078,-0.279596
1,1.588831,0.203643,-0.66211
2,0.654914,0.203643,2.363929
3,-0.654482,0.203643,-0.437989
4,-0.14654,0.203643,-0.437989
5,-0.469121,0.203643,-0.437989


In [33]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,-0.450082,-0.980078,-0.279596
1,1.588831,0.203643,-0.66211
2,0.654914,0.203643,2.363929
3,-0.654482,0.203643,-0.437989
4,-0.14654,,-0.437989
5,-0.469121,,-0.437989


In [34]:
data = pd.Series([1., NA, 3.5, NA, 7])

In [35]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# Data Transformation

# 1 - Removing Duplicates

Duplicate rows may be found in a DataFrame for any number of reasons. Here is an example

In [36]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                    'k2': [1, 1, 2, 3, 3, 4, 4]})

In [37]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [38]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [39]:
# Drop Duplicates
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


Both of these methods by default consider all of the columns;

- alternatively, you can specify any subset of them to detect duplicates
- Suppose we had an additional column of values and wanted to filter duplicates only based on the 'k1' column

In [40]:
data['v1'] = range(7)

In [41]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [42]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


duplicated and drop_duplicates by default keep the first observed value combina‐ tion. Passing keep='last' will return the last one

In [43]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


# 2 -Transforming Data Using a Function or Mapping

For many datasets, you may wish to perform some transformation based on the val‐ ues in an array, Series, or column in a DataFrame

In [44]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                             'Pastrami', 'corned beef', 'Bacon',
                             'Pastrami', 'honey ham', 'nova lox'],
                    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [45]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,Pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


Suppose you wanted to add a column indicating the type of animal that each food came from

In [55]:
meat_to_animal = {
    'bacon':'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'pig',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

The map method on a Series accepts a function or dict-like object containing a map‐ ping, but here we have a small problem in that some of the meats are capitalized and others are not. Thus, we need to convert each value to lowercase using the str.lower Series method

In [53]:
data['animal'] = data['food'].str.lower().map(meat_to_animal)

In [54]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,pig
5,Bacon,8.0,pig
6,Pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [56]:
# We could also have passed a lambda function as follows:
data['animal'] = data['food'].map(lambda x: meat_to_animal[x.lower()])

In [57]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,pig
5,Bacon,8.0,pig
6,Pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


Using map is a convenient way to perform element-wise transformations and other data cleaning–related operations.

# 3 - Replacing Values

In [60]:
data = pd.Series([1, -999., 2., -999., -1000., 3.])

In [61]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [62]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

If you want to replace multiple values at once, you instead pass a list and then the substitute value

In [63]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

To use a different replacement for each value, pass a list of substitutes:

In [64]:
data.replace([-999, -1000], [np.nan, 0])# could be also data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

# 4 - Renaming Axis Indexes

Like values in a Series, axis labels can be similarly transformed by a function or map‐ ping of some form to produce new, differently labeled objects

In [103]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                   index=['Ohio', 'Colorado', 'New York'],
                   columns=['one', 'two', 'three', 'four'])

In [104]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [105]:
transform = lambda x: x[:4].upper()

In [106]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

You can assign to index, modifying the DataFrame in-place:

In [107]:
data.index = data.index.map(transform)

In [108]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


If you want to create a transformed version of a dataset without modifying the origi‐ nal, a useful method is rename:

In [109]:
data = data.rename(index=str.title, columns=str.upper)

In [110]:
data.index = data.index.map(transform)

In [111]:
data

Unnamed: 0,ONE,TWO,THREE,FOUR
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [112]:
data.rename(index={'OHIO': 'INDIANA'},
           columns={'THREE': 'peekaboo'})

Unnamed: 0,ONE,TWO,peekaboo,FOUR
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [113]:
#data.rename(index={'Ohio': 'INDIANA', 'colo': 'COLORADO', 'New York': 'NY'},
#           columns={'THREE': 'peekaboo'})

In [114]:
data

Unnamed: 0,ONE,TWO,THREE,FOUR
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


Should you wish to modify a dataset in-place, pass inplace=True:

In [119]:
data.rename(index={'OHIO': 'INDIANA', 'COLO': 'COLUMBUS'}, inplace=True)

In [120]:
data

Unnamed: 0,ONE,TWO,THREE,FOUR
INDIANA,0,1,2,3
COLUMBUS,4,5,6,7
NEW,8,9,10,11


# 5 - Discretization and Binning

In [121]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [122]:
bins = [18, 25, 35, 60, 100]

In [123]:
cats = pd.cut(ages, bins)

In [124]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [125]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [126]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [127]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

You can change which side is closed by passing right=False:

In [128]:
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

You can also pass your own bin names by passing a list or array to the labels option:

In [129]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [131]:
main_cat = pd.cut(ages, bins, labels=group_names)

In [132]:
main_cat

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [133]:
main_cat.categories

Index(['Youth', 'YoungAdult', 'MiddleAged', 'Senior'], dtype='object')

In [134]:
pd.value_counts(main_cat)

Youth         5
MiddleAged    3
YoungAdult    3
Senior        1
dtype: int64

# 6 - Detecting and Filtering Outliers

Filtering or transforming outliers is largely a matter of applying array operations.

In [140]:
data = pd.DataFrame(np.random.randn(1000, 4))

In [141]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.02406,-0.051872,-0.032175,-0.022896
std,1.026669,0.964813,0.989047,0.980504
min,-3.622238,-3.076936,-3.308342,-3.132424
25%,-0.7278,-0.692903,-0.707779,-0.714891
50%,-0.006102,-0.064761,-0.060624,-0.021898
75%,0.653032,0.607041,0.664058,0.614206
max,3.095921,3.617028,2.652857,3.083208


Suppose you wanted to find values in one of the columns exceeding 3 in absolute value:

In [142]:
col = data[2]

In [143]:
col[np.abs(col) > 3 ]

182   -3.049068
616   -3.070584
685   -3.308342
Name: 2, dtype: float64

To select all rows having a value exceeding 3 or –3, you can use the any method on a boolean DataFrame:

In [144]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
86,-1.366302,3.617028,-0.551225,-0.602675
115,0.213234,0.566003,-2.179026,-3.119776
118,-0.667666,3.063248,1.803496,0.703642
182,1.335266,-1.190504,-3.049068,0.664777
250,-3.622238,-0.74325,-0.785503,0.899024
297,-3.49264,-0.44498,1.555541,-0.664204
586,-0.488976,-1.706084,-2.191964,-3.058276
603,0.229119,0.637276,0.270643,3.083208
616,0.191352,1.19328,-3.070584,-0.816593
653,-0.942732,3.537073,0.269749,0.0171


# 7 - Permutation and Random Sampling

In [145]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))

In [146]:
sampler = np.random.permutation(5)

In [147]:
sampler

array([1, 3, 2, 4, 0])

In [148]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [149]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
2,8,9,10,11
4,16,17,18,19
0,0,1,2,3


To select a random subset without replacement, you can use the sample method on Series and DataFrame:

In [150]:
df.sample(3)

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
0,0,1,2,3


# 8 - Computing Indicator/Dummy Variables

Another type of transformation for statistical modeling or machine learning applica‐ tions is converting a categorical variable into a “dummy” or “indicator” matrix. If a column in a DataFrame has k distinct values, you would derive a matrix or Data‐ Frame with k columns containing all 1s and 0s

In [154]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                  'data1': range(6)})

In [155]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [156]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In some cases, you may want to add a prefix to the columns in the indicator Data‐ Frame, which can then be merged with the other data. get_dummies has a prefix argu‐ ment for doing this

In [157]:
dummies = pd.get_dummies(df['key'], prefix='key')

In [158]:
df_with_dummy = df[['data1']].join(dummies)

In [159]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [5]:
!ls "../data/raw/movielens/"

README.txt        genome-tags.csv   movies.csv        tags.csv
genome-scores.csv links.csv         ratings.csv


In [6]:
!head -2 "../data/raw/movielens/movies.csv"

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [2]:
mnames = ['movie_id', 'title', 'genres']

In [3]:
movies = pd.read_csv('../data/raw/movielens/movies.csv', sep=',', header=None, names=mnames)

In [4]:
movies[:10]

Unnamed: 0,movie_id,title,genres
0,movieId,title,genres
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,2,Jumanji (1995),Adventure|Children|Fantasy
3,3,Grumpier Old Men (1995),Comedy|Romance
4,4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,5,Father of the Bride Part II (1995),Comedy
6,6,Heat (1995),Action|Crime|Thriller
7,7,Sabrina (1995),Comedy|Romance
8,8,Tom and Huck (1995),Adventure|Children
9,9,Sudden Death (1995),Action


In [5]:
all_genres = []

In [6]:
for x in movies['genres'][1:]:
    all_genres.extend(x.split('|'))

In [7]:
all_genres[:10]

['Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Fantasy',
 'Adventure',
 'Children',
 'Fantasy',
 'Comedy',
 'Romance']

In [8]:
genres = pd.unique(all_genres)

In [9]:
genres

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'Documentary', 'War', 'Musical',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

One way to construct the indicator DataFrame is to start with a DataFrame of all zeros:

In [10]:
zero_matrix = np.zeros((len(movies), len(genres)))

In [11]:
dummies = pd.DataFrame(zero_matrix, columns=genres)

Now, iterate through each movie and set entries in each row of dummies to 1. To do this, we use the dummies.columns to compute the column indices for each genre

In [12]:
gen = movies.genres[1]

In [13]:
gen

'Adventure|Animation|Children|Comedy|Fantasy'

In [14]:
gen.split('|')

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']

In [15]:
dummies.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2, 3, 4])

Then, we can use .iloc to set values based on these indices:

In [17]:
for i, gen in enumerate(movies['genres']):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [18]:
dummies.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Then, as before, you can combine this with movies:

In [19]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))

In [21]:
movies_windic.iloc[2]

movie_id                                             2
title                                   Jumanji (1995)
genres                      Adventure|Children|Fantasy
Genre_Adventure                                      1
Genre_Animation                                      0
Genre_Children                                       1
Genre_Comedy                                         0
Genre_Fantasy                                        1
Genre_Romance                                        0
Genre_Drama                                          0
Genre_Action                                         0
Genre_Crime                                          0
Genre_Thriller                                       0
Genre_Horror                                         0
Genre_Mystery                                        0
Genre_Sci-Fi                                         0
Genre_IMAX                                           0
Genre_Documentary                                    0
Genre_War 

# 9 - String Manipulation

In [22]:
val = 'a,b, guido'

In [23]:
val.split(',')

['a', 'b', ' guido']

split is often combined with strip to trim whitespace (including line breaks):

In [24]:
pieces = [x.strip() for x in val.split(',')]

In [25]:
pieces

['a', 'b', 'guido']

In [26]:
first, second, third = pieces

In [27]:
first + '::' + second + '::' + third

'a::b::guido'

In [28]:
# OR
'::'.join(pieces)

'a::b::guido'

In [29]:
'guido' in val

True

In [30]:
val.find(':')

-1

Note the difference between find and index is that index raises an exception if the
string isn’t found (versus returning –1):

In [31]:
val.index(':')

ValueError: substring not found

Relatedly, count returns the number of occurrences of a particular substring:

In [32]:
val.count(',')

2

In [33]:
val.replace(',', '::')

'a::b:: guido'

In [35]:
val.replace(',','')

'ab guido'

- count : Return the number of non-overlapping occurrences of substring in the string.
- endswith : ReturnsTrueif string ends with suffix
- startswith : ReturnsTrueif string starts with prefix.
- join : Use string as delimiter for concatenating a sequence of other strings.
- index: Return position of first character in substring if found in the string; raises ValueError if not found.
- find: Return position of first character of first occurrence of substring in the string; like index, but returns (–1)if not found.
- rfind: Return position of first character of last occurrence of substring in the string; returns –1 if not found
- replace: Replace occurrences of string with another string.
- strip,: Trim whitespace, including newlines; equivalent to x.strip() (and rstrip, lstrip, respectively) for each element
- rstrip
- lstrip
- split : Break string into list of substrings using passed delimiter.
- lower
- upper
- casefold: Convert characters to lowercase, and convert any region-specific variable character combinations to a common comparable form.
- ljust,: Left justify or right justify, respectively; pad opposite side of string with spaces (or some other fill character) to return a string with a minimum width.
- rjust

# 10 - Regular Expressions

Regular expressions provide a flexible way to search or match (often more complex) string patterns in text. The re module functions fall into three categories: pattern matching, substitution, and splitting

In [42]:
import re

In [43]:
text = "foo  bar\t baz  \tqux"

In [44]:
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

If, instead, you wanted to get a list of all patterns matching the regex, you can use the
findall method:

In [46]:
re.findall('\s+',text)

['  ', '\t ', '  \t']

To avoid unwanted escaping with \ in a regular expression, use raw
string literals like r'C:\x' instead of the equivalent 'C:\\x'.

Creating a regex object with re.compile is highly recommended if you intend to apply the same expression to many strings; doing so will save CPU cycles.
- match and search are closely related to findall
- findall returns all matches in a string
- search returns only the first match
- match only matches at the beginning of the string

In [51]:
texte = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [55]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [56]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [57]:
regex.findall(texte)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [60]:
# search returns a special match object for the first email address in the text

In [61]:
m = regex.search(texte)

In [62]:
m

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

In [63]:
texte[m.start():m.end()]

'dave@google.com'

regex.match returns None, as it only will match if the pattern occurs at the start of the
string:

In [64]:
print(regex.match(text))

None


In [67]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



Suppose you wanted to find email addresses and simultaneously segment each address into its three components: username, domain name, and domain suffix. To do this, put parentheses around the parts of the pattern to segment:

In [68]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [69]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [70]:
m = regex.match('wesm@bright.net')

In [72]:
m.groups()

('wesm', 'bright', 'net')

findall returns a list of tuples when the pattern has groups:

In [73]:
regex.findall(texte)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [74]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', texte))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



# 11 - Vectorized String Functions in pandas

In [75]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
       'Rob': 'rob@gmail.com', 'Wes': np.nan}

In [76]:
data = pd.Series(data)

In [77]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [78]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [79]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [80]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [81]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

There are a couple of ways to do vectorized element retrieval. Either use str.get or index into the str attribute:

In [82]:
matches = data.str.match(pattern, flags=re.IGNORECASE)

In [83]:
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

To access elements in the embedded lists, we can pass an index to either of these functions:

In [84]:
matches.str.get(1)

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [85]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object