Transforming Data Using a Function or Mapping

In [7]:
import pandas as pd
import numpy as np
# Sample data: nine food entries (with inconsistent capitalisation) and their weights.
data = pd.DataFrame(
    {
        'food': [
            'bacon', 'pulled pork', 'bacon',
            'Pastrami', 'corned beef', 'Bacon',
            'pastrami', 'honey ham', 'nova lox',
        ],
        'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)

# Lookup table from lower-case food name to the type of animal it came from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

# Normalise case first so that 'Bacon' and 'Pastrami' match the lower-case keys.
lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)
print(data)

# The same mapping expressed as a function applied to each food name.
print(data['food'].map(lambda name: meat_to_animal[name.lower()]))

# Replacing Values
# The result is not assigned, so it is only shown as the cell output.
data.replace(to_replace='nova lox', value=np.nan)

          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object


Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,,6.0,salmon


Renaming Axis Indexes

Like values in a Series, axis labels can be similarly transformed by a function or mapping of some form to produce new, differently labeled objects. You can also modify the axes in-place without creating a new data structure.

In [35]:
import numpy as np
# 3 x 4 frame of the integers 0..11 with state names as row labels.
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)


def transform(x):
    """Return the first four characters of label *x*, upper-cased."""
    return x[:4].upper()


# The mapped index is not assigned to anything here, so data keeps its original labels.
data.index.map(transform)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [36]:
# Modify the DataFrame in place: assigning to data.index replaces its row labels
# with the labels produced by transform.
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


If you want to create a transformed version of a dataset without modifying the original, a useful method is rename

In [37]:
# Pass every index label through str.title and every column label through str.upper.
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


rename can be used in conjunction with a dict-like object providing new values for a subset of the axis labels

In [38]:
# Dict form: only the labels listed as keys ('OHIO' and 'three') are renamed;
# all other labels are left as they are.
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


Should you wish to modify a dataset in-place, pass inplace=True

In [39]:
# inplace=True applies the rename to data itself, so the change is visible
# when data is displayed on the next line.
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


Discretization and Binning

In [40]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# Bin edges giving the groups 18 to 25, 26 to 35, 36 to 60, and 61 and older.
bins = [18, 25, 35, 60, 100]
# pd.cut assigns each age to its bin and returns a special Categorical object.
cats = pd.cut(x=ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [41]:
# For each input value, the integer position of its bin within cats.categories.
cats.codes  # index location of the bin

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [42]:
# The bin intervals themselves, in order.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [43]:
# Count how many values fall into each bin.
# The top-level pd.value_counts function is deprecated (pandas 2.1); wrapping the
# Categorical in a Series and calling the method is the supported spelling.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

Consistent with mathematical notation for intervals, a parenthesis means that the side is open, while the square bracket means it is closed (inclusive). You can change which side is closed by passing right=False

In [44]:
# right=False makes each bin closed on the left and open on the right, e.g. [18, 26).
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

You can also pass your own bin names by passing a list or array to the labels option

In [45]:
# One label per bin, listed in the same order as the bins.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
# labels= replaces the interval notation with these names in the result.
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

If you pass an integer number of bins to cut instead of explicit bin edges, it will compute equal-length bins based on the minimum and maximum values in the data:

In [46]:
# Twenty random samples to be binned.
data = np.random.rand(20)
# An integer bin count asks cut for that many equal-length bins spanning the data;
# precision=2 limits the displayed bin edges to two decimal digits.
pd.cut(data, bins=4, precision=2)

[(0.73, 0.93], (0.54, 0.73], (0.35, 0.54], (0.73, 0.93], (0.35, 0.54], ..., (0.16, 0.35], (0.73, 0.93], (0.16, 0.35], (0.35, 0.54], (0.54, 0.73]]
Length: 20
Categories (4, interval[float64]): [(0.16, 0.35] < (0.35, 0.54] < (0.54, 0.73] < (0.73, 0.93]]

A closely related function, qcut, bins the data based on sample quantiles. Depending on the distribution of the data, using cut will not usually result in each bin having the same number of data points. Since qcut uses sample quantiles instead, by definition you will obtain roughly equal-size bins:

In [47]:
# 1000 normally distributed samples.
data = np.random.randn(1000)
# q=4 cuts at the sample quartiles, so each bin receives the same number of points.
cats = pd.qcut(data, q=4)
cats

[(-3.263, -0.598], (0.696, 3.219], (-3.263, -0.598], (0.0387, 0.696], (0.0387, 0.696], ..., (0.696, 3.219], (-0.598, 0.0387], (-0.598, 0.0387], (-0.598, 0.0387], (-0.598, 0.0387]]
Length: 1000
Categories (4, interval[float64]): [(-3.263, -0.598] < (-0.598, 0.0387] < (0.0387, 0.696] < (0.696, 3.219]]

In [48]:
# Count the number of points in each quantile bin.
# The top-level pd.value_counts function is deprecated (pandas 2.1); wrapping the
# Categorical in a Series and calling the method is the supported spelling.
pd.Series(cats).value_counts()

(0.696, 3.219]      250
(0.0387, 0.696]     250
(-0.598, 0.0387]    250
(-3.263, -0.598]    250
dtype: int64

Detecting and Filtering Outliers

In [49]:
# 1000 rows x 4 columns of normally distributed samples.
data = pd.DataFrame(np.random.randn(1000, 4))
# Suppose you wanted to find values in one of the columns exceeding 3 in absolute value:
# take the column labelled 2 and keep only the entries whose magnitude is above 3.
col = data[2]
col[col.abs() > 3]

343    3.373898
793   -3.458345
924    3.266805
Name: 2, dtype: float64

In [50]:
# To select all rows having a value exceeding 3 or -3, use the any method on a
# boolean DataFrame: a row is kept when at least one of its columns is flagged.
# The axis is passed by keyword because pandas 2.0 made the arguments of
# DataFrame.any keyword-only (a positional 1 raises TypeError there).
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
268,-1.383676,3.203232,0.61196,0.054681
343,-0.373712,0.763817,3.373898,0.215876
358,3.785225,0.964449,0.245216,0.064061
384,-0.459577,3.162779,0.146343,-0.319602
444,-3.887485,-1.699446,0.852773,0.340675
793,0.155216,0.026443,-3.458345,-0.17493
805,-1.530695,1.717046,0.689591,-3.297464
833,-1.929881,0.303239,-0.626877,3.093361
924,-0.378871,0.565909,3.266805,-1.632484
991,-1.086323,-2.424779,-0.553797,-3.967071


Values can be set based on these criteria. Here is code to cap values outside the interval –3 to 3

In [51]:
# Cap values outside the interval -3 to 3: only the cells selected by the boolean
# mask are overwritten, each with 3 carrying the sign of the original value.
data[np.abs(data) > 3] = np.sign(data) * 3
# Summary statistics; min and max show the effect of the cap.
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.016324,0.012698,0.035907,-0.049975
std,0.968398,0.992285,1.019445,0.988817
min,-3.0,-2.784927,-3.0,-3.0
25%,-0.667595,-0.664955,-0.636969,-0.707024
50%,-0.011945,0.015601,0.070015,-0.004472
75%,0.692635,0.711759,0.709384,0.610469
max,3.0,3.0,3.0,3.0


The statement np.sign(data) produces 1 and –1 values based on whether the values in data are positive or negative

In [52]:
# Show the sign (1.0 or -1.0) of each value for the first rows of data.
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,-1.0
1,-1.0,-1.0,1.0,-1.0
2,-1.0,-1.0,1.0,-1.0
3,-1.0,1.0,1.0,-1.0
4,-1.0,1.0,-1.0,-1.0


Permutation and Random Sampling

In [53]:
# 5 x 4 frame holding the integers 0..19.
df = pd.DataFrame(np.arange(20).reshape(5, 4))
# A random ordering of the row positions 0..4.
sampler = np.random.permutation(5)
# take returns the rows in the order given by sampler.
df.take(sampler)

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
2,8,9,10,11
4,16,17,18,19


In [54]:
# To select a random subset without replacement, you can use the sample method
# on Series and DataFrame.
df.sample(n=3)  # three randomly chosen rows

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
2,8,9,10,11


Computing Indicator/Dummy Variables

Another type of transformation for statistical modeling or machine learning applications is converting a categorical variable into a “dummy” or “indicator” matrix. If a column in a DataFrame has k distinct values, you would derive a matrix or DataFrame with k columns containing all 1s and 0s. pandas has a get_dummies function for doing this

In [55]:
# Small frame with a categorical 'key' column and a running counter in 'data1'.
df = pd.DataFrame(
    {
        'key': ['b', 'b', 'a', 'c', 'a', 'b'],
        'data1': range(6),
    }
)
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [56]:
# One indicator column per distinct value found in df['key'].
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In some cases, you may want to add a prefix to the columns in the indicator DataFrame, which can then be merged with the other data. get_dummies has a prefix argument for doing this:

In [57]:
# prefix='key' names the indicator columns key_a, key_b, key_c.
dummies = pd.get_dummies(df['key'], prefix='key')
# Select data1 as a one-column DataFrame (double brackets) and attach the indicator columns.
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


If a row in a DataFrame belongs to multiple categories, things are a bit more complicated. Let’s look at the MovieLens 1M dataset:

In [58]:
# Column names for the file, which is read with header=None (no header row of its own).
mnames = ['movie_id', 'title', 'genres']
# The fields are separated by the two-character string '::'. Separators longer than
# one character are not handled by the default C parser, so the Python engine is
# requested explicitly; otherwise pandas falls back to it and emits a ParserWarning.
movies = pd.read_table(
    'movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)
movies[:10]

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


Adding indicator variables for each genre requires a little bit of wrangling. First, we extract the list of unique genres in the dataset

In [59]:
# Gather every genre mentioned in any movie; each genres value is a '|'-separated string.
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))
# De-duplicate the collected names. The list is converted to an object ndarray first
# because passing a plain list to pd.unique is deprecated (pandas 2.1); the result
# is still an object-dtype array.
genres = pd.unique(np.array(all_genres, dtype=object))
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

One way to construct the indicator DataFrame is to start with a DataFrame of all zeros

In [60]:
# Start from an all-zero matrix with one row per movie and one column per genre.
zero_matrix = np.zeros((len(movies), len(genres)))
# Label the columns with the genre names.
dummies = pd.DataFrame(zero_matrix, columns=genres)

Now, iterate through each movie and set entries in each row of dummies to 1. To do this, we use dummies.columns to compute the column indices for each genre:

In [61]:
# Worked example using the genres string of the movie at index label 0.
gen = movies.genres[0]
gen.split('|')  # value is discarded; only the last expression of the cell is displayed
# Integer positions of those genre names within dummies.columns.
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

Then, we can use .iloc to set values based on these indices

In [62]:
# For every movie (row position i), look up the column positions of its genres
# and set those cells of dummies to 1.
for i, gen in enumerate(movies.genres): 
    # get_indexer maps the genre names to their integer positions in dummies.columns.
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

Then, as before, you can combine this with movies:

In [63]:
# Prefix every indicator column name with 'Genre_' and join the result onto movies.
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]  # first row of the combined frame

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western                                  0
Name: 0, dtype: object

A useful recipe for statistical applications is to combine get_dummies with a discretization function like cut:

In [64]:
# Fix the seed so the ten random values are reproducible.
np.random.seed(12345)
values = np.random.rand(10)
# Five bins of width 0.2 covering 0 to 1.
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
# Bin the values first, then expand the bins into one indicator column each.
binned = pd.cut(values, bins)
pd.get_dummies(binned)

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0
