# Data Formatting

![gif](imgs/DF003.gif)

## Import

In [1]:
import pandas as pd
import numpy as np

## Data formatting with a function or mapping (VLOOKUP)

In [2]:
# Toy purchase log: a food label and its weight in ounces for each row.
# Row 5 is spelled 'Bacon' with a capital B, unlike the other bacon rows.
df = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon',
        'pastrami', 'beef', 'Bacon',
        'pastrami', 'honey', 'nova',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
df

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey,5.0
8,nova,6.0


In [3]:
# Lookup table (the "VLOOKUP range"): food name -> source animal.
# Every key is lowercase, so a lookup only succeeds on lowercase food names.
dfmeat = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'beef': 'cow',
    'honey': 'pig',
    'nova': 'salmon',
}

### Oops: the lookup is case-sensitive

In [4]:
# Look up each food in dfmeat exactly as it is spelled. The keys of dfmeat are
# all lowercase, so the capitalized 'Bacon' row finds no match and its
# 'animal' value is left missing.
df['animal'] = df['food'].map(dfmeat)
df

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,beef,7.5,cow
5,Bacon,8.0,
6,pastrami,3.0,cow
7,honey,5.0,pig
8,nova,6.0,salmon


### Correct mapping (case-insensitive)

In [5]:
# Normalize the case with the vectorized .str accessor before the lookup, so
# that 'Bacon' matches the lowercase 'bacon' key in dfmeat. Unlike
# .map(str.lower), the accessor passes missing values through as NaN instead
# of raising a TypeError on the first non-string entry.
df['animal'] = df['food'].str.lower().map(dfmeat)
df

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey,5.0,pig
8,nova,6.0,salmon


In [6]:
# A second, independent frame holding the same raw food/ounces data, with no
# 'animal' column yet.
dfnew = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon',
        'pastrami', 'beef', 'Bacon',
        'pastrami', 'honey', 'nova',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
dfnew

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey,5.0
8,nova,6.0


### with lambda

In [7]:
# The same case-insensitive lookup, written as a per-element function.
# dict.get yields NaN for a food that is absent from dfmeat (which is what
# .map(dfmeat) would produce) instead of raising KeyError, and
# na_action='ignore' skips missing food values so that x.lower() is only ever
# called on real strings.
dfnew['animal'] = dfnew['food'].map(
    lambda x: dfmeat.get(x.lower(), np.nan),
    na_action='ignore',
)
dfnew

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey,5.0,pig
8,nova,6.0,salmon


## Value replace

In [8]:
# Measurements in which -999 and -1000 act as sentinel codes rather than
# real values.
s = pd.Series(data=[1, -999, 2, -999, -1000, 3])
s

0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: int64

In [9]:
# Swap the single sentinel -999 for NaN; -1000 is left as it is. The result is
# a new Series, and the NaN values force it to float64.
s.replace(to_replace=-999, value=np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [10]:
# Several sentinels, one replacement: both -999 and -1000 become NaN.
s.replace(to_replace=[-999, -1000], value=np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [11]:
# Pairwise replacement: the lists are matched by position, so -999 -> NaN and
# -1000 -> 0.
s.replace(to_replace=[-999, -1000], value=[np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [12]:
# The same kind of pairwise replacement spelled as a {old: new} dict; here
# both sentinels map to zero.
s.replace(to_replace={-999: 0, -1000: 0.0})

0    1.0
1    0.0
2    2.0
3    0.0
4    0.0
5    3.0
dtype: float64

## Rename axis indexes

In [13]:
# 3x4 frame of the integers 0..11 with lowercase row labels, used to
# demonstrate relabelling of both axes. Note that this rebinds the name df.
df = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=['ohio', 'colorado', 'new york'],
    columns=['one', 'two', 'three', 'four'],
)
df

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
new york,8,9,10,11


In [14]:
# Apply str.upper to every row label. The result is a new Index object;
# df.index itself is not reassigned here.
df.index.map(str.upper)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [15]:
# Relabel both axes with functions: title-case each row label and upper-case
# each column label. The renamed frame is only displayed, not assigned back.
df.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [16]:
# Relabel with dicts instead: only the listed labels ('ohio' and 'three') are
# renamed, every other row and column label is kept unchanged.
df.rename(index={'ohio': 'indiana'}, columns={'three': 'pikaboo'})

Unnamed: 0,one,two,pikaboo,four
indiana,0,1,2,3
colorado,4,5,6,7
new york,8,9,10,11


## Discretization and binning

### first example

In [17]:
# Twelve sample ages to be sorted into age brackets.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [18]:
# Bracket edges: consecutive pairs delimit the four age groups
# 18-25, 25-35, 35-60 and 60-100.
bins = [18, 25, 35, 60, 100]

In [19]:
# Assign every age to the bracket it falls in. Intervals are open on the left
# and closed on the right, so an age of 25 lands in (18, 25].
cats = pd.cut(x=ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [20]:
# Bracket of the element at position 8 (age 61): the interval (60, 100].
cats[8]

Interval(60, 100, closed='right')

In [21]:
# Integer code of the bracket each age belongs to: 0 for (18, 25] up to
# 3 for (60, 100].
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [22]:
# Per-bracket summary: how many ages fall in each interval and what fraction
# of the total that is.
cats.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(18, 25]",5,0.416667
"(25, 35]",3,0.25
"(35, 60]",3,0.25
"(60, 100]",1,0.083333


In [23]:
# NOTE(review): the displayed result is identical to that of cats itself;
# presumably view() returns a new object over the same underlying data --
# confirm whether this cell adds anything to the walkthrough.
cats.view()

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

### second example

In [24]:
# The integers 0 through 21, used as input for equal-width binning.
data = np.arange(0, 22)
data

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21])

In [25]:
# An integer for bins requests that many equal-width intervals spanning the
# range of the data. The lowest edge is pushed slightly below the minimum so
# that the smallest value is still included.
pd.cut(x=data, bins=3)

[(-0.021, 7.0], (-0.021, 7.0], (-0.021, 7.0], (-0.021, 7.0], (-0.021, 7.0], ..., (14.0, 21.0], (14.0, 21.0], (14.0, 21.0], (14.0, 21.0], (14.0, 21.0]]
Length: 22
Categories (3, interval[float64]): [(-0.021, 7.0] < (7.0, 14.0] < (14.0, 21.0]]

In [26]:
# Same three equal-width intervals, with the bin edges reported at zero
# decimal places of precision.
pd.cut(x=data, bins=3, precision=0)

[(-0.0, 7.0], (-0.0, 7.0], (-0.0, 7.0], (-0.0, 7.0], (-0.0, 7.0], ..., (14.0, 21.0], (14.0, 21.0], (14.0, 21.0], (14.0, 21.0], (14.0, 21.0]]
Length: 22
Categories (3, interval[float64]): [(-0.0, 7.0] < (7.0, 14.0] < (14.0, 21.0]]

### qcut

In [27]:
# 200 draws from the standard normal distribution, used as input for
# quantile-based binning. The generator is seeded locally so that a
# Restart & Run All reproduces the same sample (and therefore the same
# quantile edges) on every run, without touching NumPy's global random state.
SEED = 42
data = np.random.default_rng(SEED).standard_normal(200)

In [28]:
# Quantile-based binning: split the sample into four bins that each receive
# the same number of points (quartiles), with edges shown to two decimals.
# This rebinds the name cats.
cats = pd.qcut(x=data, q=4, precision=2)
cats

[(-3.1399999999999997, -0.73], (0.57, 3.14], (0.57, 3.14], (-0.73, -0.058], (-3.1399999999999997, -0.73], ..., (-0.058, 0.57], (-3.1399999999999997, -0.73], (-0.73, -0.058], (-0.058, 0.57], (-0.058, 0.57]]
Length: 200
Categories (4, interval[float64]): [(-3.1399999999999997, -0.73] < (-0.73, -0.058] < (-0.058, 0.57] < (0.57, 3.14]]

In [29]:
# NOTE(review): this cell repeats the preceding cell verbatim (same call, same
# displayed result); it looks like a leftover duplicate and is a candidate for
# removal.
cats = pd.qcut(data, 4, precision=2)
cats

[(-3.1399999999999997, -0.73], (0.57, 3.14], (0.57, 3.14], (-0.73, -0.058], (-3.1399999999999997, -0.73], ..., (-0.058, 0.57], (-3.1399999999999997, -0.73], (-0.73, -0.058], (-0.058, 0.57], (-0.058, 0.57]]
Length: 200
Categories (4, interval[float64]): [(-3.1399999999999997, -0.73] < (-0.73, -0.058] < (-0.058, 0.57] < (0.57, 3.14]]

In [30]:
# Integer bin code (0 through 3) assigned to each of the 200 samples.
cats.codes

array([0, 3, 3, 1, 0, 3, 3, 1, 0, 1, 3, 1, 0, 0, 3, 2, 0, 3, 1, 0, 0, 0,
       1, 0, 1, 3, 3, 2, 2, 0, 3, 3, 1, 0, 2, 1, 3, 2, 0, 1, 2, 0, 1, 2,
       0, 3, 1, 1, 3, 0, 2, 3, 0, 1, 0, 2, 0, 3, 2, 3, 2, 3, 0, 1, 0, 1,
       0, 2, 3, 3, 2, 3, 2, 0, 0, 3, 3, 3, 2, 3, 0, 1, 0, 1, 1, 2, 3, 2,
       1, 0, 3, 2, 1, 0, 2, 0, 2, 3, 2, 2, 1, 2, 3, 3, 1, 3, 0, 2, 3, 1,
       2, 2, 1, 3, 3, 3, 2, 1, 3, 1, 2, 0, 3, 3, 1, 2, 3, 1, 0, 0, 1, 1,
       0, 1, 1, 2, 0, 1, 0, 1, 1, 1, 3, 1, 1, 0, 2, 2, 0, 1, 2, 3, 1, 2,
       1, 0, 0, 0, 0, 0, 0, 3, 0, 2, 1, 2, 2, 0, 1, 1, 0, 2, 3, 3, 2, 2,
       2, 0, 3, 3, 3, 2, 3, 2, 2, 0, 2, 2, 3, 1, 1, 3, 1, 2, 1, 2, 0, 1,
       2, 2], dtype=int8)

In [31]:
# Count how many samples fall in each quantile bin. The top-level
# pd.value_counts function is deprecated in recent pandas releases; wrapping
# the categorical in a Series and calling its value_counts method is the
# supported equivalent.
pd.Series(cats).value_counts()

(0.57, 3.14]                    50
(-0.058, 0.57]                  50
(-0.73, -0.058]                 50
(-3.1399999999999997, -0.73]    50
dtype: int64

In [32]:
# Explicit quantile edges (cumulative fractions running from 0 to 1) instead
# of a bin count, which produces five bins of unequal size.
pd.qcut(x=data, q=[0, 0.1, 0.2, 0.4, 0.9, 1])

[(-3.129, -1.358], (-0.355, 1.161], (-0.355, 1.161], (-0.932, -0.355], (-1.358, -0.932], ..., (-0.355, 1.161], (-3.129, -1.358], (-0.932, -0.355], (-0.355, 1.161], (-0.355, 1.161]]
Length: 200
Categories (5, interval[float64]): [(-3.129, -1.358] < (-1.358, -0.932] < (-0.932, -0.355] < (-0.355, 1.161] < (1.161, 3.144]]