# Data Formatting

![gif](imgs/DF004.gif)

## Import

In [3]:
import pandas as pd
import numpy as np

## Data exceptions (outliers)

In [5]:
np.random.seed(12345)  # fix NumPy's global RNG so the outputs below are reproducible
n_rows, n_cols = 1000, 4
df = pd.DataFrame(np.random.randn(n_rows, n_cols))  # standard-normal draws, default column labels 0..3
df.describe()  # per-column summary statistics

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [7]:
c3 = df[3]  # the column labelled 3, as a Series
c3.head(n=5)  # preview the first five values

0   -0.555730
1    0.281746
2   -1.296221
3    0.886429
4   -0.438570
Name: 3, dtype: float64

In [8]:
c3[c3.abs() > 3]  # keep only the values whose magnitude exceeds 3

97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

In [13]:
row_has_outlier = (np.abs(df) > 3).any(axis=1)  # pass `axis` by keyword: the positional form was removed in pandas 2.0
df[row_has_outlier]  # rows where at least one column exceeds 3 in magnitude

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [15]:
too_large = np.abs(df) > 3  # boolean mask of the entries outside [-3, 3]
df[too_large] = np.sign(df) * 3  # cap each flagged entry at -3 or +3, keeping its sign
df.describe()  # min/max are now bounded by -3 and 3

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067623,0.068473,0.025153,-0.002081
std,0.995485,0.990253,1.003977,0.989736
min,-3.0,-3.0,-3.0,-3.0
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.0,2.653656,3.0,3.0


## Shuffling and random sampling

### permutation

In [20]:
grid = np.arange(5 * 4).reshape((5, 4))  # integers 0..19 laid out as 5 rows of 4
df = pd.DataFrame(grid)
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [21]:
sampler = np.random.permutation(len(df))  # random ordering of df's row positions; sized from df rather than a hard-coded 5
sampler

array([1, 3, 4, 0, 2])

In [22]:
df.take(sampler, axis=0)  # select rows by position, in the order given by sampler

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3
2,8,9,10,11


In [23]:
shuffled = np.random.permutation(len(df))  # fresh random ordering of all row positions
df.take(shuffled[:3])  # first three positions -> 3 rows sampled without replacement

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
0,0,1,2,3


### randint

In [25]:
bag = np.array([5, 6, -1, 6, 4])  # pool of values to sample from

n_draws = 10
sampler = np.random.randint(low=0, high=len(bag), size=n_draws)  # positions into bag, drawn with replacement
sampler

array([0, 1, 2, 2, 3, 2, 1, 2, 0, 4])

In [26]:
draws = bag[sampler]  # look up the bag value at each sampled position
draws

array([ 5,  6, -1, -1,  6, -1,  6, -1,  5,  4])

## Computing indicator (dummy) variables

### get_dummies

In [27]:
df = pd.DataFrame(
    {
        'key': ['b', 'b', 'a', 'c', 'a', 'b'],
        'data1': range(6),
    }
)
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [28]:
pd.get_dummies(df['key'], dtype=int)  # one 0/1 indicator column per distinct key; dtype pinned because pandas >= 2.0 defaults to bool

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [32]:
d = pd.get_dummies(df['key'], prefix='key', dtype=int)  # columns named key_<value>; dtype pinned to 0/1 ints (pandas >= 2.0 defaults to bool)
d

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [33]:
data_cols = df[['data1']]  # double brackets keep a one-column DataFrame rather than a Series
df_w_dummy = data_cols.join(d)  # attach the indicator columns, aligned on the index
df_w_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


### cut

In [34]:
n_values = 10
val = np.random.rand(n_values)  # uniform draws from [0, 1)
val

array([0.15314009, 0.5950288 , 0.38508854, 0.44740483, 0.32410284,
       0.92360521, 0.51923163, 0.53105727, 0.17332397, 0.34263784])

In [37]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]  # edges of five equal-width bins covering 0..1

c = pd.cut(val, bins=bins)  # label each value with the right-closed interval that contains it
c

[(0.0, 0.2], (0.4, 0.6], (0.2, 0.4], (0.4, 0.6], (0.2, 0.4], (0.8, 1.0], (0.4, 0.6], (0.4, 0.6], (0.0, 0.2], (0.2, 0.4]]
Categories (5, interval[float64]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

In [38]:
pd.get_dummies(c, dtype=int)  # one 0/1 indicator column per interval; dtype pinned because pandas >= 2.0 defaults to bool

   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0           1           0           0           0           0
1           0           0           1           0           0
2           0           1           0           0           0
3           0           0           1           0           0
4           0           1           0           0           0
5           0           0           0           0           1
6           0           0           1           0           0
7           0           0           1           0           0
8           1           0           0           0           0
9           0           1           0           0           0

## String manipulations

### split

In [39]:
val = 'a,b, guido'
val.split(sep=',')  # split on commas only; whitespace around the pieces is kept

['a', 'b', ' guido']

In [40]:
res = list(map(str.strip, val.split(',')))  # split on commas, then trim whitespace from every piece
res

['a', 'b', 'guido']

#### hard way

In [41]:
one, two, three = res  # unpacking requires res to hold exactly three items
one + '::' + two + '::' + three  # manual concatenation; every piece and separator is written out by hand

'a::b::guido'

#### easy way

In [42]:
separator = '::'
separator.join(res)  # place the separator between consecutive items of res

'a::b::guido'

### in

In [44]:
'guido' in val  # substring membership test on the raw string

True

### index

In [45]:
val.index(',')  # position of the first ','

1

In [49]:
try:
    val.index('4')  # str.index raises ValueError when the substring is absent
except ValueError:
    # Caught on purpose: the exception is the behaviour being demonstrated.
    print('ValueError')

ValueError


### find

In [50]:
val.find(',')  # position of the first ',' (same answer as index when the substring exists)

1

In [51]:
val.find('4')  # unlike index, find reports a missing substring as -1 instead of raising

-1

### count

In [52]:
val.count(',')  # number of occurrences of ','

2

### replace

In [53]:
val.replace(',', '::')  # swap every ',' for '::'; the space before 'guido' is left in place

'a::b:: guido'

In [54]:
val.replace(' ', '').replace(',', '::')  # remove every space first, then swap each ',' for '::'

'a::b::guido'