In [7]:
# Print the raw CSV so the parsed frame in the next cell can be compared with it.
# Plain Python instead of `!type`, which only exists in the Windows shell.
with open('data/demo.csv') as csv_file:
    print(csv_file.read())

id,name,age,gender,grade
1,Steven,20,Male,80
2,Elsa,18,Femail,100
3,John,15,Male,90


In [8]:
import pandas as pd

# Parse the comma-separated demo file with read_csv's default settings.
demo_csv_path = 'data/demo.csv'
df = pd.read_csv(demo_csv_path)
df

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20,Male,80
1,2,Elsa,18,Femail,100
2,3,John,15,Male,90


In [9]:
# Parse demo.table using a single tab as the field delimiter
# (`delimiter` is read_csv's alias for `sep`).
df = pd.read_csv('data/demo.table', delimiter='\t')
df

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20 Male,80,
1,2,Elsa,18 Female 100,,
2,3,John,15 Male,90,
3,4,Sally,20,Female,100.0
4,5,Jack,28 Male,90,
5,6 Rose 18 Female 100,,,,


In [10]:
# A single space is not a usable separator for this file: the rows split into
# differing numbers of fields and the parser gives up. The error is the point
# of this cell, so catch it and show it instead of halting "Run All".
try:
    df = pd.read_csv('data/demo.table', sep=' ')
except pd.errors.ParserError as err:
    print(f'{type(err).__name__}: {err}')
else:
    # `display` is provided by the IPython kernel.
    display(df)

ParserError: Error tokenizing data. C error: Expected 3 fields in line 3, saw 5


In [11]:
# Split on any run of whitespace (spaces and/or tabs).
# Raw string: '\s' in a normal literal is an invalid escape sequence and
# triggers a warning on current Python versions.
df = pd.read_csv('data/demo.table', sep=r'\s+')
df

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20,Male,80
1,2,Elsa,18,Female,100
2,3,John,15,Male,90
3,4,Sally,20,Female,100
4,5,Jack,28,Male,90
5,6,Rose,18,Female,100


### Missing data

In [12]:
from numpy import nan

# NaN cannot live in an integer column. Cast age to float explicitly rather
# than relying on pandas to upcast during assignment (deprecated behaviour).
df['age'] = df['age'].astype(float)

# Blank out age (column position 2) for rows 2-3 and gender (position 3) for row 2.
df.iloc[2:4,2]=nan
df.iloc[2,3]=nan

df

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20.0,Male,80
1,2,Elsa,18.0,Female,100
2,3,John,,,90
3,4,Sally,,Female,100
4,5,Jack,28.0,Male,90
5,6,Rose,18.0,Female,100


In [13]:
# Return a copy without any row that contains at least one missing value
# (axis=0 / how='any' are dropna's defaults, spelled out here).
df.dropna(axis=0, how='any')

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20.0,Male,80
1,2,Elsa,18.0,Female,100
4,5,Jack,28.0,Male,90
5,6,Rose,18.0,Female,100


In [14]:
# Replace every missing value, whatever its column, with the same scalar.
df.fillna(value=0)

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20.0,Male,80
1,2,Elsa,18.0,Female,100
2,3,John,0.0,0,90
3,4,Sally,0.0,Female,100
4,5,Jack,28.0,Male,90
5,6,Rose,18.0,Female,100


In [15]:
# Per-column replacements: columns not listed here are left as they are.
fill_values = {'age': 0, 'gender': 'Unknown'}
df.fillna(value=fill_values)

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20.0,Male,80
1,2,Elsa,18.0,Female,100
2,3,John,0.0,Unknown,90
3,4,Sally,0.0,Female,100
4,5,Jack,28.0,Male,90
5,6,Rose,18.0,Female,100


In [16]:
# Forward-fill: copy the last valid value downwards, at most one row per gap
# (limit=1), so the second of two consecutive NaNs stays missing.
# DataFrame.ffill replaces the deprecated fillna(method='ffill').
df.ffill(limit=1)

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20.0,Male,80
1,2,Elsa,18.0,Female,100
2,3,John,18.0,Female,90
3,4,Sally,,Female,100
4,5,Jack,28.0,Male,90
5,6,Rose,18.0,Female,100


In [17]:
# Fill numeric gaps with the column mean. numeric_only=True skips the string
# columns (name, gender); without it recent pandas raises instead of silently
# dropping them. Non-numeric gaps are left as NaN.
df.fillna(df.mean(numeric_only=True))

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20.0,Male,80
1,2,Elsa,18.0,Female,100
2,3,John,21.0,,90
3,4,Sally,21.0,Female,100
4,5,Jack,28.0,Male,90
5,6,Rose,18.0,Female,100


### Duplicated values

In [18]:
# Add an exact copy of the last row to get a frame with a duplicate.
# DataFrame.append was removed in pandas 2.0; build a one-row frame and concat.
newitem = {'id':6,'name':'Rose', 'age':18.0, 'gender':'Female','grade':100}
df_dup = pd.concat([df, pd.DataFrame([newitem])], ignore_index=True)
df_dup

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20.0,Male,80
1,2,Elsa,18.0,Female,100
2,3,John,,,90
3,4,Sally,,Female,100
4,5,Jack,28.0,Male,90
5,6,Rose,18.0,Female,100
6,6,Rose,18.0,Female,100


In [19]:
# Boolean mask: True for a row that repeats an earlier row
# (keep='first' is the default, so the first occurrence is not flagged).
df_dup.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [20]:
# Return a copy with repeated rows removed, keeping each first occurrence.
df_dup.drop_duplicates(keep='first')

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20.0,Male,80
1,2,Elsa,18.0,Female,100
2,3,John,,,90
3,4,Sally,,Female,100
4,5,Jack,28.0,Male,90
5,6,Rose,18.0,Female,100


In [21]:
# Only compare the listed columns when deciding which rows are duplicates.
key_columns = ['age', 'gender']
df_dup.drop_duplicates(subset=key_columns)

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20.0,Male,80
1,2,Elsa,18.0,Female,100
2,3,John,,,90
3,4,Sally,,Female,100
4,5,Jack,28.0,Male,90


### Data Transform

In [22]:
# Encode gender as 1 for 'Female' and 0 for everything else (missing values
# compare unequal, so they also become 0). Returns a new Series; df is unchanged.
(df['gender'] == 'Female').astype('int64')

0    0
1    1
2    0
3    1
4    0
5    1
Name: gender, dtype: int64

In [23]:
# Redisplay df: the previous cell only returned a new Series, so the gender
# column still holds the original strings.
df

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20.0,Male,80
1,2,Elsa,18.0,Female,100
2,3,John,,,90
3,4,Sally,,Female,100
4,5,Jack,28.0,Male,90
5,6,Rose,18.0,Female,100


In [24]:
# Store the 0/1 encoding back into the frame: 1 for 'Female', 0 otherwise
# (a missing gender also ends up as 0).
is_female = df['gender'].eq('Female')
df['gender'] = is_female.astype('int64')
df

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20.0,0,80
1,2,Elsa,18.0,1,100
2,3,John,,0,90
3,4,Sally,,1,100
4,5,Jack,28.0,0,90
5,6,Rose,18.0,1,100


In [25]:
# Reload the table to discard the edits made by the earlier cells.
# Raw string: '\s' in a normal literal is an invalid escape sequence.
df = pd.read_csv('data/demo.table', sep=r'\s+')

# Series.map with a dict: look each score up in the mapping;
# a score that is not a key of the mapping becomes NaN.
grade_mapper = {
    100:'A+',
    90:'A',
    80:'B',
    70:'C'
}
df_graded = df.grade.map(grade_mapper)
df_graded

0     B
1    A+
2     A
3    A+
4     A
5    A+
Name: grade, dtype: object

In [26]:
# Overwrite the numeric grade column with the letter grades held in df_graded.
df['grade']=df_graded
df

Unnamed: 0,id,name,age,gender,grade
0,1,Steven,20,Male,B
1,2,Elsa,18,Female,A+
2,3,John,15,Male,A
3,4,Sally,20,Female,A+
4,5,Jack,28,Male,A
5,6,Rose,18,Female,A+


In [27]:
# Multiply every id by ten; returns a new Series and leaves df unchanged.
df['id'] * 10

0    10
1    20
2    30
3    40
4    50
5    60
Name: id, dtype: int64

In [28]:
# Collapse 'A+' into 'A'; other grades pass through unchanged.
df['grade'].replace(to_replace='A+', value='A')

0    B
1    A
2    A
3    A
4    A
5    A
Name: grade, dtype: object

In [29]:
# Several source values can be replaced by one target in a single call:
# any age equal to 0 or 20 becomes 20.
ages_to_replace = [0, 20]
df['age'].replace(to_replace=ages_to_replace, value=20)

0    20
1    18
2    15
3    20
4    28
5    18
Name: age, dtype: int64

In [30]:
# Upper-case every column label using the vectorised string accessor.
# (Row labels can be transformed the same way through df.index.)
df.columns = df.columns.str.upper()
df

Unnamed: 0,ID,NAME,AGE,GENDER,GRADE
0,1,Steven,20,Male,B
1,2,Elsa,18,Female,A+
2,3,John,15,Male,A
3,4,Sally,20,Female,A+
4,5,Jack,28,Male,A
5,6,Rose,18,Female,A+


In [31]:
# Show the frame again; the column labels are now upper-case.
df

Unnamed: 0,ID,NAME,AGE,GENDER,GRADE
0,1,Steven,20,Male,B
1,2,Elsa,18,Female,A+
2,3,John,15,Male,A
3,4,Sally,20,Female,A+
4,5,Jack,28,Male,A
5,6,Rose,18,Female,A+


In [32]:
# rename relabels selected row and column labels and returns a new frame;
# labels missing from the mappings are kept as they are.
index_labels = {5: 55555}
column_labels = {'GRADE': 'GRA'}
df.rename(index=index_labels, columns=column_labels)

Unnamed: 0,ID,NAME,AGE,GENDER,GRA
0,1,Steven,20,Male,B
1,2,Elsa,18,Female,A+
2,3,John,15,Male,A
3,4,Sally,20,Female,A+
4,5,Jack,28,Male,A
55555,6,Rose,18,Female,A+


### Discretization and Binning

In [33]:
# NOTE(review): `json` is not referenced by any cell visible here; pandas does
# the parsing itself. Confirm nothing else needs it before removing the import.
import json
# Load the x/y/z sample data into a DataFrame.
df = pd.read_json('data/acc-demo1.json')
df

Unnamed: 0,x,y,z
0,0.0560,-0.6716,-0.7525
1,0.0778,-0.6766,-0.7276
2,0.0673,-0.6601,-0.7863
3,0.0464,-0.6532,-0.7278
4,0.0354,-0.6551,-0.7671
...,...,...,...
17890,0.0333,-0.7142,-0.6958
17891,0.0472,-0.7123,-0.6898
17892,0.0469,-0.7097,-0.6989
17893,0.0397,-0.7109,-0.6955


In [34]:
# Explicit bin edges: seven edges define six right-closed intervals (a, b].
bins = [-3, -1, -0.5, 0, 0.5, 1, 3]
# Label each z value with the interval it falls into.
interval = pd.cut(df['z'], bins=bins)
interval

0        (-1.0, -0.5]
1        (-1.0, -0.5]
2        (-1.0, -0.5]
3        (-1.0, -0.5]
4        (-1.0, -0.5]
             ...     
17890    (-1.0, -0.5]
17891    (-1.0, -0.5]
17892    (-1.0, -0.5]
17893    (-1.0, -0.5]
17894    (-1.0, -0.5]
Name: z, Length: 17895, dtype: category
Categories (6, interval[float64]): [(-3.0, -1.0] < (-1.0, -0.5] < (-0.5, 0.0] < (0.0, 0.5] < (0.5, 1.0] < (1.0, 3.0]]

In [35]:
# Count how many samples landed in each bin, most populated first.
# The Series method replaces the deprecated top-level pd.value_counts.
interval.value_counts()

(-1.0, -0.5]    17762
(-3.0, -1.0]      103
(-0.5, 0.0]        30
(1.0, 3.0]          0
(0.5, 1.0]          0
(0.0, 0.5]          0
Name: z, dtype: int64

In [36]:
# Passing an integer number of bins instead of explicit edges makes cut
# compute equal-width bins spanning the minimum to the maximum of the data;
# precision=2 limits the edge labels to two decimal places.
interval = pd.cut(df['z'], bins=10, precision=2)
interval

0        (-0.83, -0.73]
1        (-0.83, -0.73]
2        (-0.83, -0.73]
3        (-0.83, -0.73]
4        (-0.83, -0.73]
              ...      
17890    (-0.73, -0.63]
17891    (-0.73, -0.63]
17892    (-0.73, -0.63]
17893    (-0.73, -0.63]
17894    (-0.73, -0.63]
Name: z, Length: 17895, dtype: category
Categories (10, interval[float64]): [(-1.33, -1.23] < (-1.23, -1.13] < (-1.13, -1.03] < (-1.03, -0.93] ... (-0.73, -0.63] < (-0.63, -0.53] < (-0.53, -0.43] < (-0.43, -0.32]]

In [37]:
# Samples per equal-width bin, most populated first.
# The Series method replaces the deprecated top-level pd.value_counts.
interval.value_counts()

(-0.83, -0.73]    11097
(-0.73, -0.63]     3837
(-1.03, -0.93]     2142
(-0.93, -0.83]      453
(-0.63, -0.53]      269
(-1.13, -1.03]       41
(-0.53, -0.43]       34
(-0.43, -0.32]       10
(-1.23, -1.13]        7
(-1.33, -1.23]        5
Name: z, dtype: int64

In [38]:
# Cut by quantiles: four bins chosen so each holds roughly the same number
# of samples (quartiles), rather than having equal width.
interval = pd.qcut(df.z, 4)
# The Series method replaces the deprecated top-level pd.value_counts.
interval.value_counts()

(-1.3319999999999999, -0.773]    4524
(-0.756, -0.729]                 4502
(-0.729, -0.325]                 4439
(-0.773, -0.756]                 4430
Name: z, dtype: int64

### Sampling

In [39]:
# Draw ten rows at random without replacement. The fixed random_state makes
# the selection identical on every run, so the displayed rows are reproducible.
df.sample(n=10, random_state=42)

Unnamed: 0,x,y,z
16931,0.0371,-0.7073,-0.7053
12629,-0.1828,-0.3046,-0.9079
4867,-0.1091,-0.6359,-0.7331
11492,-0.0559,-0.1822,-0.968
11646,-0.0416,-0.209,-0.9579
6797,-0.1122,-0.6506,-0.7293
988,0.022,-0.6482,-0.7463
16657,0.0318,-0.6812,-0.7214
12506,-0.0277,-0.2076,-0.9661
3436,0.0294,-0.6417,-0.7578


### Permutation

In [40]:
import numpy as np

# A seeded generator gives the same shuffled order on every run.
rng = np.random.default_rng(42)
# Random ordering of the row positions 0 .. len(df)-1.
neworder = rng.permutation(len(df))
neworder

array([ 7633, 13515,  4237, ..., 10479, 11272,  3951])

In [41]:
# Reorder the rows by position using the shuffled positions in neworder.
df.iloc[neworder]

Unnamed: 0,x,y,z
7633,-0.1203,-0.6561,-0.7233
13515,-0.0327,-0.6245,-0.7762
4237,0.0029,-0.6371,-0.7544
3817,0.0132,-0.6292,-0.7736
13097,-0.0394,-0.6122,-0.7757
...,...,...,...
17740,0.0385,-0.7138,-0.6922
11047,-0.0643,-0.1829,-0.9632
10479,0.2095,-0.2648,-0.9436
11272,-0.0562,-0.1925,-0.9574


### Outliers

In [42]:
# NOTE(review): `json` is not referenced by any cell visible here, and this
# import repeats an earlier cell. Confirm before removing.
import json
# Reload the x/y/z sample data for the outlier examples.
df = pd.read_json('data/acc-demo1.json')
df

Unnamed: 0,x,y,z
0,0.0560,-0.6716,-0.7525
1,0.0778,-0.6766,-0.7276
2,0.0673,-0.6601,-0.7863
3,0.0464,-0.6532,-0.7278
4,0.0354,-0.6551,-0.7671
...,...,...,...
17890,0.0333,-0.7142,-0.6958
17891,0.0472,-0.7123,-0.6898
17892,0.0469,-0.7097,-0.6989
17893,0.0397,-0.7109,-0.6955


In [43]:
# Per-column summary (count, mean, std, min, quartiles, max) as a first look
# at the value ranges before hunting for outliers.
df.describe()

Unnamed: 0,x,y,z
count,17895.0,17895.0,17895.0
mean,-0.037648,-0.593152,-0.771278
std,0.073596,0.15743,0.085195
min,-0.5219,-0.9041,-1.3307
25%,-0.0993,-0.6582,-0.7731
50%,-0.0315,-0.6424,-0.7558
75%,0.0168,-0.6249,-0.7293
max,0.6053,-0.0606,-0.3249


In [44]:
import numpy as np

# Select the z readings whose magnitude exceeds 1.
beyond_unit = np.abs(df['z']) > 1
df.loc[beyond_unit, 'z']

4620    -1.0010
4625    -1.0688
5046    -1.0214
5834    -1.0051
5874    -1.0163
          ...  
12918   -1.0582
12988   -1.2594
12989   -1.1464
15237   -1.2280
15279   -1.0422
Name: z, Length: 103, dtype: float64

In [45]:
# Keep every row in which at least one of the columns exceeds 0.7 in magnitude.
row_has_large_value = (df.abs() > 0.7).any(axis=1)
df.loc[row_has_large_value]

Unnamed: 0,x,y,z
0,0.0560,-0.6716,-0.7525
1,0.0778,-0.6766,-0.7276
2,0.0673,-0.6601,-0.7863
3,0.0464,-0.6532,-0.7278
4,0.0354,-0.6551,-0.7671
...,...,...,...
17890,0.0333,-0.7142,-0.6958
17891,0.0472,-0.7123,-0.6898
17892,0.0469,-0.7097,-0.6989
17893,0.0397,-0.7109,-0.6955


In [46]:
# Element-wise mask: True wherever a value's magnitude is greater than 0.7.
df.abs() > 0.7

Unnamed: 0,x,y,z
0,False,False,True
1,False,False,True
2,False,False,True
3,False,False,True
4,False,False,True
...,...,...,...
17890,False,True,False
17891,False,True,False
17892,False,True,False
17893,False,True,False


In [47]:
# Reduce down the rows (axis=0): one flag per column, True if any value in that
# column exceeds 0.7 in magnitude. axis is passed by keyword because recent
# pandas no longer accepts it positionally for DataFrame.any.
(np.abs(df)>0.7).any(axis=0)

x    False
y     True
z     True
dtype: bool

In [48]:
# Reduce across the columns (axis=1): one flag per row, True if any value in
# that row exceeds 0.7 in magnitude. axis is passed by keyword because recent
# pandas no longer accepts it positionally for DataFrame.any.
(np.abs(df)>0.7).any(axis=1)

0        True
1        True
2        True
3        True
4        True
         ... 
17890    True
17891    True
17892    True
17893    True
17894    True
Length: 17895, dtype: bool

In [49]:
# Compare each value with the 75th percentile of its own column: the quantile
# Series is aligned on the column labels, so every column gets its own threshold.
upper_quartile = df.quantile(0.75)
df.lt(upper_quartile)

Unnamed: 0,x,y,z
0,False,True,True
1,False,True,False
2,False,True,True
3,False,True,False
4,False,True,True
...,...,...,...
17890,False,True,False
17891,False,True,False
17892,False,True,False
17893,False,True,False


In [50]:
# Standard deviation of each column.
df.std()

x    0.073596
y    0.157430
z    0.085195
dtype: float64

In [51]:
# Mean of each column.
df.mean()

x   -0.037648
y   -0.593152
z   -0.771278
dtype: float64