### Operating on Data in Pandas

Ufuncs: (universal function )Index Preservation

In [2]:
import pandas as pd 
import numpy as np 

In [3]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [4]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)),columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [5]:
np.exp(ser) # chuyen sang numpy giu nguyen index


0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [6]:
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


uFuncs (unviersal function)  : index alignment (chi so sap xep)

In [7]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')
print (type(area))
print(population)

<class 'pandas.core.series.Series'>
California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64


In [8]:
population / area  # lay gia ung vs chi so chung chia so nhau, neu ko co thi se la NaN

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [9]:
area.index | population.index


Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [10]:
A = pd.Series([2, 4, 6], index=[0, 1, 2]) #chi lay nhung cai giao cua 2 tap hop
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [11]:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### Index alignment in DataFrame

In [12]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                 columns=list('AB'))
A

Unnamed: 0,A,B
0,1,11
1,5,1


In [13]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                 columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,4,0,9
1,5,8,0
2,9,2,6


In [14]:
A + B

Unnamed: 0,A,B,C
0,1.0,15.0,
1,13.0,6.0,
2,,,


In [15]:
fill = A.stack().mean()

A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,1.0,15.0,13.5
1,13.0,6.0,4.5
2,6.5,13.5,10.5


### Operations Between DataFrame and Series

In [16]:
A = rng.randint(10, size=(3, 4))
A

array([[3, 8, 2, 4],
       [2, 6, 4, 8],
       [6, 1, 3, 8]])

In [17]:
A - A[0] # tru di phan tu 0

array([[ 0,  0,  0,  0],
       [-1, -2,  2,  4],
       [ 3, -7,  1,  4]])

In [18]:
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]
#print(df.iloc[0])

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-1,-2,2,4
2,3,-7,1,4


In [19]:
df.subtract(df['R'], axis=0)

df

Unnamed: 0,Q,R,S,T
0,3,8,2,4
1,2,6,4,8
2,6,1,3,8


In [20]:
halfrow = df.iloc[0, ::2]
halfrow

Q    3
S    2
Name: 0, dtype: int64

In [21]:
df - halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,-1.0,,2.0,
2,3.0,,1.0,


### Handling Missing Data 

In [22]:
import numpy as np 
import pandas as pd 

In [23]:
val1 = np.array([1,None,2 ,3 ,4])
val1

array([1, None, 2, 3, 4], dtype=object)

In [24]:
for dtype in ['object', 'int']: 
    print ('dtype =', dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()

dtype = object
10 loops, best of 3: 53.5 ms per loop

dtype = int
1000 loops, best of 3: 1.68 ms per loop



In [25]:
val1.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [None]:
val2 = np.array([1,np.nan, 3, 4])
val2.dtype

In [None]:
x = pd.Series(range(2), dtype=int)
x

In [None]:
x[0] = None
x

###  Operating on Null Values 

In [None]:
data = pd.Series([1, np.nan, 'hello', None])


In [None]:
data.isnull()


In [None]:
data[data.notnull()]


###  Dropping null values 


In [None]:
data.dropna()

In [None]:
df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])
df

In [None]:
df.dropna()

In [None]:
df.dropna(axis='columns')

In [None]:
df[3] = np.nan
df

In [None]:
df.dropna(axis='columns', how='all')


In [None]:
df.dropna(axis='rows', thresh=3) # gia tri toi thieu non nll values tren rows


###  FIlling null values 

In [None]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

In [None]:
data.fillna(0) # do dl nan = 0 

In [None]:
# forward-fill
data.fillna(method='ffill') #do dl tu trc do ra phia sau 

In [None]:
data.fillna(method='bfill') # do dl ra phia duoi 

In [None]:
df

In [None]:
df.fillna(method='ffill', axis =1 ) # axis 1 theo rows do dl tu trai qua phai vs cell = Nan
# axis = 0 theo col 
# trc do ko co dl thi cell NaN not change

##  Hierarchical Indexing 

dịch là thứ bậc index 
đưa dữ liệu ở chiều cao hơn 3d,4d chuyển về series and dataframe
chúng ta sẽ phân tách, lập chỉ mục, thông kê

In [None]:
import pandas as pd 
import numpy as np 

In [None]:
#A Multiply Indexed Series

In [None]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
pop = pd.Series(populations, index=index)
pop

In [None]:
#print(type(index))
pop[[i for i in pop.index if i[1] == 2010]]


###  The Better Way: Pandas MultiIndex

In [None]:
index = pd.MultiIndex.from_tuples(index)
index

In [None]:
pop = pop.reindex(index)
pop

In [None]:
pop[:, 2010]

### MultiIndex as extra dimension

In [None]:
pop_df = pop.unstack()
pop_df

In [None]:
pop_df.stack()

In [None]:
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
pop_df

In [None]:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

### Methods of MultiIndex Creation

In [None]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df

In [None]:
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        ('New York', 2010): 19378102}
pd.Series(data)

### Explicit MultiIndex constructors

In [None]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])


In [None]:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

In [None]:
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

### MultiIndex level names

In [26]:
pop.index.names = ['state', 'year']
pop

NameError: name 'pop' is not defined

### MultiIndex for columns

In [27]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])

# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37

# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
type(health_data)

pandas.core.frame.DataFrame

In [28]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,33.0,35.6
2013,2,36.0,37.5
2014,1,27.0,36.5
2014,2,37.0,35.3


## Indexing and Slicing a MultiIndex

Multiply indexed Series

In [29]:
pop

NameError: name 'pop' is not defined

In [30]:
pop['California',2000]

NameError: name 'pop' is not defined

In [31]:
pop.loc['California':'New York'] # cho phep slicing theo name col 

NameError: name 'pop' is not defined

In [32]:
pop[1:2] # bt lay theo index

NameError: name 'pop' is not defined

In [33]:
 pop[ : ,2000] # lay theo nam 

NameError: name 'pop' is not defined

In [34]:
pop[pop > 22000000]# lay theo condition values > 

NameError: name 'pop' is not defined

In [35]:
pop[['California', 'Texas']]

NameError: name 'pop' is not defined

###  Multiply indexed DataFrames

In [36]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,37.0,37.5,33.0,35.6,44.0,36.9
2013,2,46.0,38.1,36.0,37.5,40.0,37.5
2014,1,30.0,37.6,27.0,36.5,34.0,38.1
2014,2,40.0,37.6,37.0,35.3,28.0,37.9


In [37]:
health_data['Guido', 'HR'] # chi dich danh col can lay


year  visit
2013  1        33.0
      2        36.0
2014  1        27.0
      2        37.0
Name: (Guido, HR), dtype: float64

In [38]:
print (health_data.iloc[:2, :]) # lay theo index)
print (health_data.iloc[:, :2])# lay theo row)


subject      Bob       Guido         Sue      
type          HR  Temp    HR  Temp    HR  Temp
year visit                                    
2013 1      37.0  37.5  33.0  35.6  44.0  36.9
     2      46.0  38.1  36.0  37.5  40.0  37.5
subject      Bob      
type          HR  Temp
year visit            
2013 1      37.0  37.5
     2      46.0  38.1
2014 1      30.0  37.6
     2      40.0  37.6


In [39]:
health_data.loc[(:), (:,:)]
# errow khi dung loc

SyntaxError: invalid syntax (<ipython-input-39-e848392660e7>, line 1)

In [40]:
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,37.0,33.0,44.0
2014,1,30.0,27.0,34.0


## Rearranging Multi-Indices

Sorted and unsorted indices

In [41]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.904909
      2      0.922173
c     1      0.340727
      2      0.752493
b     1      0.226877
      2      0.057899
dtype: float64

In [42]:
try:
    data['a':'b']
except KeyError as e:
    print(type(e))
    print(e)

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [43]:
data = data.sort_index()
data

char  int
a     1      0.904909
      2      0.922173
b     1      0.226877
      2      0.057899
c     1      0.340727
      2      0.752493
dtype: float64

In [44]:
data['a':'b']

char  int
a     1      0.904909
      2      0.922173
b     1      0.226877
      2      0.057899
dtype: float64

### Stacking and unstacking indices

In [45]:
pop.unstack(level=0)

NameError: name 'pop' is not defined

In [46]:
pop.unstack(level=1)

NameError: name 'pop' is not defined

In [47]:
pop.unstack().stack()

NameError: name 'pop' is not defined

In [48]:
pop_flat = pop.reset_index(name='population')
pop_flat

NameError: name 'pop' is not defined

In [49]:
pop_flat.set_index(['state', 'year'])


NameError: name 'pop_flat' is not defined

In [50]:
health_data


Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,37.0,37.5,33.0,35.6,44.0,36.9
2013,2,46.0,38.1,36.0,37.5,40.0,37.5
2014,1,30.0,37.6,27.0,36.5,34.0,38.1
2014,2,40.0,37.6,37.0,35.3,28.0,37.9


In [51]:
data_mean = health_data.mean(level='year')
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,41.5,37.8,34.5,36.55,42.0,37.2
2014,35.0,37.6,32.0,35.9,31.0,38.0


In [52]:
data_mean.mean(axis=1, level='type')


type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,39.333333,37.183333
2014,32.666667,37.166667


## Aside: Panel Data  