# Pandas basics

In [30]:
import numpy as np
import pandas as pd

## Series

In [32]:
# A Series built from a NumPy array; with no index supplied, pandas labels
# the ten values with the default integer index 0..9.
ser = pd.Series(data=np.random.randn(10))
ser

0    0.423386
1   -0.202112
2    0.697944
3    1.018605
4   -0.805657
5   -1.709229
6   -0.166751
7    0.053516
8   -0.683041
9   -0.446597
dtype: float64

In [33]:
# Same idea, but with explicit string labels 'a'..'e' instead of the default integers.
ser = pd.Series(data=np.random.randn(5), index=list('abcde'))
ser

a    1.174303
b    1.035019
c   -0.835369
d   -1.523295
e    0.545093
dtype: float64

In [36]:
# A Series built from a mapping: the keys become the index, the values the data.
ser = pd.Series(dict(zip('abcde', range(1, 6))))
ser

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [38]:
# Look up a single value by its index label.
ser.loc['c']

3

In [43]:
# Draw a larger sample (100 values) and take its arithmetic mean.
ser = pd.Series(data=np.random.randn(100))
ser.mean()

0.12028544352151943

In [44]:
# Median of the sample (the same value describe() reports as the 50% quantile).
ser.median()

0.12471262626167998

In [48]:
# Variance of the sample; pandas defaults to the unbiased estimator (ddof=1),
# so this is the square of the std reported by describe().
ser.var()

0.7527787536105254

In [49]:
# Summary statistics in one call: count, mean, std, min, quartiles and max.
ser.describe()

count    100.000000
mean       0.120285
std        0.867628
min       -2.471970
25%       -0.399500
50%        0.124713
75%        0.783347
max        2.465886
dtype: float64

## DataFrame

In [72]:
# A DataFrame built from a 5x4 NumPy array; rows and columns both get
# the default integer labels.
df = pd.DataFrame(data=np.random.randn(5, 4))
df

Unnamed: 0,0,1,2,3
0,-2.608283,1.488906,-0.704879,1.04868
1,1.433952,0.51005,1.353018,-0.312053
2,-0.699344,-0.219292,-0.955811,1.436855
3,1.161468,0.825604,1.652385,0.728545
4,1.631819,0.295694,-0.323018,0.868905


In [73]:
# Same 5x4 random frame, now with explicit row labels (a..e) and column
# labels (w..z), passed by keyword so their roles are obvious.
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=list('abcde'),
    columns=list('wxyz'),
)
df

Unnamed: 0,w,x,y,z
a,-0.210523,-2.001974,1.945387,-0.448872
b,-1.081107,0.245848,-0.05645,-0.734369
c,-1.035441,1.785701,1.060465,0.691069
d,-0.969721,-0.822723,-0.441322,1.231352
e,-0.46613,-1.320579,-1.120505,0.57671


In [74]:
# Fetch a single column by name; the result is a Series indexed by the row labels
df['w']

a   -0.210523
b   -1.081107
c   -1.035441
d   -0.969721
e   -0.466130
Name: w, dtype: float64

In [75]:
# Fetch a single cell with one .loc[row, column] lookup. This avoids chained
# indexing (df['w']['a']), which does two separate lookups and is the pattern
# that triggers SettingWithCopyWarning once it is used for assignment.
df.loc['a', 'w']

-0.21052261433665737

In [103]:
# Fetch a row by its index label; the result is a Series keyed by column name
df.loc['a']

w   -0.210523
x   -2.001974
y    1.945387
z   -0.448872
Name: a, dtype: float64

In [104]:
# Fetch a row by integer position (zero-based, so 1 is the second row, 'b')
df.iloc[1]

w   -1.081107
x    0.245848
y   -0.056450
z   -0.734369
Name: b, dtype: float64

In [105]:
# Conditional selection with a boolean frame: the shape is kept and every
# entry that fails the condition comes back as missing (NaN)
df[df > 0]

Unnamed: 0,w,x,y,z
a,,,1.945387,
b,,0.245848,,
c,,1.785701,1.060465,0.691069
d,,,,1.231352
e,,,,0.57671


In [107]:
# Row filter: keep only the rows whose 'x' value is positive
df.loc[df['x'] > 0]

Unnamed: 0,w,x,y,z
b,-1.081107,0.245848,-0.05645,-0.734369
c,-1.035441,1.785701,1.060465,0.691069


In [109]:
# Combine row conditions with & (each comparison needs its own parentheses
# because & binds more tightly than >)
df.loc[(df['x'] > 0) & (df['y'] > 0)]

Unnamed: 0,w,x,y,z
c,-1.035441,1.785701,1.060465,0.691069


In [110]:
# Convert to a NumPy array. to_numpy() is the accessor the pandas docs
# recommend over the older .values attribute; for this all-float frame it
# returns the same 5x4 float64 array.
df.to_numpy()

array([[-0.21052261, -2.00197363,  1.94538735, -0.44887235],
       [-1.08110652,  0.24584759, -0.05645045, -0.73436904],
       [-1.03544051,  1.78570126,  1.06046514,  0.69106865],
       [-0.96972062, -0.82272295, -0.44132198,  1.23135242],
       [-0.46613028, -1.32057901, -1.12050538,  0.57670952]])

In [125]:
# Hierarchical (multi-level) index: outer level is the company, inner level the date
main_index = ['Google', 'Microsoft']
secondary_index = ['2020-01-15', '2020-01-14', '2020-01-13']
# from_product pairs every company with every date (the Cartesian product)
pd.MultiIndex.from_product(iterables=[main_index, secondary_index])

MultiIndex(levels=[['Google', 'Microsoft'], ['2020-01-13', '2020-01-14', '2020-01-15']],
           labels=[[0, 0, 0, 1, 1, 1], [2, 1, 0, 2, 1, 0]])

In [None]:
# Build the (company, date) index here so this cell does not rely on a name
# that is never assigned elsewhere in the notebook.
hier_index = pd.MultiIndex.from_product([main_index, secondary_index])
# Six rows (one per company/date pair) by two columns. Supplying the data
# column-wise keeps its shape consistent with the 6-entry index and the
# ['A', 'B'] columns.
# NOTE(review): assumes the two original lists were meant as columns A and B — confirm.
prices_df = pd.DataFrame(
    {
        'A': [1435, 1432, 1436, 162, 163, 164],
        'B': [1433, 1434, 1434, 161, 162, 166],
    },
    index=hier_index,
)
prices_df