# Pandas basics

In [1]:
import numpy as np
import pandas as pd

## Series

In [2]:
# Ten standard-normal draws wrapped in a Series; with no index supplied,
# pandas labels the values 0..9
random_values = np.random.randn(10)
ser = pd.Series(random_values)
ser

0    0.344182
1   -1.448636
2    0.119668
3   -0.999132
4    0.028571
5    0.929146
6    0.093960
7   -0.201939
8    0.544314
9   -0.737950
dtype: float64

In [3]:
# Same idea, but with explicit string labels instead of the default integer index
labels = list('abcde')
ser = pd.Series(np.random.randn(5), index=labels)
ser

a    1.002483
b   -0.122488
c   -2.181720
d   -1.725164
e   -1.271648
dtype: float64

In [4]:
# A dict maps straight onto a Series: keys become the index, values the data
label_to_value = dict(a=1, b=2, c=3, d=4, e=5)
ser = pd.Series(label_to_value)
ser

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [5]:
# Look up a single value by its index label
ser['c']

3

In [6]:
# Draw a larger sample so the summary statistics are more informative
sample_size = 100
ser = pd.Series(np.random.randn(sample_size))
ser.mean()

0.08800904907086612

In [7]:
# Median of the same 100 draws
ser.median()

0.020064299464951703

In [8]:
# Sample variance; Series.var() normalises by N-1 (ddof=1) unless told otherwise
ser.var()

1.0354086510378686

In [9]:
# Summary statistics in one call: count, mean, std, min, quartiles and max
ser.describe()

count    100.000000
mean       0.088009
std        1.017550
min       -3.074751
25%       -0.554583
50%        0.020064
75%        0.955163
max        2.391258
dtype: float64

## DataFrame

In [10]:
# A DataFrame from a 5x4 array of standard-normal draws; rows and columns
# both receive default integer labels
n_rows, n_cols = 5, 4
df = pd.DataFrame(data=np.random.randn(n_rows, n_cols))
df

Unnamed: 0,0,1,2,3
0,-0.170893,-2.723081,1.385888,-0.724211
1,-0.433773,0.387592,-2.265242,-0.438488
2,0.886766,-0.25023,-0.248494,-0.068829
3,-0.266853,0.272128,-0.428734,0.84326
4,1.330033,0.018149,0.12344,1.658537


In [11]:
# Same shape as before, now with explicit row and column labels passed by keyword
row_labels = list('abcde')
col_labels = list('wxyz')
df = pd.DataFrame(np.random.randn(5, 4), index=row_labels, columns=col_labels)
df

Unnamed: 0,w,x,y,z
a,0.507598,0.675126,-1.46641,-0.304392
b,-1.502247,0.808809,1.241736,1.580406
c,1.778099,-0.924317,-0.320147,-0.896273
d,-1.153898,1.14404,-1.450164,0.357539
e,0.240976,-1.134216,0.486867,1.130715


In [12]:
# Fetch a column by name; a single column comes back as a Series
df['w']

a    0.507598
b   -1.502247
c    1.778099
d   -1.153898
e    0.240976
Name: w, dtype: float64

In [13]:
# Select one cell with a single .loc[row, column] lookup rather than chaining
# df['w']['a'], which performs two separate indexing operations
df.loc['a', 'w']

0.5075978100816478

In [14]:
# Fetch a row by its index label; the result is a Series keyed by column name
df.loc['a']

w    0.507598
x    0.675126
y   -1.466410
z   -0.304392
Name: a, dtype: float64

In [15]:
# Fetch a row by integer position (zero-based, so 1 is the second row, 'b')
df.iloc[1]

w   -1.502247
x    0.808809
y    1.241736
z    1.580406
Name: b, dtype: float64

In [16]:
# Conditional selection: a boolean frame used as a mask keeps the entries
# where the condition holds and leaves NaN everywhere else
df[df > 0]

Unnamed: 0,w,x,y,z
a,0.507598,0.675126,,
b,,0.808809,1.241736,1.580406
c,1.778099,,,
d,,1.14404,,0.357539
e,0.240976,,0.486867,1.130715


In [17]:
# Row filter driven by one column: keep only the rows whose 'x' value is positive
x_is_positive = df['x'] > 0
df[x_is_positive]

Unnamed: 0,w,x,y,z
a,0.507598,0.675126,-1.46641,-0.304392
b,-1.502247,0.808809,1.241736,1.580406
d,-1.153898,1.14404,-1.450164,0.357539


In [18]:
# Combine row conditions with & (element-wise AND): keep rows where both
# 'x' and 'y' are positive
x_positive = df['x'] > 0
y_positive = df['y'] > 0
df[x_positive & y_positive]

Unnamed: 0,w,x,y,z
b,-1.502247,0.808809,1.241736,1.580406


In [19]:
# Convert to a NumPy array; to_numpy() is the accessor the pandas docs
# recommend over the older .values attribute
df.to_numpy()

array([[ 0.50759781,  0.67512621, -1.46641036, -0.30439232],
       [-1.50224671,  0.8088093 ,  1.2417359 ,  1.58040585],
       [ 1.77809918, -0.9243172 , -0.32014656, -0.89627339],
       [-1.15389756,  1.14403958, -1.45016364,  0.35753908],
       [ 0.24097561, -1.13421555,  0.48686679,  1.13071539]])

In [20]:
# Hierarchical (two-level) index: every company paired with every date
main_index = ['Google', 'Microsoft']
secondary_index = [
    '2020-01-15',
    '2020-01-14',
    '2020-01-13',
]
hier_index = pd.MultiIndex.from_product(iterables=[main_index, secondary_index])

In [21]:
# One row of values per (company, date) pair, in the same order as hier_index;
# the two columns are labelled with the integers 9 and 10
price_rows = [
    [1435, 1432],
    [1436, 1432],
    [1433, 1434],
    [162, 161],
    [162, 164],
    [162, 166],
]
prices_df = pd.DataFrame(price_rows, index=hier_index, columns=[9, 10])
prices_df

Unnamed: 0,Unnamed: 1,9,10
Google,2020-01-15,1435,1432
Google,2020-01-14,1436,1432
Google,2020-01-13,1433,1434
Microsoft,2020-01-15,162,161
Microsoft,2020-01-14,162,164
Microsoft,2020-01-13,162,166


In [22]:
# Selecting an outer-level label returns the sub-frame indexed by the inner level
prices_df.loc['Google']

Unnamed: 0,9,10
2020-01-15,1435,1432
2020-01-14,1436,1432
2020-01-13,1433,1434


In [23]:
# Column selection works as usual; the resulting Series keeps both index levels
prices_df[9]

Google     2020-01-15    1435
           2020-01-14    1436
           2020-01-13    1433
Microsoft  2020-01-15     162
           2020-01-14     162
           2020-01-13     162
Name: 9, dtype: int64

In [24]:
# Even with a MultiIndex on the rows, a single column is still a Series
type(prices_df[9])

pandas.core.series.Series

In [25]:
# Name the index levels so they can be referenced by name. rename_axis
# returns a new frame carrying a renamed copy of the index; assigning to
# prices_df.index.names instead would rename the existing index object in
# place, and that object may be the very same one bound to hier_index.
prices_df = prices_df.rename_axis(index=['Company', 'Date'])
prices_df

Unnamed: 0_level_0,Unnamed: 1_level_0,9,10
Company,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
Google,2020-01-15,1435,1432
Google,2020-01-14,1436,1432
Google,2020-01-13,1433,1434
Microsoft,2020-01-15,162,161
Microsoft,2020-01-14,162,164
Microsoft,2020-01-13,162,166


In [26]:
# Filter on the inner index level: xs takes the cross-section for a single
# 'Date' value and drops that level from the result
prices_df.xs('2020-01-15', level='Date')

Unnamed: 0_level_0,9,10
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Google,1435,1432
Microsoft,162,161
