# Pandas Notes:

## Series and DataFrames:

In [5]:
import numpy as np
import pandas as pd

## Pandas Series:

- A Series is one of the core data types used in pandas.
- Series are used to build DataFrames (and are similar to NumPy arrays).
- Unlike a plain NumPy array, a Series can be labelled (it carries an index).

In [8]:
# Sample inputs reused by the Series examples in this section.
labels = ['a', 'b', 'c']
data = [10, 20, 30]
arr = np.array(data)

# Pair each label with its value. dict(zip(...)) replaces the manual loop and
# avoids leaving stray loop variables in the notebook namespace.
d = dict(zip(labels, data))

d

{'a': 10, 'b': 20, 'c': 30}

In [3]:
pd.Series(data = data)

0    10
1    20
2    30
dtype: int64

In [4]:
pd.Series(data=data, index = labels)

a    10
b    20
c    30
dtype: int64

In [9]:
# Positional form: data first, then index. The output below shows int32 here versus
# int64 for the list-based Series — NOTE(review): presumably the NumPy array's
# platform default integer dtype; confirm on your machine.
pd.Series(arr, labels)

a    10
b    20
c    30
dtype: int32

In [10]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [11]:
# Holding functions in a Series:
# a Series can store arbitrary Python objects, including callables (dtype: object).

def sqr(n):
    """Return n squared."""
    return n ** 2

pd.Series([sum, sqr, len, print])

0                      <built-in function sum>
1    <function <lambda> at 0x000001E9437AE5F0>
2                      <built-in function len>
3                    <built-in function print>
dtype: object

## Grabbing Values/Data from Series:

In [12]:
# Two ranking Series whose indexes only partly overlap:
# 'Japan' appears only in ser1 and 'AUS' only in ser2.
countryNames = ['India', 'USA', 'Canada', 'Russia', 'Japan']
rankings = list(range(1, 6))

ser1 = pd.Series(data=rankings, index=countryNames)
ser2 = pd.Series({'India': 1, 'USA': 2, 'Canada': 3, 'AUS': 5, 'Russia': 4})

In [13]:
ser1

India     1
USA       2
Canada    3
Russia    4
Japan     5
dtype: int64

In [14]:
ser2

India     1
USA       2
Canada    3
AUS       5
Russia    4
dtype: int64

In [15]:
ser1['India']

1

In [17]:
# No index is passed, so the Series gets the default integer index (0, 1, 2, ...);
# the key 0 therefore selects the first element.
ser3  = pd.Series(data=labels)
ser3[0]

'a'

**Performing operations on Series:**

In [18]:
ser1 + ser2

AUS       NaN
Canada    6.0
India     2.0
Japan     NaN
Russia    8.0
USA       4.0
dtype: float64

In [23]:
# Element-wise arithmetic aligns the two Series on their index labels; a label
# missing from either side ('AUS', 'Japan') yields NaN in the result.
# One print call with sep='\n' emits the same text as separate print calls.
print(
    ser1 - ser2,
    '-'*25,
    ser1 * ser2,
    '-'*25,
    ser1 ** ser2,
    sep='\n',
)

AUS       NaN
Canada    0.0
India     0.0
Japan     NaN
Russia    0.0
USA       0.0
dtype: float64
-------------------------
AUS        NaN
Canada     9.0
India      1.0
Japan      NaN
Russia    16.0
USA        4.0
dtype: float64
-------------------------
AUS         NaN
Canada     27.0
India       1.0
Japan       NaN
Russia    256.0
USA         4.0
dtype: float64


In [24]:
# `is` tests object identity (whether both names refer to the same Python object);
# it does not compare the values stored in the two Series.
ser1 is ser2

False

In [25]:
# `is not` is the negated identity test: True because ser1 and ser2 are distinct objects.
ser1 is not ser2

True

## Pandas Data Frames:

In [33]:
np.random.seed(101)

In [35]:
# Creating a DataFrame: a 5 x 4 block of random normal draws with labelled rows and columns.
# NOTE(review): the execution counts jump from In [33] (seeding) to In [35] here, so the
# values displayed may not be the first draw after the seed — re-run from a fresh kernel to confirm.

df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],   # row labels
    columns=['W', 'X', 'Y', 'Z'],      # column labels
)
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [36]:
type(df)

pandas.core.frame.DataFrame

Creating a DataFrame from a dictionary:

In [37]:
# Each dictionary key becomes a column label and its list supplies that column's values;
# the rows get the default integer index.
d = {'A': [4, 5, 6], 'B': [5, 8, 1], 'C': [1, 2, 3]}

pd.DataFrame(data=d)

Unnamed: 0,A,B,C
0,4,5,1
1,5,8,2
2,6,1,3


Indexing and Selection:

In [None]:
df['W'] # col selection

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [None]:
df.loc['A'] # row selection

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

In [54]:
df.loc['B', 'X']

0.39052784273374097

In [55]:
df.loc[
    ['A', 'B'],
    ['X', 'Y']
]

Unnamed: 0,X,Y
A,1.693723,-1.706086
B,0.390528,0.166905


In [53]:
df.iloc[1,2]

0.16690463609281317

In [52]:
df.iloc[[1,2]]

Unnamed: 0,W,X,Y,Z
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646


New Column Addition:

In [45]:
# Assigning to a label that does not exist yet adds a new column to df;
# here it holds the row-wise sum of W, X, Y and Z.
df['new_col'] = df['W'] + df['X'] + df['Y'] + df['Z']
df

Unnamed: 0,W,X,Y,Z,new_col
A,0.302665,1.693723,-1.706086,-1.159119,-0.868817
B,-0.134841,0.390528,0.166905,0.184502,0.607094
C,0.807706,0.07296,0.638787,0.329646,1.849099
D,-0.497104,-0.75407,-0.943406,0.484752,-1.709828
E,-0.116773,1.901755,0.238127,1.996652,4.019761


Dropping / Deleting a Column:

In [46]:
# axis=1 means 'new_col' is looked up among the column labels.
df.drop('new_col', axis=1) # returns a new DataFrame; df itself is not modified

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [48]:
# inplace=True removes the column from df itself, so the cell displays no result.
df.drop('new_col', axis=1, inplace=True) # this modifies the actual df

In [49]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


Dropping / Deleting a ROW:

In [51]:
# No axis is given, so 'E' is looked up among the row labels; the result is a new
# DataFrame without row E (df itself is not modified, since inplace is not set).
df.drop('E')

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


### Conditional Selections & Resetting/setting indexes:

In [56]:
dataset = np.random.randn(5,4)
rows = ['A', 'B', 'C', 'D', 'E']
cols = ['W', 'X', 'Y', 'Z']

In [57]:
df = pd.DataFrame(dataset, rows, cols)
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


Conditional Selection:

In [58]:
df > 0

Unnamed: 0,W,X,Y,Z
A,False,True,False,True
B,True,False,False,True
C,True,False,False,False
D,True,False,True,True
E,False,True,False,True


In [59]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,,0.1968,,0.000366
B,1.025984,,,0.649826
C,2.154846,,,
D,0.147027,,0.558769,1.02481
E,,1.862864,,0.610478


In [61]:
df[df['W'] > 0] 

Unnamed: 0,W,X,Y,Z
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481


In [62]:
# Column X for the rows where W is positive, selected in a single .loc call
# (row mask, column label) instead of chaining two [] lookups.
df.loc[df['W'] > 0, 'X']

B   -0.156598
C   -0.610259
D   -0.479448
Name: X, dtype: float64

In [64]:
# multiple conditions:
# combine boolean Series element-wise with & (and) or | (or); each comparison is
# wrapped in its own parentheses because & and | bind tighter than > in Python.

(df['W'] > 0 ) & (df['Y'] > 1 )

A    False
B    False
C    False
D    False
E    False
dtype: bool

In [66]:
df[(df['W'] > 0 ) | (df['Y'] > 1 )]

Unnamed: 0,W,X,Y,Z
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481


Reset the index to Default:

In [67]:
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [68]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,-0.993263,0.1968,-1.136645,0.000366
1,B,1.025984,-0.156598,-0.031579,0.649826
2,C,2.154846,-0.610259,-0.755325,-0.346419
3,D,0.147027,-0.479448,0.558769,1.02481
4,E,-0.925874,1.862864,-1.133817,0.610478


Set the index:

In [74]:
# One upper-cased state code per existing row, attached to df as a new 'States' column.
# (The bare `new_index` expression was removed: mid-cell it displays nothing,
# because only a cell's last expression is shown.)
new_index = 'ca ny wy or co'.upper().split()

df['States'] = new_index
df

Unnamed: 0,W,X,Y,Z,States
A,-0.993263,0.1968,-1.136645,0.000366,CA
B,1.025984,-0.156598,-0.031579,0.649826,NY
C,2.154846,-0.610259,-0.755325,-0.346419,WY
D,0.147027,-0.479448,0.558769,1.02481,OR
E,-0.925874,1.862864,-1.133817,0.610478,CO


In [76]:
# Make the 'States' column the row index.
# set_index returns a new DataFrame, so rebind df rather than mutating it with inplace=True.

df = df.set_index('States')
df

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,-0.993263,0.1968,-1.136645,0.000366
NY,1.025984,-0.156598,-0.031579,0.649826
WY,2.154846,-0.610259,-0.755325,-0.346419
OR,0.147027,-0.479448,0.558769,1.02481
CO,-0.925874,1.862864,-1.133817,0.610478


Multi-index and Index Hierarchy:

In [91]:
# Two-level (hierarchical) index: each outer group label is paired with an inner number.
outside = 'G1 G1 G1 G2 G2 G2'.split()
inside = [1, 2, 3] * 2
heir_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))
heir_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [92]:
# 6 x 2 array of random values, with the two-level heir_index as row index
# and 'A' / 'B' as column labels.
df = pd.DataFrame(
    np.random.randn(6,2),
    heir_index, 
    ['A', 'B']
)
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-1.38292,1.482495
G1,2,0.961458,-2.141212
G1,3,0.992573,1.192241
G2,1,-1.04678,1.292765
G2,2,-1.467514,-0.494095
G2,3,-0.162535,0.485809


In [93]:
# Grabbing from multi-indexed DataFrame:
# .loc with an outer-level label returns that group's rows, indexed by the inner level.

df.loc['G1']

Unnamed: 0,A,B
1,-1.38292,1.482495
2,0.961458,-2.141212
3,0.992573,1.192241


In [94]:
# A second .loc on the G1 sub-frame picks inner label 1 and returns that row as a Series.
df.loc['G1'].loc[1]

A   -1.382920
B    1.482495
Name: 1, dtype: float64

In [95]:
# Single value at outer label 'G1', inner label 2, column 'B'.
# A tuple key addresses both index levels in one .loc call instead of chaining two.
df.loc[('G1', 2), 'B']

-2.1412122910809264

Naming Indexes:

In [96]:
df.index.names = ['Groups', 'Num']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-1.38292,1.482495
G1,2,0.961458,-2.141212
G1,3,0.992573,1.192241
G2,1,-1.04678,1.292765
G2,2,-1.467514,-0.494095
G2,3,-0.162535,0.485809


Cross Section Method:

- Returns a cross-section (rows or columns) from a Series or DataFrame; mainly useful with a multi-level index.

In [97]:
df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-1.38292,1.482495
2,0.961458,-2.141212
3,0.992573,1.192241


In [98]:
# level='Num' matches the key against the inner index level: this returns the rows
# whose Num is 1 from every group, indexed by Groups.
df.xs(1, level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-1.38292,1.482495
G2,-1.04678,1.292765
