# 07/02

## Pandas Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
from pandas import Series

In [3]:
# assigning with lists; uses default indexes [0,1,2,...]
ser1 = Series([1,2,3,4,5])
ser1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
# assigning with lists and custom indexes
ser2 = Series(['hyderabad','chennai','bangalore','mumbai'], index=['A','B','C','A'])
ser2

A    hyderabad
B      chennai
C    bangalore
A       mumbai
dtype: object

In [5]:
ser2.index

Index(['A', 'B', 'C', 'A'], dtype='object')

In [6]:
ser2.values

array(['hyderabad', 'chennai', 'bangalore', 'mumbai'], dtype=object)

In [7]:
# assigning with numpy arrays; indexes have to be lists
ser3 = Series(np.arange(1,11), index=list('abcdefghij'))
ser3

a     1
b     2
c     3
d     4
e     5
f     6
g     7
h     8
i     9
j    10
dtype: int32

In [8]:
# assigning with dictionary
ser4 = Series({'b':'bangalore', 'c':'chennai', 'd':'delhi', 'h':'hyderabad'})
ser4

b    bangalore
c      chennai
d        delhi
h    hyderabad
dtype: object

In [9]:
ser3 + 10

a    11
b    12
c    13
d    14
e    15
f    16
g    17
h    18
i    19
j    20
dtype: int32

### Selecting Entries in Series

In [10]:
ser3['d'] # grab by index name

4

In [11]:
ser3[3] # grab by index

4

In [12]:
ser3[2:6] # grab by index range

c    3
d    4
e    5
f    6
dtype: int32

In [13]:
ser3[['a','f','g']] # grab by multiple index names

a    1
f    6
g    7
dtype: int32

In [14]:
ser3 > 5

a    False
b    False
c    False
d    False
e    False
f     True
g     True
h     True
i     True
j     True
dtype: bool

In [15]:
ser3[ser3>5] # grab by logic

f     6
g     7
h     8
i     9
j    10
dtype: int32

## Data Frames

In [16]:
from pandas import Series, DataFrame

In [17]:
df1 = DataFrame(np.arange(1,17).reshape(4,4))
df1

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,5,6,7,8
2,9,10,11,12
3,13,14,15,16


In [18]:
print(df1)

    0   1   2   3
0   1   2   3   4
1   5   6   7   8
2   9  10  11  12
3  13  14  15  16


In [19]:
df2 = DataFrame(np.arange(1,26).reshape(5,5), index=list('abcde'), columns=['c1','c2','c3','c4','c5'])
df2

Unnamed: 0,c1,c2,c3,c4,c5
a,1,2,3,4,5
b,6,7,8,9,10
c,11,12,13,14,15
d,16,17,18,19,20
e,21,22,23,24,25


### Selecting Entries in Data Frames

In [20]:
df2['c1'] # grab by column name

a     1
b     6
c    11
d    16
e    21
Name: c1, dtype: int32

In [21]:
df2[['c1','c4','c5']] # grab by multiple column names

Unnamed: 0,c1,c4,c5
a,1,4,5
b,6,9,10
c,11,14,15
d,16,19,20
e,21,24,25


In [22]:
df2[1:3] # grab by index range

Unnamed: 0,c1,c2,c3,c4,c5
b,6,7,8,9,10
c,11,12,13,14,15


In [23]:
df2 > 12

Unnamed: 0,c1,c2,c3,c4,c5
a,False,False,False,False,False
b,False,False,False,False,False
c,False,False,True,True,True
d,True,True,True,True,True
e,True,True,True,True,True


In [24]:
df2['c3'] > 12

a    False
b    False
c     True
d     True
e     True
Name: c3, dtype: bool

In [25]:
df2[df2['c3']>12] # checks condition on c3 and returns all rows where it is True

Unnamed: 0,c1,c2,c3,c4,c5
c,11,12,13,14,15
d,16,17,18,19,20
e,21,22,23,24,25


In [26]:
df2[df2>12]

Unnamed: 0,c1,c2,c3,c4,c5
a,,,,,
b,,,,,
c,,,13.0,14.0,15.0
d,16.0,17.0,18.0,19.0,20.0
e,21.0,22.0,23.0,24.0,25.0


In [27]:
# creating data frames using dictionaries
df1 = DataFrame({'Name':list('ABCDE'), 'Age':np.random.randint(20,25,5)}, index=[10,30,50,70,90])
df1

Unnamed: 0,Name,Age
10,A,23
30,B,24
50,C,23
70,D,24
90,E,22


In [28]:
stock_df = DataFrame({'Company':['TCS','Google','Yahoo','Wipro'], 'Ticker':['TCS','GOOG','YAH','WIP'], 
                      'Price':[100,200,300,400]})
stock_df

Unnamed: 0,Company,Ticker,Price
0,TCS,TCS,100
1,Google,GOOG,200
2,Yahoo,YAH,300
3,Wipro,WIP,400


In [29]:
stock_df[stock_df['Price']>200]

Unnamed: 0,Company,Ticker,Price
2,Yahoo,YAH,300
3,Wipro,WIP,400


### Adding columns

In [30]:
stock_df['Year'] = 2020
stock_df

Unnamed: 0,Company,Ticker,Price,Year
0,TCS,TCS,100,2020
1,Google,GOOG,200,2020
2,Yahoo,YAH,300,2020
3,Wipro,WIP,400,2020


In [31]:
stock_df['Year'] = [2020,2021,2019,2020]
stock_df

Unnamed: 0,Company,Ticker,Price,Year
0,TCS,TCS,100,2020
1,Google,GOOG,200,2021
2,Yahoo,YAH,300,2019
3,Wipro,WIP,400,2020


In [32]:
stock_df['DPrice'] = stock_df['Price']*0.9
stock_df

Unnamed: 0,Company,Ticker,Price,Year,DPrice
0,TCS,TCS,100,2020,90.0
1,Google,GOOG,200,2021,180.0
2,Yahoo,YAH,300,2019,270.0
3,Wipro,WIP,400,2020,360.0


In [33]:
stock_df[['Ticker','Price','DPrice']]

Unnamed: 0,Ticker,Price,DPrice
0,TCS,100,90.0
1,GOOG,200,180.0
2,YAH,300,270.0
3,WIP,400,360.0


In [34]:
stock_df['Head Office'] = Series(['HYD','MUM'], index=[0,2])
stock_df

Unnamed: 0,Company,Ticker,Price,Year,DPrice,Head Office
0,TCS,TCS,100,2020,90.0,HYD
1,Google,GOOG,200,2021,180.0,
2,Yahoo,YAH,300,2019,270.0,MUM
3,Wipro,WIP,400,2020,360.0,


### Deleting Columns

In [35]:
del stock_df['Year']
stock_df

Unnamed: 0,Company,Ticker,Price,DPrice,Head Office
0,TCS,TCS,100,90.0,HYD
1,Google,GOOG,200,180.0,
2,Yahoo,YAH,300,270.0,MUM
3,Wipro,WIP,400,360.0,


### Indexing

In [36]:
df5 = DataFrame(np.arange(1,26).reshape(5,5), index=list('abcde'), columns=list('vwxyz'))
df5

Unnamed: 0,v,w,x,y,z
a,1,2,3,4,5
b,6,7,8,9,10
c,11,12,13,14,15
d,16,17,18,19,20
e,21,22,23,24,25


In [37]:
df5.ix['a']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


v    1
w    2
x    3
y    4
z    5
Name: a, dtype: int32

In [38]:
df5.iloc[1]

v     6
w     7
x     8
y     9
z    10
Name: b, dtype: int32

In [39]:
df5.iloc[1,1]

7

In [40]:
df5.iloc[:,1]

a     2
b     7
c    12
d    17
e    22
Name: w, dtype: int32

In [41]:
df5.iloc[:2]

Unnamed: 0,v,w,x,y,z
a,1,2,3,4,5
b,6,7,8,9,10


In [42]:
df5.iloc[1:4,2:]

Unnamed: 0,x,y,z
b,8,9,10
c,13,14,15
d,18,19,20


In [43]:
df5.loc['b']

v     6
w     7
x     8
y     9
z    10
Name: b, dtype: int32

In [44]:
df5.loc[['b','c','e']]

Unnamed: 0,v,w,x,y,z
b,6,7,8,9,10
c,11,12,13,14,15
e,21,22,23,24,25


In [45]:
df5.loc['b','x']

8

In [46]:
df5.loc[:,'x']

a     3
b     8
c    13
d    18
e    23
Name: x, dtype: int32

### Sorting

In [47]:
score_df = DataFrame({'Name':list('abcdefghij'), 'Score 1':np.random.randint(0,101,10)})
score_df

Unnamed: 0,Name,Score 1
0,a,39
1,b,29
2,c,75
3,d,39
4,e,33
5,f,93
6,g,84
7,h,51
8,i,68
9,j,66


In [48]:
score_df.iloc[:2,1] = 100
score_df

Unnamed: 0,Name,Score 1
0,a,100
1,b,100
2,c,75
3,d,39
4,e,33
5,f,93
6,g,84
7,h,51
8,i,68
9,j,66


In [49]:
score_df['Score 1'].sort_values()

4     33
3     39
7     51
9     66
8     68
2     75
6     84
5     93
0    100
1    100
Name: Score 1, dtype: int32

In [50]:
score_df.sort_values(by='Score 1')

Unnamed: 0,Name,Score 1
4,e,33
3,d,39
7,h,51
9,j,66
8,i,68
2,c,75
6,g,84
5,f,93
0,a,100
1,b,100


In [51]:
score_df.sort_values(by='Score 1', ascending=False)

Unnamed: 0,Name,Score 1
0,a,100
1,b,100
5,f,93
6,g,84
2,c,75
8,i,68
9,j,66
7,h,51
3,d,39
4,e,33


### Priority Sorting

In [52]:
score_df['Score 2'] = np.random.randint(0,101,10)
score_df

Unnamed: 0,Name,Score 1,Score 2
0,a,100,0
1,b,100,40
2,c,75,48
3,d,39,24
4,e,33,0
5,f,93,69
6,g,84,36
7,h,51,70
8,i,68,24
9,j,66,57


In [53]:
score_df.sort_values(by=['Score 1'])

Unnamed: 0,Name,Score 1,Score 2
4,e,33,0
3,d,39,24
7,h,51,70
9,j,66,57
8,i,68,24
2,c,75,48
6,g,84,36
5,f,93,69
0,a,100,0
1,b,100,40


In [54]:
score_df.sort_values(by=['Score 1','Score 2'])

Unnamed: 0,Name,Score 1,Score 2
4,e,33,0
3,d,39,24
7,h,51,70
9,j,66,57
8,i,68,24
2,c,75,48
6,g,84,36
5,f,93,69
0,a,100,0
1,b,100,40


In [55]:
score_df.sort_values(by=['Score 1','Score 2'], ascending=[False,False])

Unnamed: 0,Name,Score 1,Score 2
1,b,100,40
0,a,100,0
5,f,93,69
6,g,84,36
2,c,75,48
8,i,68,24
9,j,66,57
7,h,51,70
3,d,39,24
4,e,33,0


In [56]:
score_df2 = score_df.sort_values(by=['Score 1','Score 2'])
score_df2

Unnamed: 0,Name,Score 1,Score 2
4,e,33,0
3,d,39,24
7,h,51,70
9,j,66,57
8,i,68,24
2,c,75,48
6,g,84,36
5,f,93,69
0,a,100,0
1,b,100,40


### Sorting by Index

In [57]:
score_df2.sort_index()

Unnamed: 0,Name,Score 1,Score 2
0,a,100,0
1,b,100,40
2,c,75,48
3,d,39,24
4,e,33,0
5,f,93,69
6,g,84,36
7,h,51,70
8,i,68,24
9,j,66,57


In [58]:
score_df2.sort_index(ascending=False)

Unnamed: 0,Name,Score 1,Score 2
9,j,66,57
8,i,68,24
7,h,51,70
6,g,84,36
5,f,93,69
4,e,33,0
3,d,39,24
2,c,75,48
1,b,100,40
0,a,100,0


In [59]:
# inplace=True parameter will modify original dataframe
score_df.sort_values(by='Score 1',inplace=True)
score_df

Unnamed: 0,Name,Score 1,Score 2
4,e,33,0
3,d,39,24
7,h,51,70
9,j,66,57
8,i,68,24
2,c,75,48
6,g,84,36
5,f,93,69
0,a,100,0
1,b,100,40


### Rank

In [71]:
df1 = DataFrame({'Name':list('abcdefghij'), 'Score 1':np.random.randint(1,101,10)})
df1

Unnamed: 0,Name,Score 1
0,a,26
1,b,42
2,c,37
3,d,79
4,e,9
5,f,47
6,g,42
7,h,99
8,i,17
9,j,25


In [72]:
df1.iloc[2:4,1] = 110
df1

Unnamed: 0,Name,Score 1
0,a,26
1,b,42
2,c,110
3,d,110
4,e,9
5,f,47
6,g,42
7,h,99
8,i,17
9,j,25


In [73]:
# Averaging method
# for equal scores, it is given the average of their ranks
df1['Rank'] = df1['Score 1'].rank()
df1

Unnamed: 0,Name,Score 1,Rank
0,a,26,4.0
1,b,42,5.5
2,c,110,9.5
3,d,110,9.5
4,e,9,1.0
5,f,47,7.0
6,g,42,5.5
7,h,99,8.0
8,i,17,2.0
9,j,25,3.0


In [74]:
# Averaging method; descending
df1['Rank d'] = df1['Score 1'].rank(ascending=False)
df1

Unnamed: 0,Name,Score 1,Rank,Rank d
0,a,26,4.0,7.0
1,b,42,5.5,5.5
2,c,110,9.5,1.5
3,d,110,9.5,1.5
4,e,9,1.0,10.0
5,f,47,7.0,4.0
6,g,42,5.5,5.5
7,h,99,8.0,3.0
8,i,17,2.0,9.0
9,j,25,3.0,8.0


In [75]:
# Min method for ranking
# for equal scores, it is given the min of their ranks
df1['Rank min'] = df1['Score 1'].rank(method='min')
df1

Unnamed: 0,Name,Score 1,Rank,Rank d,Rank min
0,a,26,4.0,7.0,4.0
1,b,42,5.5,5.5,5.0
2,c,110,9.5,1.5,9.0
3,d,110,9.5,1.5,9.0
4,e,9,1.0,10.0,1.0
5,f,47,7.0,4.0,7.0
6,g,42,5.5,5.5,5.0
7,h,99,8.0,3.0,8.0
8,i,17,2.0,9.0,2.0
9,j,25,3.0,8.0,3.0


In [76]:
# Max method for ranking
# for equal scores, it is given the max of their ranks
df1['Rank max'] = df1['Score 1'].rank(method='max')
df1

Unnamed: 0,Name,Score 1,Rank,Rank d,Rank min,Rank max
0,a,26,4.0,7.0,4.0,4.0
1,b,42,5.5,5.5,5.0,6.0
2,c,110,9.5,1.5,9.0,10.0
3,d,110,9.5,1.5,9.0,10.0
4,e,9,1.0,10.0,1.0,1.0
5,f,47,7.0,4.0,7.0,7.0
6,g,42,5.5,5.5,5.0,6.0
7,h,99,8.0,3.0,8.0,8.0
8,i,17,2.0,9.0,2.0,2.0
9,j,25,3.0,8.0,3.0,3.0


In [77]:
# Dense method
# for equal scores, it is given the min of their ranks but it does not skip the next rank
df1['Dense'] = df1['Score 1'].rank(method='dense')
df1

Unnamed: 0,Name,Score 1,Rank,Rank d,Rank min,Rank max,Dense
0,a,26,4.0,7.0,4.0,4.0,4.0
1,b,42,5.5,5.5,5.0,6.0,5.0
2,c,110,9.5,1.5,9.0,10.0,8.0
3,d,110,9.5,1.5,9.0,10.0,8.0
4,e,9,1.0,10.0,1.0,1.0,1.0
5,f,47,7.0,4.0,7.0,7.0,6.0
6,g,42,5.5,5.5,5.0,6.0,5.0
7,h,99,8.0,3.0,8.0,8.0,7.0
8,i,17,2.0,9.0,2.0,2.0,2.0
9,j,25,3.0,8.0,3.0,3.0,3.0


In [69]:
# First method
# for equal scores, it is given different ranks based on which comes first;
df1['First'] = df1['Score 1'].rank(method='first')
df1

KeyError: 'Score 1'

In [70]:
df1 = DataFrame(np.arange(1,10).reshape(3,3))
df1

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [69]:
df1.iloc[2,2] = np.nan
df1 # all the other values in column turns to float as well

Unnamed: 0,0,1,2
0,1,2,3.0
1,4,5,6.0
2,7,8,


## Reading Data from files