# Day 5 of 100DaysOfMachineLearning

### Pandas

In [1]:
#import pandas
import pandas as pd
import numpy as np

#### series object

In [2]:
# Wrap a plain Python list in a Series; no index is passed, so pandas
# labels the entries with the default integers 0..4.
data = pd.Series(data=[0.2, 0.4, 0.6, 0.8, 1])
data

0    0.2
1    0.4
2    0.6
3    0.8
4    1.0
dtype: float64

In [3]:
# to_numpy() is the accessor the pandas documentation recommends over
# `.values` for getting the Series contents as a NumPy array.
data.to_numpy()

array([0.2, 0.4, 0.6, 0.8, 1. ])

In [4]:
# With the default integer index, 0 is looked up as an index label.
data[0]

0.2

In [5]:
# Build the Series with explicit string labels instead of the default
# integer index.
data = pd.Series(
    data=[0.2, 0.4, 0.6, 0.8, 1],
    index=list('abcde'),
)
data

a    0.2
b    0.4
c    0.6
d    0.8
e    1.0
dtype: float64

In [6]:
# Look up a single value through its string label.
data['b']

0.4

#### data frames

In [7]:
# Two column dictionaries that share the row labels x, y and z.
data1 = dict(x=100, y=200, z=300)
data2 = dict(x='a', y='b', z='c')
# Outer keys name the columns; the inner keys are aligned into the row index.
data = pd.DataFrame(dict(data1=data1, data2=data2))

In [8]:
# Display the DataFrame: one column per outer dict key, one row per inner key.
data

Unnamed: 0,data1,data2
x,100,a
y,200,b
z,300,c


In [9]:
# The row labels of the DataFrame.
data.index

Index(['x', 'y', 'z'], dtype='object')

In [10]:
# The column labels of the DataFrame.
data.columns

Index(['data1', 'data2'], dtype='object')

In [11]:
# creating DataFrame from 2D numpy array
# A seeded Generator is used instead of the unseeded np.random.rand so the
# "random" values are identical on every Restart Kernel -> Run All.
data = pd.DataFrame(
    np.random.default_rng(seed=5).random((3, 2)),
    columns=['col1', 'col2'],
    index=['x', 'y', 'z'],
)
data

Unnamed: 0,col1,col2
x,0.637996,0.176789
y,0.904651,0.798579
z,0.874651,0.521631


In [12]:
# Selecting one column by name returns it as a Series.
data['col1']

x    0.637996
y    0.904651
z    0.874651
Name: col1, dtype: float64

In [13]:
# A single .loc[row, column] lookup replaces the chained data['col1']['x'],
# which first builds an intermediate Series and is the pattern pandas warns
# about when it is used for assignment.
data.loc['x', 'col1']

0.6379958760772527

### Data indexing and selection

#### Data selection in Series

In [14]:
# A small labelled Series used for the selection examples in this section.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('pqrs'),
)
data

p    0.25
q    0.50
r    0.75
s    1.00
dtype: float64

In [15]:
# Dictionary-style lookup: the index label acts as the key.
data['r']

0.75

In [16]:
# `in` tests membership among the index labels, not among the values.
'q' in data

True

In [17]:
# keys() returns the index labels, mirroring dict.keys().
data.keys()

Index(['p', 'q', 'r', 's'], dtype='object')

In [18]:
# Assigning to a label that does not exist yet appends a new entry;
# this modifies `data` in place.
data['t'] = 1.25
data

p    0.25
q    0.50
r    0.75
s    1.00
t    1.25
dtype: float64

In [19]:
# slicing data by explicit index: with labels, the end label 't' is included
data['q':'t']

q    0.50
r    0.75
s    1.00
t    1.25
dtype: float64

In [20]:
# slicing data by implicit integer index: positions 1 and 2 are returned,
# the stop position 3 is excluded (unlike the label slice)
data[1:3]

q    0.50
r    0.75
dtype: float64

In [21]:
# masking: keep only the entries strictly greater than 0.4 and strictly less than 1.1
data[data.gt(0.4) & data.lt(1.1)]

q    0.50
r    0.75
s    1.00
dtype: float64

In [22]:
# fancy index: a list of labels selects exactly those entries, in the listed order
data[['q','s']]

q    0.5
s    1.0
dtype: float64

#### loc and iloc indexers (the older ix indexer was removed in pandas 1.0)

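`loc`: indexing and slicing always refer to the explicit index (the labels).

`iloc`: indexing and slicing always refer to the implicit, zero-based integer position.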

In [23]:
# Integer labels that start at 1, so label and position differ for every
# entry -- this makes the loc / iloc distinction visible.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=[1, 2, 3, 4],
)
data

1    0.25
2    0.50
3    0.75
4    1.00
dtype: float64

In [24]:
# explicit index: .loc looks up the label 1, which is the first entry here
data.loc[1]

0.25

In [25]:
# implicit index: .iloc uses the zero-based position 1, which is the second entry here
data.iloc[1]

0.5

#### Data selection in DataFrame

In [26]:
# Two Series keyed by the same five place labels.
area = pd.Series(dict(place1=412, place2=122, place3=221, place4=120, place5=231))
population = pd.Series(
    dict(place1=19993, place2=32011, place3=9854, place4=7898, place5=39093)
)

# The shared labels become the row index; each Series becomes one column.
data = pd.DataFrame(dict(area=area, population=population))
data

Unnamed: 0,area,population
place1,412,19993
place2,122,32011
place3,221,9854
place4,120,7898
place5,231,39093


In [27]:
# Dictionary-style access: the column name is the key, the result is a Series.
data['area']

place1    412
place2    122
place3    221
place4    120
place5    231
Name: area, dtype: int64

In [28]:
# Attribute-style access to the same column as data['area'].
# NOTE: this shortcut only works when the column name is a valid Python
# identifier that does not collide with an existing DataFrame attribute.
data.area

place1    412
place2    122
place3    221
place4    120
place5    231
Name: area, dtype: int64

In [29]:
# Add a derived column: population divided by area, element by element.
# This modifies `data` in place.
data['density'] = data['population'].div(data['area'])
data

Unnamed: 0,area,population,density
place1,412,19993,48.526699
place2,122,32011,262.385246
place3,221,9854,44.588235
place4,120,7898,65.816667
place5,231,39093,169.233766


In [30]:
# to_numpy() is the accessor the pandas documentation recommends over
# `.values` for getting the frame contents as a 2D NumPy array.
data.to_numpy()

array([[  412.        , 19993.        ,    48.52669903],
       [  122.        , 32011.        ,   262.3852459 ],
       [  221.        ,  9854.        ,    44.58823529],
       [  120.        ,  7898.        ,    65.81666667],
       [  231.        , 39093.        ,   169.23376623]])

In [31]:
# Transposing turns the place labels into columns and the column names into rows.
data.T #transpose of data-frame -- swap rows and columns

Unnamed: 0,place1,place2,place3,place4,place5
area,412.0,122.0,221.0,120.0,231.0
population,19993.0,32011.0,9854.0,7898.0,39093.0
density,48.526699,262.385246,44.588235,65.816667,169.233766


In [32]:
# Index 0 of the 2D array is the first row; to_numpy() is the accessor the
# pandas documentation recommends over `.values`.
data.to_numpy()[0]

array([  412.        , 19993.        ,    48.52669903])

In [33]:
# Row 0, column 1 of the 2D array; to_numpy() is the accessor the pandas
# documentation recommends over `.values`. The whole frame is converted to a
# single array, so the value comes back as a float.
data.to_numpy()[0, 1]

19993.0

In [34]:
# Positional selection: rows at positions 0-1, columns at positions 1-2
# (stop positions are excluded).
data.iloc[:2, 1:3]

Unnamed: 0,population,density
place1,19993,48.526699
place2,32011,262.385246


In [35]:
# Label selection: rows up to and including 'place3', columns from
# 'population' onward (label slices include their end point).
data.loc[:'place3', 'population':]

Unnamed: 0,population,density
place1,19993,48.526699
place2,32011,262.385246
place3,9854,44.588235


In [36]:
# Boolean row mask (density above 100) combined with a label slice of columns.
data.loc[data['density'] > 100, 'population':]

Unnamed: 0,population,density
place2,32011,262.385246
place5,39093,169.233766


In [37]:
# Show the full frame again before one of its values is modified.
data

Unnamed: 0,area,population,density
place1,412,19993,48.526699
place2,122,32011,262.385246
place3,221,9854,44.588235
place4,120,7898,65.816667
place5,231,39093,169.233766


In [38]:
# Positional assignment: row 0 / column 2 is the 'density' value of 'place1'.
# This modifies `data` in place.
data.iloc[0,2] = 20
data

Unnamed: 0,area,population,density
place1,412,19993,20.0
place2,122,32011,262.385246
place3,221,9854,44.588235
place4,120,7898,65.816667
place5,231,39093,169.233766
