# Python for data science essential training pt. 1
## 2. Data preparation basics

### 2.1 Filtering & selecting data

In [4]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

In [15]:
# Build an 8-element Series holding 0..7, labelled 'row1' through 'row8'
series_obj = Series(
    data=np.arange(8),
    index=['row' + str(position) for position in range(1, 9)],
)
series_obj

row1    0
row2    1
row3    2
row4    3
row5    4
row6    5
row7    6
row8    7
dtype: int32

In [19]:
# Look up a single value by its index label, using the explicit label-based indexer
series_obj.loc['row7']

6

In [22]:
# Select by integer position with .iloc; the inner list picks several positions
# at once (here the first and the last element).
# Plain series_obj[[0, 7]] only works through pandas' fallback from labels to
# positions, which is deprecated for a Series whose index is not integer-typed.
series_obj.iloc[[0, 7]]

row1    0
row8    7
dtype: int32

In [42]:
# Seed NumPy's global random generator so the same values are drawn on every run
np.random.seed(25)

# 6 x 6 frame of random floats with labelled rows and columns
# NOTE(review): the column labels jump from 'column5' to 'column7' - confirm whether 'column6' was intended
df_obj = DataFrame(
    np.random.rand(6, 6),
    index=['row' + str(position) for position in range(1, 7)],
    columns=['column1', 'column2', 'column3', 'column4', 'column5', 'column7'],
)

df_obj

Unnamed: 0,column1,column2,column3,column4,column5,column7
row1,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row2,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
row3,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
row4,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
row5,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
row6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [45]:
# .loc is the label-based indexer: pass a list of row labels, then a list of column labels
df_obj.loc[['row1', 'row2'], ['column1', 'column2']]

Unnamed: 0,column1,column2
row1,0.870124,0.582277
row2,0.684969,0.437611


### Data slicing

#### Label slicing & comparison with scalars

In [46]:
# Slice by index label; unlike a positional slice, the end label ('row7') is included
series_obj.loc['row3':'row7']

row3    2
row4    3
row5    4
row6    5
row7    6
dtype: int32

In [50]:
# Comparing the frame with a scalar is applied element-wise and yields a frame of booleans
df_obj < 0.2

Unnamed: 0,column1,column2,column3,column4,column5,column7
row1,False,False,False,True,False,True
row2,False,False,False,False,False,True
row3,False,False,True,False,False,False
row4,False,False,False,False,False,False
row5,False,False,True,False,False,False
row6,False,False,False,False,False,False


#### Filtering with scalars

In [55]:
# Boolean-mask filtering (as in R): square brackets keep only the entries where the condition is True
series_obj[series_obj > 6]

row8    7
dtype: int32

#### Setting values with scalars

In [57]:
# Assign the scalar 8 to several labels at once. The labels are passed to .loc
# as a list; a bare comma-separated key ('row1', 'row5', 'row8') is a single
# tuple key, and no such tuple exists among the labels 'row1'..'row8'.
# Note: this changes series_obj in place, so every later cell sees the new values.
series_obj.loc[['row1', 'row5', 'row8']] = 8
series_obj

row1    8
row2    1
row3    2
row4    3
row5    8
row6    5
row7    6
row8    8
dtype: int32

In [81]:
# First two rows, selected explicitly by integer position
df_obj.iloc[0:2]

Unnamed: 0,column1,column2,column3,column4,column5,column7
row1,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row2,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041


In [89]:
# Label slices include both endpoints: rows up to and including 'row2',
# columns 'column1' through 'column5'
df_obj.loc[:'row2', 'column1':'column5']

Unnamed: 0,column1,column2,column3,column4,column5
row1,0.870124,0.582277,0.278839,0.185911,0.4111
row2,0.684969,0.437611,0.556229,0.36708,0.402366
