# Introduction to `pandas`

In [1]:
np.random.seed(123)

## Series

### Numeric data

In [2]:
age = pd.Series([23, 17, 22, 37, 42], name='years')
age

0    23
1    17
2    22
3    37
4    42
Name: years, dtype: int64

In [3]:
type(age)

pandas.core.series.Series

In [4]:
age.index

Int64Index([0, 1, 2, 3, 4], dtype='int64')

In [5]:
age.sort_values()

1    17
2    22
0    23
3    37
4    42
Name: years, dtype: int64

In [6]:
age.nsmallest(3)

1    17
2    22
0    23
Name: years, dtype: int64

In [7]:
age.values

array([23, 17, 22, 37, 42])

### String data

In [8]:
species = pd.Series(['mouse', 'mouse', 'human', 'human', 'mouse', 'mouse'])

In [9]:
species.sort_values()

2    human
3    human
0    mouse
1    mouse
4    mouse
5    mouse
dtype: object

In [10]:
species.unique()

array(['mouse', 'human'], dtype=object)

In [11]:
species.str.title()

0    Mouse
1    Mouse
2    Human
3    Human
4    Mouse
5    Mouse
dtype: object

In [12]:
species.str[2:4]

0    us
1    us
2    ma
3    ma
4    us
5    us
dtype: object

In [13]:
species.replace({'mosue': 'mus musculus', 'human': 'homo sapiens'})

0           mouse
1           mouse
2    homo sapiens
3    homo sapiens
4           mouse
5           mouse
dtype: object

### Cateogrical data

In [14]:
species = species.astype('category')

In [15]:
species.cat.codes

0    1
1    1
2    0
3    0
4    1
5    1
dtype: int8

In [16]:
species.cat.categories

Index(['human', 'mouse'], dtype='object')

## DataFrame

### Read CSV

In [17]:
iris_1 = pd.read_csv('iris.csv')

In [18]:
iris_1.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Write CSV

In [19]:
iris_1.to_csv('iris_1.csv', index=False)

### Read Excel

In [20]:
iris_2 = pd.read_excel('iris.xlsx')

In [21]:
iris_2.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Write Excel

In [22]:
iris_2.to_excel('iris_2.xlsx', index=False)

### Check files using Unix shell commands

In [23]:
! ls

Data Manipulation.ipynb      petal.csv
Introduction to Pandas.ipynb schhedule.md
iris.csv                     sepal.csv
iris.xlsx                    setosa.csv
iris_1.csv                   versicolor.csv
iris_2.xlsx                  virginica.csv


In [24]:
! head -n 5 iris_1.csv | cat

Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa


## Combining data sets

### Combining rows

In [25]:
df_versiocolor = pd.read_csv('versicolor.csv')
df_virginica = pd.read_csv('virginica.csv')
df_sertosa = pd.read_csv('setosa.csv')
dfs = [df_versiocolor, df_virginica, df_sertosa]

In [26]:
[df.shape for df in dfs]

[(50, 5), (50, 5), (50, 5)]

#### Each DataFrame only contains data about one species

In [27]:
for df in dfs:
    print(df.Species.unique())

['versicolor']
['virginica']
['setosa']


#### Combine with `concat`

In [28]:
df = pd.concat(dfs)
df.shape

(150, 5)

#### Combined DataFrame contains all 3 species

In [29]:
df.Species.unique()

array(['versicolor', 'virginica', 'setosa'], dtype=object)

### Combining columns

In [30]:
df_sepal = pd.read_csv('sepal.csv')
df_petal = pd.read_csv('petal.csv')

In [31]:
df_sepal.head(3)

Unnamed: 0,Species,Sepal.Length,Sepal.Width
0,setosa,5.1,3.5
1,setosa,4.9,3.0
2,setosa,4.7,3.2


In [32]:
df_petal.head(3)

Unnamed: 0,Species,Petal.Length,Petal.Width
0,setosa,1.4,0.2
1,setosa,1.4,0.2
2,setosa,1.3,0.2


In [33]:
df_sepal.shape, df_petal.shape

((150, 3), (150, 3))

In [34]:
df = pd.merge(df_sepal, df_petal, on = 'Species', left_index=True, right_index=True)
df.shape

(150, 5)

In [35]:
df.head(3)

Unnamed: 0,Species,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2


#### Joininig on a single unique column

Combining values for the same subject across different messurements.

In [36]:
pid1 = np.random.choice(100, 6, replace=False)
pid1

array([ 8, 70, 82, 28, 63,  0])

In [37]:
val1 = np.random.normal(10, 1, 6)
val1

array([ 10.46843912,   9.16884502,  11.16220405,   8.90279695,
         7.87689965,  11.03972709])

In [38]:
df1 = pd.DataFrame({'pid': pid1, 'val': val1})
df1

Unnamed: 0,pid,val
0,8,10.468439
1,70,9.168845
2,82,11.162204
3,28,8.902797
4,63,7.8769
5,0,11.039727


In [39]:
pid2 = np.random.permutation(pid1)
pid2

array([28, 82,  8, 70, 63,  0])

In [40]:
val2 = np.random.normal(15, 1, 6)
val2

array([ 14.16248328,  13.39403724,  16.25523737,  14.31113102,
        16.66095249,  15.80730819])

In [41]:
df2 = pd.DataFrame({'pid': pid2, 'val': val2})
df2

Unnamed: 0,pid,val
0,28,14.162483
1,82,13.394037
2,8,16.255237
3,70,14.311131
4,63,16.660952
5,0,15.807308


In [42]:
pd.merge(df1, df2, on='pid', suffixes=['_visit_1', '_visit_2'])

Unnamed: 0,pid,val_visit_1,val_visit_2
0,8,10.468439,16.255237
1,70,9.168845,14.311131
2,82,11.162204,13.394037
3,28,8.902797,14.162483
4,63,7.8769,16.660952
5,0,11.039727,15.807308


#### Joining on two unique columns

In [43]:
df1['stim'] = np.random.choice(['cmv', 'flu'], 6, replace=True)
df1 = df1[['pid', 'stim', 'val']]
df1

Unnamed: 0,pid,stim,val
0,8,cmv,10.468439
1,70,flu,9.168845
2,82,cmv,11.162204
3,28,cmv,8.902797
4,63,cmv,7.8769
5,0,cmv,11.039727


In [44]:
df2['stim'] = np.random.choice(['cmv', 'flu'], 6, replace=True)
df2 = df2[['pid', 'stim', 'val']]
df2

Unnamed: 0,pid,stim,val
0,28,flu,14.162483
1,82,cmv,13.394037
2,8,flu,16.255237
3,70,flu,14.311131
4,63,cmv,16.660952
5,0,cmv,15.807308


In [45]:
pd.merge(df1, df2, on = ['pid', 'stim'], suffixes = ['_visit_1', '_visit_2'])

Unnamed: 0,pid,stim,val_visit_1,val_visit_2
0,70,flu,9.168845,14.311131
1,82,cmv,11.162204,13.394037
2,63,cmv,7.8769,16.660952
3,0,cmv,11.039727,15.807308


In [46]:
pd.merge(df1, df2, on = ['pid', 'stim'], how = 'left', suffixes = ['_visit_1', '_visit_2'])

Unnamed: 0,pid,stim,val_visit_1,val_visit_2
0,8,cmv,10.468439,
1,70,flu,9.168845,14.311131
2,82,cmv,11.162204,13.394037
3,28,cmv,8.902797,
4,63,cmv,7.8769,16.660952
5,0,cmv,11.039727,15.807308


In [47]:
pd.merge(df1, df2, on = ['pid', 'stim'], how = 'right', suffixes = ['_visit_1', '_visit_2'])

Unnamed: 0,pid,stim,val_visit_1,val_visit_2
0,70,flu,9.168845,14.311131
1,82,cmv,11.162204,13.394037
2,63,cmv,7.8769,16.660952
3,0,cmv,11.039727,15.807308
4,28,flu,,14.162483
5,8,flu,,16.255237


In [48]:
pd.merge(df1, df2, on = ['pid', 'stim'], how = 'outer', suffixes = ['_visit_1', '_visit_2'])

Unnamed: 0,pid,stim,val_visit_1,val_visit_2
0,8,cmv,10.468439,
1,70,flu,9.168845,14.311131
2,82,cmv,11.162204,13.394037
3,28,cmv,8.902797,
4,63,cmv,7.8769,16.660952
5,0,cmv,11.039727,15.807308
6,28,flu,,14.162483
7,8,flu,,16.255237


## Reshaping `DataFrame`

## Hierarchical Indices

In [49]:
%load_ext version_information

In [50]:
%version_information

Software,Version
Python,3.5.1 64bit [GCC 4.2.1 (Apple Inc. build 5577)]
IPython,4.1.2
OS,Darwin 15.6.0 x86_64 i386 64bit
Sun Aug 14 23:09:51 2016 EDT,Sun Aug 14 23:09:51 2016 EDT
