In [1]:
import numpy as np
import pandas as pd

# Manipulating and querying data

## Constructing a DataFrame

In [2]:
subject = ['ann', 'bob', 'charles', 'david', 'ella']
sex = ['F', 'M', 'M', 'M', 'F']
group = ['A', 'A', 'A', 'B', 'B']
age = [23, 34, 27, 30, 27]
score = [89, 90, 78, 98, 90]
df = pd.DataFrame(data = {'name': subject, 'sex': sex, 'age': age, 'group': group, 'score': score})

In [3]:
df

Unnamed: 0,age,group,name,score,sex
0,23,A,ann,89,F
1,34,A,bob,90,M
2,27,A,charles,78,M
3,30,B,david,98,M
4,27,B,ella,90,F


## Getting basic information

In [4]:
df.dtypes

age       int64
group    object
name     object
score     int64
sex      object
dtype: object

In [5]:
df.shape

(5, 5)

In [6]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
df.columns

Index(['age', 'group', 'name', 'score', 'sex'], dtype='object')

In [8]:
df = df[['name', 'sex', 'age', 'group', 'score']]
df

Unnamed: 0,name,sex,age,group,score
0,ann,F,23,A,89
1,bob,M,34,A,90
2,charles,M,27,A,78
3,david,M,30,B,98
4,ella,F,27,B,90


## Row subsets

In [9]:
df.head(2)

Unnamed: 0,name,sex,age,group,score
0,ann,F,23,A,89
1,bob,M,34,A,90


In [10]:
df.tail(2)

Unnamed: 0,name,sex,age,group,score
3,david,M,30,B,98
4,ella,F,27,B,90


In [11]:
df.sample(2)

Unnamed: 0,name,sex,age,group,score
0,ann,F,23,A,89
2,charles,M,27,A,78


## Column subsets

In [12]:
df['name']

0        ann
1        bob
2    charles
3      david
4       ella
Name: name, dtype: object

In [13]:
df.name

0        ann
1        bob
2    charles
3      david
4       ella
Name: name, dtype: object

In [14]:
df[[0]]

Unnamed: 0,name
0,ann
1,bob
2,charles
3,david
4,ella


In [15]:
df[['age', 'sex']]

Unnamed: 0,age,sex
0,23,F
1,34,M
2,27,M
3,30,M
4,27,F


In [16]:
df[[1,2]]

Unnamed: 0,sex,age
0,F,23
1,M,34
2,M,27
3,M,30
4,F,27


## Indexing

In [17]:
df.ix[0,0]

'ann'

In [18]:
df.ix[1,1]

'M'

In [19]:
df.ix[1:4, :2]

Unnamed: 0,name,sex
1,bob,M
2,charles,M
3,david,M
4,ella,F


In [20]:
df.ix[[1,3,4], ['age', 'name']]

Unnamed: 0,age,name
1,34,bob
3,30,david
4,27,ella


## Logical indexing

In [21]:
df[df.age > 25]

Unnamed: 0,name,sex,age,group,score
1,bob,M,34,A,90
2,charles,M,27,A,78
3,david,M,30,B,98
4,ella,F,27,B,90


In [22]:
df[(df.age > 25) & (df.sex == 'M')]

Unnamed: 0,name,sex,age,group,score
1,bob,M,34,A,90
2,charles,M,27,A,78
3,david,M,30,B,98


## Sorting

In [23]:
df.sort_values('age')

Unnamed: 0,name,sex,age,group,score
0,ann,F,23,A,89
2,charles,M,27,A,78
4,ella,F,27,B,90
3,david,M,30,B,98
1,bob,M,34,A,90


In [24]:
df.sort_values('age', ascending = False)

Unnamed: 0,name,sex,age,group,score
1,bob,M,34,A,90
3,david,M,30,B,98
2,charles,M,27,A,78
4,ella,F,27,B,90
0,ann,F,23,A,89


In [25]:
df.sort_values(['age', 'score'], ascending = ['True', 'True'])

Unnamed: 0,name,sex,age,group,score
0,ann,F,23,A,89
2,charles,M,27,A,78
4,ella,F,27,B,90
3,david,M,30,B,98
1,bob,M,34,A,90


In [26]:
df.sort_values(['age', 'score'], ascending = ['True', 'False'], inplace = True)
df

Unnamed: 0,name,sex,age,group,score
0,ann,F,23,A,89
2,charles,M,27,A,78
4,ella,F,27,B,90
3,david,M,30,B,98
1,bob,M,34,A,90


In [27]:
df.sort_index()

Unnamed: 0,name,sex,age,group,score
0,ann,F,23,A,89
1,bob,M,34,A,90
2,charles,M,27,A,78
3,david,M,30,B,98
4,ella,F,27,B,90


## Transformation

In [28]:
df['log_score'] = np.log(df['score'])
df

Unnamed: 0,name,sex,age,group,score,log_score
0,ann,F,23,A,89,4.488636
2,charles,M,27,A,78,4.356709
4,ella,F,27,B,90,4.49981
3,david,M,30,B,98,4.584967
1,bob,M,34,A,90,4.49981


In [29]:
df['adjusted_score'] = np.where(df.sex == 'M', df.score - 10, df.score)
df

Unnamed: 0,name,sex,age,group,score,log_score,adjusted_score
0,ann,F,23,A,89,4.488636,89
2,charles,M,27,A,78,4.356709,68
4,ella,F,27,B,90,4.49981,90
3,david,M,30,B,98,4.584967,88
1,bob,M,34,A,90,4.49981,80


## String Operations

In [30]:
df.name.str[:3]

0    ann
2    cha
4    ell
3    dav
1    bob
Name: name, dtype: object

In [31]:
df.name.str.upper()

0        ANN
2    CHARLES
4       ELLA
3      DAVID
1        BOB
Name: name, dtype: object

In [32]:
df.name.str.replace('a', 'A')

0        Ann
2    chArles
4       ellA
3      dAvid
1        bob
Name: name, dtype: object

## Summary Statistics

In [33]:
df.mean()

age               28.200000
score             89.000000
log_score          4.485986
adjusted_score    83.000000
dtype: float64

In [34]:
df.score.var()

51.0

In [35]:
df.count()

name              5
sex               5
age               5
group             5
score             5
log_score         5
adjusted_score    5
dtype: int64

In [36]:
df.describe()

Unnamed: 0,age,score,log_score,adjusted_score
count,5.0,5.0,5.0,5.0
mean,28.2,89.0,4.485986,83.0
std,4.086563,7.141428,0.082005,9.273618
min,23.0,78.0,4.356709,68.0
25%,27.0,89.0,4.488636,80.0
50%,27.0,90.0,4.49981,88.0
75%,30.0,90.0,4.49981,89.0
max,34.0,98.0,4.584967,90.0


## Split-Apply-Combine

In [37]:
grouped = df.groupby('sex')
grouped.groups

{'F': [0, 4], 'M': [2, 3, 1]}

In [38]:
df.groupby('sex').count()

Unnamed: 0_level_0,name,age,group,score,log_score,adjusted_score
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,2,2,2,2,2,2
M,3,3,3,3,3,3


In [39]:
df.groupby('sex').mean()

Unnamed: 0_level_0,age,score,log_score,adjusted_score
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F,25.0,89.5,4.494223,89.5
M,30.333333,88.666667,4.480495,78.666667


In [40]:
df.groupby(['sex', 'group']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,score,log_score,adjusted_score
sex,group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,A,23.0,89.0,4.488636,89.0
F,B,27.0,90.0,4.49981,90.0
M,A,30.5,84.0,4.428259,74.0
M,B,30.0,98.0,4.584967,88.0


In [41]:
df.groupby(['sex', 'group']).agg(['count', 'mean', 'sum'])

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age,age,score,score,score,log_score,log_score,log_score,adjusted_score,adjusted_score,adjusted_score
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,sum,count,mean,sum,count,mean,sum,count,mean,sum
sex,group,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
F,A,1,23.0,23,1,89,89,1,4.488636,4.488636,1,89,89
F,B,1,27.0,27,1,90,90,1,4.49981,4.49981,1,90,90
M,A,2,30.5,61,2,84,168,2,4.428259,8.856518,2,74,148
M,B,1,30.0,30,1,98,98,1,4.584967,4.584967,1,88,88


In [42]:
grouped[['age', 'score']].agg('mean')

Unnamed: 0_level_0,age,score
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,25.0,89.5
M,30.333333,88.666667


In [43]:
grouped.get_group('F')

Unnamed: 0,name,sex,age,group,score,log_score,adjusted_score
0,ann,F,23,A,89,4.488636,89
4,ella,F,27,B,90,4.49981,90


In [44]:
for name, group in df.groupby('sex'):
    print(name)
    print(group.sum())
    print()

F
name              annella
sex                    FF
age                    50
group                  AB
score                 179
log_score         8.98845
adjusted_score        179
dtype: object

M
name              charlesdavidbob
sex                           MMM
age                            91
group                         ABA
score                         266
log_score                 13.4415
adjusted_score                236
dtype: object



## Pivot tables

In [45]:
from collections import OrderedDict

In [46]:
d = OrderedDict()
d['pid'] = ['1', '1', '1', '1', '1', '1', '2', '2', '2', '2']
d['visit'] = ['1', '1', '2', '2', '3', '3', '1', '1', '2', '2']
d['stim'] = ['cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv']
d['tnf'] = [1.0, 2.0, 1.1, 2.1, 1.2, 2.2, 3, 4, 3.1, 4.1]
d['ifn'] = [11.0, 12.0, 11.1, 12.1, 11.2, 12.2, 13, 14, 13.1, 14.1]
d['il2'] = [0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1]
df = pd.DataFrame(d)

In [47]:
df.head()

Unnamed: 0,pid,visit,stim,tnf,ifn,il2
0,1,1,cmv,1.0,11.0,0.0
1,1,1,hiv,2.0,12.0,0.0
2,1,2,cmv,1.1,11.1,0.1
3,1,2,hiv,2.1,12.1,0.1
4,1,3,cmv,1.2,11.2,0.2


In [48]:
df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['stim'], aggfunc = 'mean')

Unnamed: 0_level_0,ifn,il2,tnf
stim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cmv,11.88,0.1,1.88
hiv,12.88,0.14,2.88


In [49]:
df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['pid'], aggfunc = 'mean')

Unnamed: 0_level_0,ifn,il2,tnf
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,11.6,0.1,1.6
2,13.55,0.15,3.55


In [50]:
df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['pid', 'stim'])

Unnamed: 0_level_0,Unnamed: 1_level_0,ifn,il2,tnf
pid,stim,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,cmv,11.1,0.1,1.1
1,hiv,12.1,0.1,2.1
2,cmv,13.05,0.1,3.05
2,hiv,14.05,0.2,4.05


In [51]:
df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['pid', 'stim'], aggfunc = 'count')

Unnamed: 0_level_0,Unnamed: 1_level_0,ifn,il2,tnf
pid,stim,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,cmv,3,3,3
1,hiv,3,3,3
2,cmv,2,2,2
2,hiv,2,2,2


## Exercises

**1a**. Create two DataFrames from the files `data/set1.csv` and `data/set2.csv` and save as `df1` and `df2` respectively.

**1b**. Show the top and bottom 6 rows from `df1` and `df2`.

**1c**. Create a new DataFrame `df1_wide` from `df1` so that there are two new columns in place of (gene1, gene2 and gene3). The first new column should be named `variable` and contain the names of the gene, and the second should be named `activity` and contain the value. This should have 60 rows.

**1d**. Merge `df1` and `df2` into a single DataFrame named `df`. This should have 7 columns.

**1e**. Find the average values of `age`, `gene1`, `gene2` and `gene3 `grouping by `time`.

**1f**. Sort `df1` by `gene1` in from largest to smallest.

**1g**. Sort `df1` by the average value across `gene1`, `gene2` and `gene3 ` from smallest to largest.

**1h**. Insert a new column `lr_2_1` inot `df1` that is the log ratio of `gene2` to `gene1`.