In [1]:
# Embed a small script and a link in the rendered notebook so a reader can
# toggle the visibility of the underlying Python code.
from IPython.display import HTML
# code_toggle() hides or shows every 'div.input' element depending on the
# code_show flag and then flips the flag. It runs once when the document is
# ready and again each time the link below is clicked.
# NOTE(review): the script relies on jQuery ('$') being available on the page.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To toggle on/off the underlying python code, click <a href="javascript:code_toggle()">here</a>.''')

# Import modules
- pandas is a third-party library; it provides the DataFrame data structure
- os is part of the Python standard library

In [2]:
import pandas as pd
import os

# Load Data
- The data is stored as a CSV file (`iris.data`) in the `data` directory, one level above the current working directory

In [3]:
# DATA_DIR is the parent of the current working directory; the CSV file
# sits in its data/ sub-directory.
DATA_DIR = os.path.join(os.getcwd(), os.pardir)
iris_path = os.path.join(DATA_DIR, 'data', 'iris.data')
iris_df = pd.read_csv(iris_path)

# Show the first rows as the cell's output.
iris_df.head()

Unnamed: 0,sepal_len,sepal_width,petal_len,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Inspect Data

- pandas offers descriptive statistics for numerical columns
- The `describe` method is shown here on the full data set

In [4]:
# Summary statistics over the whole data set, shown as the cell's output.
iris_df.describe()

Unnamed: 0,sepal_len,sepal_width,petal_len,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


## Calculate Stats by Species
- The result is indexed by species and statistic for simple lookup

In [5]:
def describe_all(iris_df, group_col='class'):
    ''' return descriptive stats dataframe by species and stat

    Runs ``DataFrame.describe()`` separately on the rows belonging to each
    distinct value of ``group_col`` and stacks the per-group results into
    one frame.

    Parameters
    ----------
    iris_df : pandas.DataFrame
        Data to summarise; must contain the column named by ``group_col``.
    group_col : str, default 'class'
        Column whose distinct values define the groups.

    Returns
    -------
    pandas.DataFrame
        One row per (group, statistic), indexed by a two-level MultiIndex
        named ``[group_col, 'stat']``. Groups appear in order of first
        appearance in ``iris_df``. When ``iris_df`` has no rows, an empty
        frame with the same index names and the input's numeric columns
        is returned.

    Raises
    ------
    KeyError
        If ``group_col`` is not a column of ``iris_df``.
    '''
    index_names = [group_col, 'stat']
    groups = iris_df[group_col].unique()

    if len(groups) == 0:
        # pd.concat raises ValueError on an empty list, so hand back an
        # empty frame that still has the index layout callers expect.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=index_names)
        value_cols = [col for col in iris_df.select_dtypes(include='number').columns
                      if col != group_col]
        return pd.DataFrame(index=empty_index, columns=value_cols)

    per_group = []
    for group in groups:
        group_stats = iris_df[iris_df[group_col] == group].describe()
        # Tag every statistic row with its group so it can become an index level.
        group_stats[group_col] = group
        per_group.append(group_stats)

    # describe() leaves the statistic names in an unnamed index; reset_index
    # exposes them as an 'index' column, which is renamed to 'stat'.
    return (pd.concat(per_group)
            .reset_index()
            .rename(columns={'index': 'stat'})
            .set_index(index_names))


# Build the per-species summary once and keep it for the lookups below.
descriptive_stats = describe_all(iris_df)
# Display the resulting frame as the cell's output.
descriptive_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,sepal_len,sepal_width,petal_len,petal_width
class,stat,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Iris-setosa,count,50.0,50.0,50.0,50.0
Iris-setosa,mean,5.006,3.418,1.464,0.244
Iris-setosa,std,0.35249,0.381024,0.173511,0.10721
Iris-setosa,min,4.3,2.3,1.0,0.1
Iris-setosa,25%,4.8,3.125,1.4,0.2
Iris-setosa,50%,5.0,3.4,1.5,0.2
Iris-setosa,75%,5.2,3.675,1.575,0.3
Iris-setosa,max,5.8,4.4,1.9,0.6
Iris-versicolor,count,50.0,50.0,50.0,50.0
Iris-versicolor,mean,5.936,2.77,4.26,1.326


# Indexing by location (`.loc`)
- Demonstrates how the MultiIndex helps organize the data: `.loc` can look up a single value or several statistics for one species

In [6]:
# Scalar lookup: the tuple picks one (class, stat) row, the second key picks the column.
descriptive_stats.loc[('Iris-virginica','std'),'petal_width']

0.27465005563666739

In [7]:
# A list in the 'stat' position selects several statistics for one species;
# ':' keeps every column.
descriptive_stats.loc[('Iris-virginica',['min', 'max']),:]

Unnamed: 0_level_0,Unnamed: 1_level_0,sepal_len,sepal_width,petal_len,petal_width
class,stat,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Iris-virginica,min,4.9,2.2,4.5,1.4
Iris-virginica,max,7.9,3.8,6.9,2.5
