# Lightning Talk - IPython Notebook & pandas

* Perform analysis and easily share results
* Publish to HTML using nbconvert
* Supports Markdown and Python
* Use Python packages like Matplotlib and Seaborn to visualize data

In [None]:
#imports
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
import seaborn as sns
from IPython.display import Image
# Render matplotlib figures inline, below the cell that produces them.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

In [None]:
# Display a panda photo loaded from a remote URL.
panda_photo_url = 'http://static.boredpanda.com/blog/wp-content/uploads/2015/07/panda-daycare-nursery-chengdu-research-base-breeding-8.jpg'
Image(panda_photo_url)

### Simple Stuff

In [None]:
# Titanic passenger data; source: https://www.kaggle.com/c/titanic/data
titanic_csv = "assets/titanic.csv"
df = pd.read_csv(titanic_csv)
# Preview the first ten rows.
df.head(n=10)

In [None]:
# Sort by age, then sex, both in descending order, with NaN values placed first.
# (Leaving out na_position falls back to the pandas default, which puts NaN last.)
sort_keys = ['age', 'sex']
df.sort_values(sort_keys, ascending=False, na_position='first')

In [None]:
# Tally the missing values in each column.
# (df.count() reports the complementary figure: it excludes null values.)
df.isnull().sum()

In [None]:
# Rows whose 'age' value is missing.
df.loc[df['age'].isnull()]

In [None]:
# Keep only the rows that have an 'age' value.
df_hasAge = df.loc[df['age'].notnull()]
df_hasAge

In [None]:
# Discard every row that has a missing value in any column.
# (The opposite selection, rows with at least one null, would be
#  df[df.isnull().any(axis=1)].)
df_dropNull = df.dropna(axis=0, how='any')
df_dropNull

In [None]:
# Summary statistics for the null-free rows where the 'survived' column equals 1.
survived = df_dropNull.loc[df_dropNull['survived'].eq(1)]
survived.describe()

In [None]:
# Normalised histogram of the 'age' column for the `survived` subset.
# Pandas-only alternative: survived['age'].plot(kind='hist')
# NOTE(review): distplot is deprecated in recent seaborn releases -- confirm the
# installed version still provides it.
sns.distplot(a=survived.age, norm_hist=True, color='green')

In [None]:
# Normalised histogram of the 'fare' column for the `survived` subset.
# For comparison, the ages of the rows with survived == 0 could be drawn in red:
#   notsurvived = df_dropNull[(df_dropNull.survived == 0)]
#   sns.distplot(notsurvived['age'], color='red', norm_hist=True)
# NOTE(review): distplot is deprecated in recent seaborn releases -- confirm the
# installed version still provides it.
sns.distplot(a=survived.fare, norm_hist=True, color='blue')

### Replace Null Values

In [None]:
# Admissions data set: load it and show the non-null entry count per column.
admissions_csv = "assets/admissions.csv"
adf = pd.read_csv(admissions_csv)
adf.count()

In [None]:
# Look at which data are missing (using the not-normalized data): keep every
# row that has a null in at least one column.
adf_allNull = adf[adf.isnull().any(axis=1)]
# A bare final expression is rendered with the notebook's rich display and,
# unlike the Python 2 `print adf_allNull` statement, is valid on Python 2 and 3.
adf_allNull

In [None]:
# Another method: fill the gaps with scikit-learn's imputer.
# SimpleImputer replaces sklearn.preprocessing.Imputer, which was deprecated in
# scikit-learn 0.20 and removed in 0.22.
from sklearn.impute import SimpleImputer

# Work on a deep copy so the original admissions frame is left untouched.
adf_copy = adf.copy(deep=True)

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

# The imputers expect 2-D input, hence the double-bracket column selection.
# ravel() flattens the single-column result so it is assigned positionally
# instead of being index-aligned through an intermediate DataFrame.
adf_copy['gre'] = imp_mean.fit_transform(adf_copy[['gre']]).ravel()
adf_copy['gpa'] = imp_median.fit_transform(adf_copy[['gpa']]).ravel()
adf_copy['prestige'] = imp_median.fit_transform(adf_copy[['prestige']]).ravel()
# Spot-check a single row -- presumably position 187 originally had a missing
# value; TODO confirm against the null-row listing.
adf_copy.iloc[187]

#adf_corr = adf.corr()
#sns.heatmap(adf.corr(), annot=True, cmap='RdBu')

### Log transform & Linear Regression Plot

In [None]:
# Mammal sleep data set: load it and preview the first rows.
msleep_csv = "assets/msleep.csv"
mammals = pd.read_csv(msleep_csv)
mammals.head()

In [None]:
# Linear-regression plot of 'brainwt' against 'bodywt' on the untransformed data.
# x / y / data are passed by keyword because recent seaborn releases no longer
# accept them positionally.
sns.lmplot(x='bodywt', y='brainwt', data=mammals)

In [None]:
# Build a copy of the mammals frame in which the listed columns are replaced
# by their base-10 logarithm; every other column is carried over unchanged.
log_columns = ['bodywt', 'brainwt']
log_mammals = mammals.assign(
    **mammals[log_columns].apply(np.log10).to_dict(orient='series')
)

In [None]:
# Same regression plot on the log10-transformed columns.
# x / y / data are passed by keyword because recent seaborn releases no longer
# accept them positionally.
sns.lmplot(x='bodywt', y='brainwt', data=log_mammals)