# Pandas

Pandas provides data frames (similar to those in R) for Python.

In [None]:
import pandas
import numpy as np
%matplotlib inline

## Series

In [None]:
series = pandas.Series(np.random.standard_normal(10))

In [None]:
series

In [None]:
series[5]

In [None]:
series[5:7]

In [None]:
series.index = (list('abcdefghij'))

In [None]:
series

In [None]:
series['a']

In [None]:
pandas.Series({'a': 12, 'b': -12})

In [None]:
series['a':'d']

In [None]:
series + 2

In [None]:
series * series

In [None]:
series + pandas.Series([1,2], index=['a', 'z'])

## Data Frame

In [None]:
# Small example frame: one row per fish, indexed by species name.
# The first label was misspelled 'Brown Troupt'.
fish = pandas.DataFrame(
    {'size': [100, 120, 70],
     'weight': [20, 30, 25]},
    index=['Brown Trout', 'Atlantic Salmon', 'Chinook Salmon'])

In [None]:
fish

In [None]:
fish[fish.weight > 20]

In [None]:
# Rows with weight below 30, 'size' column only, selected in one .loc step.
fish.loc[fish.weight < 30, 'size']

## Reading data

In [None]:
# List every top-level pandas name that starts with 'read'
# (the reader functions such as read_csv).
for name in dir(pandas):
    if name.startswith('read'):
        # Call form of print: the old `print i` statement is a SyntaxError
        # on Python 3.
        print(name)

In [None]:
# Has also as_ methods

In [None]:
df = pandas.read_csv('data/201508_trip_data.csv.gz')

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.Duration.plot.hist()

In [None]:
df[df.Duration < 60*45].Duration.plot.hist(bins = 30)

In [None]:
import datetime
def myd_hm(datetimestring):
    """Convert a 'month/day/year hour:minute' string into a datetime.

    Raises ValueError if the text does not match that layout.
    """
    layout = '%m/%d/%Y %H:%M'
    return datetime.datetime.strptime(datetimestring, layout)
# Parse the whole column in one vectorised call instead of one Python-level
# strptime call per row; the format string is the one myd_hm uses.
df['Start Date'] = pandas.to_datetime(df['Start Date'], format='%m/%d/%Y %H:%M')

In [None]:
# Count non-null Duration values per start station, sort ascending and keep
# the last rows returned by tail() (five by default): the busiest stations.
stations = (
    df.groupby('Start Station')['Duration']
    .count()
    .sort_values()
    .tail()
    .index
    .tolist()
)

In [None]:
stations

In [None]:
# Keep only the trips that start at one of the selected stations.
# isin() does the membership test in one vectorised call.
df = df[df['Start Station'].isin(stations)]

In [None]:
departures = df[['Start Station', 'Duration', 'Start Date']]

In [None]:
departures.head()

## Pivoting / Stacking

In [None]:
fish

In [None]:
fish.stack()

In [None]:
stacked = fish.stack().reset_index()

In [None]:
stacked

In [None]:
stacked.columns = ['name', 'info', 'value']

In [None]:
stacked

In [None]:
stacked.pivot(index='name', columns='info', values='value')

In [None]:
pivoted = departures.pivot_table(index='Start Date', columns='Start Station', values='Duration')

In [None]:
pivoted.head()

## Time Series

In [None]:
# Resample to one row per calendar day and average within each day.
# '1D' uses the canonical upper-case day alias.
daily_averages = pivoted.resample('1D').mean()

In [None]:
daily_averages.head()

In [None]:
# Select all rows from 2014 by partial date string. This must go through
# .loc: plain [] on a DataFrame looks up columns, and pandas 2.0 removed the
# old fallback to row selection.
daily_averages.loc['2014'].head()

In [None]:
# Select all rows from October 2014 by partial date string. This must go
# through .loc: plain [] on a DataFrame looks up columns, and pandas 2.0
# removed the old fallback to row selection.
daily_averages.loc['2014-10'].head()

In [None]:
# Label-based slice on the date index, written explicitly with .loc.
daily_averages.loc['2014-10-15':'2014-10-20']

## Groupby

In [None]:
groupby_example = pandas.DataFrame({'key': ['a', 'b', 'a', 'b'], 'value': [1,2,1,2]})

In [None]:
groupby_example

In [None]:
groupby_example.groupby('key').mean()

In [None]:
groupby_example.groupby('key').sum()

In [None]:
daily_averages.index

In [None]:
daily_averages['Weekday'] = daily_averages.index.weekday

In [None]:
mean_weekday = daily_averages.groupby('Weekday').mean()

In [None]:
mean_weekday.plot(kind='bar')

In [None]:
import calendar

In [None]:
daily_averages['Weekday'] = daily_averages['Weekday'].apply(lambda x: calendar.day_abbr[x])

In [None]:
daily_averages.groupby('Weekday').mean().plot.bar()

In [None]:
means = daily_averages.groupby('Weekday').mean()

In [None]:
means.index

In [None]:
reverse_days = {calendar.day_abbr[i]: i for i in range(7)}

In [None]:
daynums = [reverse_days[i] for i in means.index]

In [None]:
import numpy as np

In [None]:
np.argsort(daynums)

In [None]:
means.iloc[np.argsort(daynums)].plot.bar()

In [None]:
daily_averages['Month'] = daily_averages.index.month

In [None]:
import matplotlib.pyplot as plt

In [None]:
# 'Weekday' holds day-name strings at this point, so restrict the mean to
# numeric columns; pandas >= 2.0 raises on non-numeric columns otherwise.
daily_averages.groupby('Month').mean(numeric_only=True).plot.bar()

In [None]:
# Same monthly means with a fixed y-range and a larger figure.
# 'Weekday' holds day-name strings at this point, so restrict the mean to
# numeric columns; pandas >= 2.0 raises on non-numeric columns otherwise.
daily_averages.groupby('Month').mean(numeric_only=True).plot.bar(ylim=(0, 3000), figsize=(12, 10))

In [None]:
df.Duration.hist()

In [None]:
df[df.Duration > 1e6]

In [None]:
df.Duration.idxmax()

In [None]:
df.loc[df.Duration.idxmax()]

In [None]:
df.Duration[df.Duration < 60*30].hist(bins = 30)

In [None]:
# Histogram of the hour of day in which trips start. The .dt accessor
# extracts the hour for the whole column without a per-row lambda
# (requires 'Start Date' to be a datetime column, as converted earlier).
df['Start Date'].dt.hour.hist(bins=24)

In [None]:
import numpy as np

In [None]:
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 14, 10

In [None]:
(df
 .groupby('Start Station', as_index=False)['Duration']
 .median()
 .plot
 .bar(x = 'Start Station', y = 'Duration'))

In [None]:
# Fraction of trips per start station whose "Subscriber Type" is
# "Subscriber". The mean of a boolean Series is the share of True values,
# so a grouped mean replaces the Python-level sum()/len() per group.
subs_fraction = (
    (df["Subscriber Type"] == "Subscriber")
    .groupby(df["Start Station"])
    .mean()
)

In [None]:
subs_fraction.head()

In [None]:
subs_fraction.hist(bins=30)

In [None]:
subs_fraction.idxmin()

In [None]:
subs_fraction.idxmax()