## Lesson 7 - NumPy and Pandas for 2D Data

## Two-Dimensional NumPy Arrays

In [1]:
import numpy as np

print '\nSubway ridership for 5 stations on 10 different days:'
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

# Change False to True for each block of code to see what it does

print '\nAccessing elements:'
if True:
    print ridership[1, 3]
    print ridership[1:3, 3:5]
    print ridership[1, :]
    
print '\nVectorized operations on rows or columns:'
if True:
    print ridership[0, :] + ridership[1, :]
    print ridership[:, 0] + ridership[:, 1]
    
print '\nVectorized operations on entire arrays:'
if True:
    a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    print a + b

def mean_riders_for_max_station(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day (!?, over all days is the correct term) for that station. Also
    return the mean ridership overall for comparsion.
    
    Hint: NumPy's argmax() function might be useful:
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html
    '''
    max_station_position = ridership[0, :].argmax()
    overall_mean = ridership.mean() # Replace this with your code
    mean_for_max = ridership[:, max_station_position].mean() # Replace this with your code
    
    return (overall_mean, mean_for_max)

print '\nmean_riders_for_max_station(ridership):'
print mean_riders_for_max_station(ridership)


Subway ridership for 5 stations on 10 different days:

Accessing elements:
2328
[[2328 2539]
 [6461 2691]]
[1478 3877 3674 2328 2539]

Vectorized operations on rows or columns:
[1478 3877 3676 2333 2539]
[   0 5355 5701 4952 6410 5509  324    2 5223 5385]

Vectorized operations on entire arrays:
[[ 2  3  4]
 [ 6  7  8]
 [10 11 12]]

mean_riders_for_max_station(ridership):
(2342.5999999999999, 3239.9000000000001)


## NumPy Axis

In [3]:
# NumPy axis argument
if True:
    a = np.array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]
    ])
    
    print a.sum()
    print a.sum(axis=0)
    print a.sum(axis=1)
    
# Subway ridership for 5 stations on 10 different days
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

def min_and_max_riders_per_day(ridership):
    '''
    Fill in this function. First, for each subway station, calculate the
    mean ridership per day (!?, over all days is the correct term). Then, out of all the subway stations, return the
    maximum and minimum of these values. That is, find the maximum
    mean-ridership-per-day and the minimum mean-ridership-per-day for any
    subway station.
    '''
    mean_ridership_over_days = ridership.mean(axis = 0)
    max_daily_ridership = mean_ridership_over_days.max() 
    # ~! Axis 0 runs over rows, Axis 1 runs over columns, opposite of 0,1 
    # or according to vectors = columns in Linear Algebra 
    
    min_daily_ridership = mean_ridership_over_days.min()
    
    return (max_daily_ridership, min_daily_ridership)

print '\nmin_and_max_riders_per_day(ridership):'
print min_and_max_riders_per_day(ridership)

45
[12 15 18]
[ 6 15 24]

min_and_max_riders_per_day(ridership):
(3239.9000000000001, 1071.2)


## NumPy and Pandas Data Types

In [13]:
import pandas as pd

# Subway ridership for 5 stations on 10 different days
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

# Change False to True for each block of code to see what it does

print '\nDataFrame creation:'
if True:
    # You can create a DataFrame out of a dictionary mapping column names to values
    df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print df_1

    # You can also use a list of lists or a 2D NumPy array
    df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
    print df_2
   

print '\nAccessing elements:'
if True:
    print ridership_df.iloc[0]
    print ridership_df.loc['05-05-11']
    print ridership_df['R003']
    print ridership_df.iloc[1, 3]
    
print '\nAccessing multiple rows:'
if True:
    print ridership_df.iloc[1:4]
    
print '\nAccessing multiple columns:'
if True:
    print ridership_df[['R003', 'R005']]
    
print '\nPandas axis:'
if True:
    df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print df.sum()
    print df.sum(axis=1)
    print df.values.sum()
    
def mean_riders_for_max_station(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day (!?, over all days is the correct term) for that station. Also
    return the mean ridership overall for comparsion.
    
    This is the same as a previous exercise, but this time the
    input is a Pandas DataFrame rather than a 2D NumPy array.
    '''
    max_station_position = ridership.iloc[0].idxmax()
    overall_mean = ridership.values.mean() # Replace this with your code
    mean_for_max = ridership[max_station_position].mean() # Replace this with your code
    
    return (overall_mean, mean_for_max)

print '\nmean_riders_for_max_station(ridership_df):'
print mean_riders_for_max_station(ridership_df)


DataFrame creation:
   A  B
0  0  3
1  1  4
2  2  5
   A  B  C
0  0  1  2
1  3  4  5

Accessing elements:
R003    0
R004    0
R005    2
R006    5
R007    0
Name: 05-01-11, dtype: int64
R003    1608
R004    4802
R005    3932
R006    4477
R007    2705
Name: 05-05-11, dtype: int64
05-01-11       0
05-02-11    1478
05-03-11    1613
05-04-11    1560
05-05-11    1608
05-06-11    1576
05-07-11      95
05-08-11       2
05-09-11    1438
05-10-11    1342
Name: R003, dtype: int64
2328

Accessing multiple rows:
          R003  R004  R005  R006  R007
05-02-11  1478  3877  3674  2328  2539
05-03-11  1613  4088  3991  6461  2691
05-04-11  1560  3392  3826  4787  2613

Accessing multiple columns:
          R003  R005
05-01-11     0     2
05-02-11  1478  3674
05-03-11  1613  3991
05-04-11  1560  3826
05-05-11  1608  3932
05-06-11  1576  3909
05-07-11    95   255
05-08-11     2     1
05-09-11  1438  3589
05-10-11  1342  4009

Pandas axis:
A     3
B    12
dtype: int64
0    3
1    5
2    7
dtype: int64
1

## Calculating Correlation, Pearson's r

In [22]:
filename = 'nyc-subway-weather.csv'
subway_df = pd.read_csv(filename)

print subway_df.head()

def correlation(x, y):
    '''
    Fill in this function to compute the correlation between the two
    input variables. Each input is either a NumPy array or a Pandas
    Series.
    
    correlation = average of (x in standard units) times (y in standard units)
    
    Remember to pass the argument "ddof=0" to the Pandas std() function!
    '''
    x_std_units = (x - x.mean())/x.std(ddof=0)
    y_std_units = (y - y.mean())/y.std(ddof=0)
    return (x_std_units * y_std_units).mean()

entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

print '\ncorrelation(entries, rain)'
print correlation(entries, rain)

print '\ncorrelation(entries, temp)'
print correlation(entries, temp)

print '\ncorrelation(rain, temp)'
print correlation(rain, temp)

print '\ncorrelation(entries, cum_entries)'
print correlation(entries, cum_entries)

   UNIT     DATEn     TIMEn  ENTRIESn   EXITSn  ENTRIESn_hourly  \
0  R003  05-01-11  00:00:00   4388333  2911002              0.0   
1  R003  05-01-11  04:00:00   4388333  2911002              0.0   
2  R003  05-01-11  12:00:00   4388333  2911002              0.0   
3  R003  05-01-11  16:00:00   4388333  2911002              0.0   
4  R003  05-01-11  20:00:00   4388333  2911002              0.0   

   EXITSn_hourly             datetime  hour  day_week     ...       pressurei  \
0            0.0  2011-05-01 00:00:00     0         6     ...           30.22   
1            0.0  2011-05-01 04:00:00     4         6     ...           30.25   
2            0.0  2011-05-01 12:00:00    12         6     ...           30.28   
3            0.0  2011-05-01 16:00:00    16         6     ...           30.26   
4            0.0  2011-05-01 20:00:00    20         6     ...           30.28   

  rain  tempi  wspdi meanprecipi  meanpressurei  meantempi  meanwspdi  \
0    0   55.9    3.5         0.0     

## DataFrame Vectorized Operations

In [37]:
# Examples of vectorized operations on DataFrames:
# Change False to True for each block of code to see what it does

print '\nAdding DataFrames with the column names'
if True:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
    df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
    print df1 + df2
    
print '\nAdding DataFrames with overlapping column names'
if True:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
    df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
    print df1 + df2

print '\nAdding DataFrames with overlapping row indexes'
if True:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                       index=['row1', 'row2', 'row3'])
    df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                       index=['row4', 'row3', 'row2'])
    print df1 + df2

# --- Quiz ---
print '\nCumulative entries and exits for one station for a few hours'
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
})

def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Fill in this function to take a DataFrame with cumulative entries
    and exits (entries in the first column, exits in the second) and
    return a DataFrame with hourly entries and exits (entries in the
    first column, exits in the second).
    '''
    print entries_and_exits
    print entries_and_exits.shift(1)
    return entries_and_exits - entries_and_exits.shift(1)
    #return entries_and_exits.diff()

print '\nget_hourly_entries_and_exits(entries_and_exits)'
print get_hourly_entries_and_exits(entries_and_exits)


Adding DataFrames with the column names
    a   b   c
0  11  44  77
1  22  55  88
2  33  66  99

Adding DataFrames with overlapping column names
    a   b   c   d
0 NaN  74  47 NaN
1 NaN  85  58 NaN
2 NaN  96  69 NaN

Adding DataFrames with overlapping row indexes
         a     b     c
row1   NaN   NaN   NaN
row2  32.0  65.0  98.0
row3  23.0  56.0  89.0
row4   NaN   NaN   NaN

Cumulative entries and exits for one station for a few hours

get_hourly_entries_and_exits(entries_and_exits)
   ENTRIESn   EXITSn
0   3144312  1088151
1   3144335  1088159
2   3144353  1088177
3   3144424  1088231
4   3144594  1088275
5   3144808  1088317
6   3144895  1088328
7   3144905  1088331
8   3144941  1088420
9   3145094  1088753
    ENTRIESn     EXITSn
0        NaN        NaN
1  3144312.0  1088151.0
2  3144335.0  1088159.0
3  3144353.0  1088177.0
4  3144424.0  1088231.0
5  3144594.0  1088275.0
6  3144808.0  1088317.0
7  3144895.0  1088328.0
8  3144905.0  1088331.0
9  3144941.0  1088420.0
   ENTRIESn  

## DataFrame applymap()

In [46]:
# Change False to True for this block of code to see what it does

print '\nDataFrame applymap()'
if True:
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [10, 20, 30],
        'c': [5, 10, 15]
    })
    
    def add_one(x):
        return x + 1
        
    print df.applymap(add_one)
    
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
    
def convert_grades(grades):
    '''
    Fill in this function to convert the given DataFrame of numerical
    grades to letter grades. Return a new DataFrame with the converted
    grade.
    
    The conversion rule is:
        90-100 -> A
        80-89  -> B
        70-79  -> C
        60-69  -> D
        0-59   -> F
    '''
    def numerical_to_letter_grades(grade):
        if grade <= 59:
            return 'F'
        elif grade <= 69:
            return 'D'
        elif grade <= 79:
            return 'C'
        elif grade <= 89:
            return 'B'
        elif grade <= 100:
            return 'A'
        else:
            return 'N/A'
        
    return grades.applymap(numerical_to_letter_grades)

print '\nconvert_grades(grades_df)'
print convert_grades(grades_df)


DataFrame applymap()
   a   b   c
0  2  11   6
1  3  21  11
2  4  31  16

convert_grades(grades_df)
        exam1 exam2
Andre       F     F
Barry       B     D
Chris       C     F
Dan         C     F
Emilio      B     D
Fred        C     F
Greta       A     C
Humbert     D     F
Ivan        A     C
James       B     D


## DataFrame apply()

In [62]:
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

# Change False to True for this block of code to see what it does

print '\nDataFrame apply()'
if True:
    def convert_grades_curve(exam_grades):
        # Pandas has a bult-in function that will perform this calculation
        # This will give the bottom 0% to 10% of students the grade 'F',
        # 10% to 20% the grade 'D', and so on. You can read more about
        # the qcut() function here:
        # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
        return pd.qcut(exam_grades,
                       [0, 0.1, 0.2, 0.5, 0.8, 1],
                       labels=['F', 'D', 'C', 'B', 'A'])
        
    # qcut() operates on a list, array, or Series. This is the
    # result of running the function on a single column of the
    # DataFrame.
    print "convert_grades_curve(grades_df['exam1'])"
    print convert_grades_curve(grades_df['exam1'])
    
    # qcut() does not work on DataFrames, but we can use apply()
    # to call the function on each column separately
    print 'grades_df.apply(convert_grades_curve)'
    print grades_df.apply(convert_grades_curve)
    
def standardize(df):
    '''
    Fill in this function to standardize each column of the given
    DataFrame. To standardize a variable, convert each value to the
    number of standard deviations it is above or below the mean.
    '''
    def standardize_column(series):
        return (series - series.mean())/series.std(ddof = 0)
    
    return df.apply(standardize_column)


print '\nstandardize(grades_df)'
print type(standardize(grades_df))
print standardize(grades_df)


DataFrame apply()
convert_grades_curve(grades_df['exam1'])
Andre      F
Barry      B
Chris      C
Dan        C
Emilio     B
Fred       C
Greta      A
Humbert    D
Ivan       A
James      B
Name: exam1, dtype: category
Categories (5, object): [F < D < C < B < A]
grades_df.apply(convert_grades_curve)
        exam1 exam2
Andre       F     F
Barry       B     B
Chris       C     C
Dan         C     C
Emilio      B     B
Fred        C     C
Greta       A     A
Humbert     D     D
Ivan        A     A
James       B     B

standardize(grades_df)
<class 'pandas.core.frame.DataFrame'>
            exam1     exam2
Andre   -2.315341 -2.304599
Barry    0.220191  0.386400
Chris    0.020017 -0.096600
Dan     -0.180156 -0.096600
Emilio   0.753987  0.662400
Fred    -0.513779 -0.441600
Greta    0.887436  1.490400
Humbert -0.847401 -0.786600
Ivan     1.354508  1.007400
James    0.620538  0.179400


## DataFrame apply() Use Case 2

In [118]:
df = pd.DataFrame({
    'a': [4, 5, 3, 1, 2],
    'b': [20, 10, 40, 50, 30],
    'c': [25, 20, 5, 15, 10]
})

# Change False to True for this block of code to see what it does

print '\nDataFrame apply() - use case 2'
if True:   
    print df.apply(np.mean)
    print df.apply(np.max)
    max_df = df['a'].loc[df['a'] == df['a'].max()]
    print df['a'].drop(max_df.index)
    
def second_largest(df):
    '''
    Fill in this function to return the second-largest value of each 
    column of the input DataFrame.
    '''
    def drop_second_largest_in_a_series(series):
        max_pos = series.loc[series == series.max()]
        return series.drop(max_pos.index)
        
    return df.apply(drop_second_largest_in_a_series).max()

print '\nsecond_largest(df) #Bug that drops max value'
print second_largest(df)

def second_largest(df):
    '''
    Fill in this function to return the second-largest value of each 
    column of the input DataFrame.
    '''
    def second_largest_in_a_series(series):
        return series.sort_values(ascending=False).iloc[1]
        #return series[series.sort_values(ascending=False, inplace=False).index[1]]
        
    return df.apply(second_largest_in_a_series)
    #return second_largest_in_a_series(df['a'])

print '\nsecond_largest(df)'
print second_largest(df)


DataFrame apply() - use case 2
a     3.0
b    30.0
c    15.0
dtype: float64
a     5
b    50
c    25
dtype: int64
0    4
2    3
3    1
4    2
Name: a, dtype: int64

second_largest(df) #Bug that drops max value
a     4.0
b    40.0
c    20.0
dtype: float64

second_largest(df)
a     4
b    40
c    20
dtype: int64
