# MapReduce
Chien-Lan Hsueh

## Task 1: Split data

In [1]:
# import modules used in this assignment
import os
import functools
from collections import Counter
import numpy as np
import pandas as pd

# load the complete scores data set from the CSV file in the working directory
df_raw = pd.read_csv(filepath_or_buffer="scoresFull.csv")

# preview the top five rows to confirm the file was parsed as expected
display(df_raw.head(n=5))

Unnamed: 0,week,date,day,season,awayTeam,AQ1,AQ2,AQ3,AQ4,AOT,...,homeFumLost,homeNumPen,homePenYds,home3rdConv,home3rdAtt,home4thConv,home4thAtt,homeTOP,HminusAScore,homeSpread
0,1,5-Sep,Thu,2002,San Francisco 49ers,3,0,7,6,-1,...,0,10,80,4,8,0,1,32.47,-3,-4.0
1,1,8-Sep,Sun,2002,Minnesota Vikings,3,17,0,3,-1,...,1,4,33,2,6,0,0,28.48,4,4.5
2,1,8-Sep,Sun,2002,New Orleans Saints,6,7,7,0,6,...,0,8,85,1,6,0,1,31.48,-6,6.0
3,1,8-Sep,Sun,2002,New York Jets,0,17,3,11,6,...,1,10,82,4,8,2,2,39.13,-6,-3.0
4,1,8-Sep,Sun,2002,Arizona Cardinals,10,3,3,7,-1,...,0,7,56,6,10,1,2,34.4,8,6.0


To make our code reusable, we first define a function that splits a data frame by the values of a specified column and saves each subset in a separate data file.

In [2]:
# function to split data
def split_into_batches(df_raw, by, batch_dir = "batches"):
    """
    Split a data frame by the values of one column and save each subset as a CSV file
    # df_raw: data frame to be split
    # by: name of the column whose distinct values define the batches
    # batch_dir: folder for the saved batches (created if it does not exist yet)
    # returns: dictionary mapping str(value) to the sub data frame holding the rows with that value
    #          (the rows keep their original index; the CSV files are written without the index)

    Rows whose `by` value is missing are kept together in their own batch.
    A file that cannot be written is reported with a message and skipped;
    its batch is still part of the returned dictionary.
    """

    column = df_raw[by]

    # split the raw data frame by value of the specified column and save the values and datasets into a dictionary
    batches = {}
    for value in column.unique():
        # a missing value never compares equal to itself, so `column == value` would select
        # no rows at all and the affected rows would be lost - select them with isna() instead
        if pd.api.types.is_scalar(value) and pd.isna(value):
            mask = column.isna()
        else:
            mask = column == value
        batches[str(value)] = df_raw[mask]

    # create a folder for the batch files
    try:
        os.makedirs(batch_dir)
        print(f"Directory {batch_dir} is successfully created.")
    except FileExistsError:
        print(f"Directory {batch_dir} already exists!")

    # loop through each batch and save it separately
    for name, batch in batches.items():
        # create path and file name for this batch
        path = os.path.join(batch_dir, name + ".csv")

        # save in a separate csv file
        try:
            batch.to_csv(path, index=False)
            print(f"File {path} is created.")
        except OSError:
            print(f"Cannot create CSV file {path}!")

    # return the split data
    return batches

It is straightforward to use the defined function to split the data:

In [3]:
dct_batches = split_into_batches(df_raw, by = "season")

Directory batches is successfully created.
File batches\2002.csv is created.
File batches\2003.csv is created.
File batches\2004.csv is created.
File batches\2005.csv is created.
File batches\2006.csv is created.
File batches\2007.csv is created.
File batches\2008.csv is created.
File batches\2009.csv is created.
File batches\2010.csv is created.
File batches\2011.csv is created.
File batches\2012.csv is created.
File batches\2013.csv is created.
File batches\2014.csv is created.


Similarly, we can define a function that reads all of the split data files in a folder:

In [4]:
# function to read separated data files
def read_batches(batch_dir = "batches", fileExt = "csv"):
    """
    Read in batches of data in a specified folder
    # batch_dir: folder to read in
    # fileExt: file type (extension), case-insensitive, with or without the leading dot;
    #          supported: "csv" (pd.read_csv), "xls" and "xlsx" (pd.read_excel)
    # returns: dictionary mapping each file name (without extension) to its data frame,
    #          in sorted file-name order; an empty dictionary if the files cannot be read
    # raises: ValueError for an unsupported file type
    """

    # select how to read in file based on file type
    readers = {"csv": pd.read_csv, "xls": pd.read_excel, "xlsx": pd.read_excel}
    ext = fileExt.lower().lstrip(".")
    if ext not in readers:
        raise ValueError("Not supported file type!")
    reader = readers[ext]
    suffix = "." + ext

    # read in data from each file and save them in a dictionary with the file name as the dictionary key
    try:
        dct_df = {}
        # os.listdir() returns names in arbitrary order; sorting makes the result reproducible
        for file_name in sorted(os.listdir(batch_dir)):
            stem, file_suffix = os.path.splitext(file_name)
            # compare the real extension so that a name like "notescsv" is not picked up
            if file_suffix.lower() == suffix:
                dct_df[stem] = reader(os.path.join(batch_dir, file_name))
    except OSError:
        print("Cannot read in files!")
        # nothing usable was read: hand back an empty result instead of failing on an unbound name
        return {}

    # return the read data sets
    return dct_df

Read in all separated data files in a specific folder and save them into a dictionary of data frames with the file names as the dictionary keys.

In [5]:
# read all separated data files in the folder
dct_batches_read = read_batches()

In [6]:
# check that the batches read back from disk carry the same dictionary keys as the split ones
print(f"Identical keys: {dct_batches_read.keys() == dct_batches.keys()}")

# list every key of the read-in dictionary
print(dct_batches_read.keys())

# show the same batch from both dictionaries, one after the other, to compare them
display(dct_batches["2002"], dct_batches_read["2002"])

Identical keys: True
dict_keys(['2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014'])


Unnamed: 0,week,date,day,season,awayTeam,AQ1,AQ2,AQ3,AQ4,AOT,...,homeFumLost,homeNumPen,homePenYds,home3rdConv,home3rdAtt,home4thConv,home4thAtt,homeTOP,HminusAScore,homeSpread
0,1,5-Sep,Thu,2002,San Francisco 49ers,3,0,7,6,-1,...,0,10,80,4,8,0,1,32.47,-3,-4.0
1,1,8-Sep,Sun,2002,Minnesota Vikings,3,17,0,3,-1,...,1,4,33,2,6,0,0,28.48,4,4.5
2,1,8-Sep,Sun,2002,New Orleans Saints,6,7,7,0,6,...,0,8,85,1,6,0,1,31.48,-6,6.0
3,1,8-Sep,Sun,2002,New York Jets,0,17,3,11,6,...,1,10,82,4,8,2,2,39.13,-6,-3.0
4,1,8-Sep,Sun,2002,Arizona Cardinals,10,3,3,7,-1,...,0,7,56,6,10,1,2,34.40,8,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,Division,12-Jan,Sun,2002,New York Jets,3,7,0,0,-1,...,0,8,70,2,6,1,1,30.93,20,5.5
263,Division,12-Jan,Sun,2002,San Francisco 49ers,3,3,0,0,-1,...,1,10,100,8,10,0,0,36.77,25,6.0
264,ConfChamp,19-Jan,Sun,2002,Tennessee Titans,7,10,7,0,-1,...,1,14,127,1,4,0,0,26.42,17,9.0
265,ConfChamp,19-Jan,Sun,2002,Tampa Bay Buccaneers,10,7,3,7,-1,...,2,5,45,2,7,1,2,29.12,-17,4.0


Unnamed: 0,week,date,day,season,awayTeam,AQ1,AQ2,AQ3,AQ4,AOT,...,homeFumLost,homeNumPen,homePenYds,home3rdConv,home3rdAtt,home4thConv,home4thAtt,homeTOP,HminusAScore,homeSpread
0,1,5-Sep,Thu,2002,San Francisco 49ers,3,0,7,6,-1,...,0,10,80,4,8,0,1,32.47,-3,-4.0
1,1,8-Sep,Sun,2002,Minnesota Vikings,3,17,0,3,-1,...,1,4,33,2,6,0,0,28.48,4,4.5
2,1,8-Sep,Sun,2002,New Orleans Saints,6,7,7,0,6,...,0,8,85,1,6,0,1,31.48,-6,6.0
3,1,8-Sep,Sun,2002,New York Jets,0,17,3,11,6,...,1,10,82,4,8,2,2,39.13,-6,-3.0
4,1,8-Sep,Sun,2002,Arizona Cardinals,10,3,3,7,-1,...,0,7,56,6,10,1,2,34.40,8,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,Division,12-Jan,Sun,2002,New York Jets,3,7,0,0,-1,...,0,8,70,2,6,1,1,30.93,20,5.5
263,Division,12-Jan,Sun,2002,San Francisco 49ers,3,3,0,0,-1,...,1,10,100,8,10,0,0,36.77,25,6.0
264,ConfChamp,19-Jan,Sun,2002,Tennessee Titans,7,10,7,0,-1,...,1,14,127,1,4,0,0,26.42,17,9.0
265,ConfChamp,19-Jan,Sun,2002,Tampa Bay Buccaneers,10,7,3,7,-1,...,2,5,45,2,7,1,2,29.12,-17,4.0


## Task 2: MapReduce

In this section, we will use MapReduce to aggregate a target variable by group. In the lecture, this is demonstrated with a mapping function, a reducer function and `functools.reduce()`. The result, the word counts, is saved in a list of dictionaries. In this homework, we are interested in more than one aggregation, namely:
- sum of the target variable across the grouping variable
- sum of the squared values of the target variable across the grouping variable
- count or number of observations of the target variable in each group

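Therefore, we will save the mapped results in a list of nested dictionaries:
- a list: one element per data set (batch)
- each element is a nested dictionary that maps every level of the grouping variable to a dictionary holding the `sum`, `sum of squared` and `count` of the target variable at that level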

We will start with a list of data frames we read in (Task 1):

In [7]:
# collect the read-in data frames into a list, as required in the homework instruction
lst_df = [*dct_batches_read.values()]

# show the first data frame - it should match the data frame printed at the end of task 1
display(lst_df[0])

Unnamed: 0,week,date,day,season,awayTeam,AQ1,AQ2,AQ3,AQ4,AOT,...,homeFumLost,homeNumPen,homePenYds,home3rdConv,home3rdAtt,home4thConv,home4thAtt,homeTOP,HminusAScore,homeSpread
0,1,5-Sep,Thu,2002,San Francisco 49ers,3,0,7,6,-1,...,0,10,80,4,8,0,1,32.47,-3,-4.0
1,1,8-Sep,Sun,2002,Minnesota Vikings,3,17,0,3,-1,...,1,4,33,2,6,0,0,28.48,4,4.5
2,1,8-Sep,Sun,2002,New Orleans Saints,6,7,7,0,6,...,0,8,85,1,6,0,1,31.48,-6,6.0
3,1,8-Sep,Sun,2002,New York Jets,0,17,3,11,6,...,1,10,82,4,8,2,2,39.13,-6,-3.0
4,1,8-Sep,Sun,2002,Arizona Cardinals,10,3,3,7,-1,...,0,7,56,6,10,1,2,34.40,8,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,Division,12-Jan,Sun,2002,New York Jets,3,7,0,0,-1,...,0,8,70,2,6,1,1,30.93,20,5.5
263,Division,12-Jan,Sun,2002,San Francisco 49ers,3,3,0,0,-1,...,1,10,100,8,10,0,0,36.77,25,6.0
264,ConfChamp,19-Jan,Sun,2002,Tennessee Titans,7,10,7,0,-1,...,1,14,127,1,4,0,0,26.42,17,9.0
265,ConfChamp,19-Jan,Sun,2002,Tampa Bay Buccaneers,10,7,3,7,-1,...,2,5,45,2,7,1,2,29.12,-17,4.0


### Mapping Function
Next, we will define a mapping function for a target variable's `sum`, `sum of square` and `count` by group.

In [8]:
# mapping function for sum, sum of squared and count by group
def map_sum(df, group, variable):
    """
    mapping function for sum, sum of squared and count of target variable by group
    df: data frame
    group: grouping variable (column name)
    variable: target variable (column name)
    returns: dictionary keyed by the levels of the grouping variable; each value is a
             dictionary with the keys "sum", "sum of squared" and "count"
    """
    def _sum_of_squares(values):
        # square every value of the group, then add them up
        return (values*values).sum()

    # (result column name, aggregation) pairs applied to the target variable of each group
    aggregations = [("sum", 'sum'), ("sum of squared", _sum_of_squares), ("count", 'count')]

    # one row per level of the grouping variable, one column per aggregation
    per_level = df.groupby(group)[variable].agg(aggregations)

    # convert to a dictionary with the index (levels of the grouping variable) as keys
    return per_level.to_dict("index")
    
# test of map_sum function
map_sum(lst_df[0], "week", "AQ1")

{'1': {'sum': 81, 'sum of squared': 695, 'count': 16},
 '10': {'sum': 78, 'sum of squared': 686, 'count': 14},
 '11': {'sum': 52, 'sum of squared': 450, 'count': 16},
 '12': {'sum': 68, 'sum of squared': 666, 'count': 16},
 '13': {'sum': 56, 'sum of squared': 534, 'count': 16},
 '14': {'sum': 88, 'sum of squared': 908, 'count': 16},
 '15': {'sum': 50, 'sum of squared': 388, 'count': 16},
 '16': {'sum': 92, 'sum of squared': 906, 'count': 16},
 '17': {'sum': 51, 'sum of squared': 419, 'count': 16},
 '2': {'sum': 67, 'sum of squared': 695, 'count': 16},
 '3': {'sum': 44, 'sum of squared': 314, 'count': 14},
 '4': {'sum': 69, 'sum of squared': 459, 'count': 14},
 '5': {'sum': 30, 'sum of squared': 272, 'count': 14},
 '6': {'sum': 70, 'sum of squared': 708, 'count': 14},
 '7': {'sum': 23, 'sum of squared': 125, 'count': 14},
 '8': {'sum': 34, 'sum of squared': 312, 'count': 14},
 '9': {'sum': 71, 'sum of squared': 661, 'count': 14},
 'ConfChamp': {'sum': 17, 'sum of squared': 149, 'count':

### Map through data sets

In [9]:
# grouping variable
group = "week"

# target variable
variable = "AQ1"

# number of data sets to map over
n_lst = len(lst_df)

# apply the mapping function to every data set
lst_mapped = [map_sum(batch, group, variable) for batch in lst_df]

# show the first mapped result - it should match the output of the map_sum test above
lst_mapped[0]

{'1': {'sum': 81, 'sum of squared': 695, 'count': 16},
 '10': {'sum': 78, 'sum of squared': 686, 'count': 14},
 '11': {'sum': 52, 'sum of squared': 450, 'count': 16},
 '12': {'sum': 68, 'sum of squared': 666, 'count': 16},
 '13': {'sum': 56, 'sum of squared': 534, 'count': 16},
 '14': {'sum': 88, 'sum of squared': 908, 'count': 16},
 '15': {'sum': 50, 'sum of squared': 388, 'count': 16},
 '16': {'sum': 92, 'sum of squared': 906, 'count': 16},
 '17': {'sum': 51, 'sum of squared': 419, 'count': 16},
 '2': {'sum': 67, 'sum of squared': 695, 'count': 16},
 '3': {'sum': 44, 'sum of squared': 314, 'count': 14},
 '4': {'sum': 69, 'sum of squared': 459, 'count': 14},
 '5': {'sum': 30, 'sum of squared': 272, 'count': 14},
 '6': {'sum': 70, 'sum of squared': 708, 'count': 14},
 '7': {'sum': 23, 'sum of squared': 125, 'count': 14},
 '8': {'sum': 34, 'sum of squared': 312, 'count': 14},
 '9': {'sum': 71, 'sum of squared': 661, 'count': 14},
 'ConfChamp': {'sum': 17, 'sum of squared': 149, 'count':

In [10]:
# print the second one - this will be used to check the reducer function later
lst_mapped[1]

{'1': {'sum': 42, 'sum of squared': 252, 'count': 16},
 '10': {'sum': 51, 'sum of squared': 419, 'count': 14},
 '11': {'sum': 37, 'sum of squared': 265, 'count': 16},
 '12': {'sum': 40, 'sum of squared': 446, 'count': 16},
 '13': {'sum': 58, 'sum of squared': 412, 'count': 16},
 '14': {'sum': 53, 'sum of squared': 397, 'count': 16},
 '15': {'sum': 43, 'sum of squared': 365, 'count': 16},
 '16': {'sum': 78, 'sum of squared': 808, 'count': 16},
 '17': {'sum': 37, 'sum of squared': 405, 'count': 16},
 '2': {'sum': 85, 'sum of squared': 913, 'count': 16},
 '3': {'sum': 36, 'sum of squared': 218, 'count': 14},
 '4': {'sum': 68, 'sum of squared': 750, 'count': 14},
 '5': {'sum': 43, 'sum of squared': 319, 'count': 14},
 '6': {'sum': 39, 'sum of squared': 243, 'count': 14},
 '7': {'sum': 81, 'sum of squared': 793, 'count': 14},
 '8': {'sum': 65, 'sum of squared': 505, 'count': 14},
 '9': {'sum': 46, 'sum of squared': 328, 'count': 14},
 'ConfChamp': {'sum': 0, 'sum of squared': 0, 'count': 2}

### Reducer function
We can now define a reducer function by updating the one given in the lecture note. The major change is to combine the dictionaries which are now nested dictionaries.

In [11]:
# reducer function to combine results - lecture note method
def sum_reduce(dict1, dict2):
    """
    reduce two nested dictionaries by combining them with additions
    dict1, dict2: dictionaries of the form {level: {statistic: value}}
    returns: a new dictionary with the levels of dict1 followed by the levels found only in dict2;
             statistics present on both sides are added, statistics present on one side only
             are carried over unchanged. Neither input is modified.
    """
    # start from a copy of every entry of dict1 so the inputs stay untouched
    combined = {key: stats.copy() for key, stats in dict1.items()}

    for key, stats in dict2.items():
        # entry exists only in dict2: copy it over as it is
        if key not in combined:
            combined[key] = stats.copy()
            continue

        # entry exists in both: add the nested dictionaries statistic by statistic
        target = combined[key]
        for subkey, value in stats.items():
            if subkey in target:
                target[subkey] = target[subkey] + value
            else:
                # statistic reported by dict2 only - keep it rather than dropping it
                target[subkey] = value

    return combined

# test the sum_reduce function
dct1 = sum_reduce(lst_mapped[0], lst_mapped[1])
dct1

{'1': {'sum': 123, 'sum of squared': 947, 'count': 32},
 '10': {'sum': 129, 'sum of squared': 1105, 'count': 28},
 '11': {'sum': 89, 'sum of squared': 715, 'count': 32},
 '12': {'sum': 108, 'sum of squared': 1112, 'count': 32},
 '13': {'sum': 114, 'sum of squared': 946, 'count': 32},
 '14': {'sum': 141, 'sum of squared': 1305, 'count': 32},
 '15': {'sum': 93, 'sum of squared': 753, 'count': 32},
 '16': {'sum': 170, 'sum of squared': 1714, 'count': 32},
 '17': {'sum': 88, 'sum of squared': 824, 'count': 32},
 '2': {'sum': 152, 'sum of squared': 1608, 'count': 32},
 '3': {'sum': 80, 'sum of squared': 532, 'count': 28},
 '4': {'sum': 137, 'sum of squared': 1209, 'count': 28},
 '5': {'sum': 73, 'sum of squared': 591, 'count': 28},
 '6': {'sum': 109, 'sum of squared': 951, 'count': 28},
 '7': {'sum': 104, 'sum of squared': 918, 'count': 28},
 '8': {'sum': 99, 'sum of squared': 817, 'count': 28},
 '9': {'sum': 117, 'sum of squared': 989, 'count': 28},
 'ConfChamp': {'sum': 17, 'sum of square

The reducer function `sum_reduce()` combines the two dictionaries `lst_mapped[0]` and `lst_mapped[1]` correctly.

We can simplify the reducer function by taking advantage of `collections.Counter` objects. With `Counter` objects, the nested dictionaries can be added together with very little code. This makes our code more concise and easier to read and maintain.

In [12]:
# reducer function to combine results - using collections.Counter objects
def sum_reduce2(dict1, dict2):
    """
    reduce two nested dictionaries by combining them with additions
    dict1, dict2: dictionaries of the form {level: {statistic: value}}
    returns: a new dictionary with the union of the levels in sorted order; for every level the
             statistics of both inputs are added. Neither input is modified.
    """
    combined = {}
    for key in sorted(dict1.keys() | dict2.keys()):
        # a level missing from one input yields None, which Counter treats as "nothing to count"
        totals = Counter(dict1.get(key))

        # accumulate with update() rather than `+`: Counter addition discards every entry whose
        # total is zero or negative, which would silently drop e.g. a group with sum == 0
        totals.update(dict2.get(key))

        # recast the result back to a plain dictionary
        combined[key] = dict(totals)

    return combined

# test the new sum_reduce function
dct2 = sum_reduce2(lst_mapped[0], lst_mapped[1])
dct2

{'1': {'sum': 123, 'sum of squared': 947, 'count': 32},
 '10': {'sum': 129, 'sum of squared': 1105, 'count': 28},
 '11': {'sum': 89, 'sum of squared': 715, 'count': 32},
 '12': {'sum': 108, 'sum of squared': 1112, 'count': 32},
 '13': {'sum': 114, 'sum of squared': 946, 'count': 32},
 '14': {'sum': 141, 'sum of squared': 1305, 'count': 32},
 '15': {'sum': 93, 'sum of squared': 753, 'count': 32},
 '16': {'sum': 170, 'sum of squared': 1714, 'count': 32},
 '17': {'sum': 88, 'sum of squared': 824, 'count': 32},
 '2': {'sum': 152, 'sum of squared': 1608, 'count': 32},
 '3': {'sum': 80, 'sum of squared': 532, 'count': 28},
 '4': {'sum': 137, 'sum of squared': 1209, 'count': 28},
 '5': {'sum': 73, 'sum of squared': 591, 'count': 28},
 '6': {'sum': 109, 'sum of squared': 951, 'count': 28},
 '7': {'sum': 104, 'sum of squared': 918, 'count': 28},
 '8': {'sum': 99, 'sum of squared': 817, 'count': 28},
 '9': {'sum': 117, 'sum of squared': 989, 'count': 28},
 'ConfChamp': {'sum': 17, 'sum of square

The combined result `dct2` (using `sum_reduce2()`) matches `dct1` (using `sum_reduce()`).

### Reduce the results
Run the reducer function across all the data sets to get the sum, sum of squared and count of the variable "AQ1" at each level of the grouping variable "week".

In [13]:
# apply the reducer function repetitively over the iterable list of data sets, and returns the result
df_sum = pd.DataFrame.from_dict(functools.reduce(sum_reduce2, lst_mapped), orient='index')
df_sum

Unnamed: 0,sum,sum of squared,count
1,713,5825,208
10,852,7568,186
11,798,7242,201
12,749,6529,205
13,872,8356,208
14,798,7258,208
15,814,7628,208
16,852,7210,208
17,775,7171,208
2,731,6207,207


## Task 3: Last bit
Before doing the aggregation, let's define a helper function to calculate sample standard deviation and a mapreduce function to process a list of data sets.

In [14]:
# helper function to calculate sample standard deviation
def std(mean, sum_of_squared, count):
    """
    calculate sample standard deviation from aggregated statistics
    # mean: mean of sample
    # sum_of_squared: sum of squared values
    # count: count of sample
    # returns: the sample standard deviation, or None when count is not larger than 1
    #          (the sample standard deviation is undefined for a single observation)
    """
    # only return standard deviation when count is larger than 1
    if count > 1:
        # sum of squared deviations from the mean; when all values are (nearly) identical,
        # floating-point rounding can push it marginally below zero, which would turn the
        # square root into NaN - clamp at zero so such a sample gets a deviation of 0
        sum_sq_dev = max(sum_of_squared - count*mean*mean, 0)
        return np.sqrt(1/(count - 1) * sum_sq_dev)

    return None

In [15]:
# MapReduce a list of dataset for mean and standard deviation
def MapReduce(lst_df, group, variable):
    """
    MapReduce a list of data sets for mean and standard deviation of target variable by group
    # lst_df: a list of data frames
    # group: grouping variable (column name)
    # variable: target variable (column name)
    # returns: data frame with one row per level of the grouping variable and the columns
    #          <group>, "sum", "sum of squared", "count", "mean" and "std"
    """
    # map step: aggregate every data set on its own
    mapped = [map_sum(frame, group, variable) for frame in lst_df]

    # reduce step: fold the per-data-set results into one nested dictionary
    totals = functools.reduce(sum_reduce2, mapped)

    # one row per level; move the level out of the index into a column named after the grouping variable
    summary = pd.DataFrame.from_dict(totals, orient = "index").reset_index()
    summary = summary.rename(columns={"index": group})

    # derive mean and standard deviation from the aggregated statistics
    summary["mean"] = summary["sum"] / summary["count"]
    summary["std"] = pd.Series(
        [
            std(level_mean, level_sum_sq, level_count)
            for level_mean, level_sum_sq, level_count
            in zip(summary["mean"], summary["sum of squared"], summary["count"])
        ]
    )

    return summary

### Example 1: Mean and standard deviation of "AQ1" by "week"

In [16]:
df = MapReduce(lst_df, group = "week", variable = "AQ1")
df

Unnamed: 0,week,sum,sum of squared,count,mean,std
0,1,713,5825,208,3.427885,4.041403
1,10,852,7568,186,4.580645,4.45111
2,11,798,7242,201,3.970149,4.513214
3,12,749,6529,205,3.653659,4.311641
4,13,872,8356,208,4.192308,4.765165
5,14,798,7258,208,3.836538,4.502518
6,15,814,7628,208,3.913462,4.63261
7,16,852,7210,208,4.096154,4.239267
8,17,775,7171,208,3.725962,4.548918
9,2,731,6207,207,3.531401,4.195204


### Example 2: Mean and standard deviation of "homeRushYds" by "day"

In [17]:
df = MapReduce(lst_df, group = "day", variable = "homeRushYds")
df

Unnamed: 0,day,sum,sum of squared,count,mean,std
0,Fri,407,59121,3,135.666667,44.185216
1,Mon,25907,3748661,221,117.226244,56.876289
2,Sat,13965,2190935,113,123.584071,64.440134
3,Sun,359476,50941774,3014,119.268746,51.745254
4,Thu,13334,1833812,118,113.0,52.872205
5,Tue,107,11449,1,107.0,
6,Wed,82,6724,1,82.0,


In this example, two rows contain only a single observation (`count = 1`), so their standard deviation `std` is not defined.

### Example 3: Mean and standard deviation of "awayRushYds" by "surface"

In [18]:
df = MapReduce(lst_df, group = "surface", variable = "awayRushYds")
df

Unnamed: 0,surface,sum,sum of squared,count,mean,std
0,a_turf,3473,527231,28,124.035714,59.769608
1,astroplay,11786,1789540,96,122.770833,60.049362
2,astroplay,2159,363291,16,134.9375,69.263236
3,astroturf,11664,1631980,99,117.818182,51.284394
4,dessograss,5124,520248,61,84.0,38.693669
5,fieldturf,73097,9797769,656,111.428354,50.231383
6,fieldturf,37934,4934476,351,108.074074,48.837751
7,grass,135235,18357273,1217,111.121611,52.328526
8,grass,82994,11164840,742,111.851752,50.39407
9,matrixturf,4752,651724,42,113.142857,52.746319


### Example 4: Mean and standard deviation of "HFinal" by "homeTeam"

In [19]:
df = MapReduce(lst_df, group = "homeTeam", variable = "HFinal")
df

Unnamed: 0,homeTeam,sum,sum of squared,count,mean,std
0,Arizona Cardinals,2414,63556,108,22.351852,9.471368
1,Atlanta Falcons,2624,74882,108,24.296296,10.198277
2,Baltimore Ravens,2659,75457,109,24.394495,9.903253
3,Buffalo Bills,2198,56348,104,21.134615,9.800988
4,Carolina Panthers,2181,54223,109,20.009174,9.899023
5,Chicago Bears,2484,67188,110,22.581818,10.088946
6,Cincinnati Bengals,2431,67589,107,22.719626,10.797269
7,Cleveland Browns,1863,45051,104,17.913462,10.648042
8,Dallas Cowboys,2647,74933,107,24.738318,9.442314
9,Denver Broncos,2879,86817,112,25.705357,10.743226
