In [39]:
from utils.styles import *

# Define the steps your programme needs to take

### Data Loading & Cleaning
1. Open a CSV file for reading
2. Read through each line of the file
3. Split the line based on the comma delimiter
4. Clean the values by casting them to int
### Split-Apply-Combine
5. Split : Grouping the data together based on aggregate keys
6. Apply : Calculate the metrics based on every grouping
7. Combine : Combine the aggregate key and the calculated metrics 
### Write to CSV
8. Format the data as a CSV
9. Write it to a file

In [34]:
from collections import defaultdict
 
    
def data_loading_and_cleaning(path, header=True):
    with open(path) as fn:               # (1) Open the CSV for reading
        lines = []                       # Store each clean line here 
        for line in fn.readlines():      # (2) Iterate through the lines
            if header:                   # Check if we want to skip the header
                header = False           # Previous check will fail next iteration
                continue                 # Continue the loop, effectively skipping next line
            lines.append([int(i) for i in line.strip().split(',')])
                                         # (3,4) Clean the lines and cast to int
    return lines                         # Return cleaned lines
                
    
def split_apply_combine(data, header):
    grouped = defaultdict(list)          # (5) Store each record per grouping
    for r in data:                       # (5) Iterate through the records
        key = (r[0],r[1],r[4])           # (5) Identify the aggregate key
        record = (r[2],r[3])             # (5) Select the Impression & Clicks 
        grouped[key].append(record)      # (5) Append the record to the right group    
    
    c = ""                               # (8) Store the CSV content as a string
    n_cols = len(header)                 # (8) Identify the width of the CSV
    for key, rs in grouped.iteritems():  # (6) Iterate through the groupings
        m = get_metrics(rs)              # (6) Calculate the aggregate metrics 
        c += csv_line(list(key),m,n_cols)      # (7) Combine key & values into a CSV line
    return c                             # (8) Return CSV content


def get_metrics(records):
    """return the summary for a key"""
    clicks = []                         # (6) Store all clicks for this grouping
    imps = []                           # (6) Store all impressions for this grouping

    for record in records:              # (6) Iterate through the records
        clicks.append(record[0])        # (6) Split off the clicks  
        imps.append(record[1])          # (6) Split off the impresssions 

    avg_click = sum(clicks)/len(clicks) # (6) Calculate metrics
    avg_imps = sum(imps) / len(imps)    # (6) Calculate metrics
    max_click = max(clicks)             # (6) Calculate metrics
    max_imps = max(imps)                # (6) Calculate metrics

    return [avg_click, avg_imps, max_click, max_imps]
                                        # (8) Return metrics in a list

def csv_line(k,v,n_cols):                
    t = '{},'*(n_cols-1)+'{}\n'          # (8) Generate CSV line with placeholders
    return t.format(*k + v)              # (8) Insert the key and values into template

    # The above uses some advanced methods.
    # To understand these techniques read:
    
    # * https://pyformat.info/
    # * http://stackoverflow.com/a/17278762
    # * http://python-reference.readthedocs.io/en/latest/docs/operators/tuple_unpack.html
    
    # However, functionally it's the same as:

    key_string = ','.join([str(k) for k in keys])
    metrics_string = ','.join([str(v) for v in values])
    csv_line = key_string + ',' + metrics_string
    return csv_line
    
def write_csv(path, content, header=False):
    with open(path, 'wb') as fn:       # (9) Open the CSV for writing
        if header:                     # (9) Check whether to write a header
            h = ",".join(header)+'\n'  # (9) Combine list into a string
            fn.write(h)                # (9) Write a header if necessary
        fn.write(content)              # (9) Write a CSV content
    
    
    
header = ["age", "gender", "signed_in", "avg_click", "avg_impressions",
          "max_click", "max_impressions"]# (8) Header for our output CSV
        
# String it all together 1-9

data = data_loading_and_cleaning('data/nytimes.csv')
content = split_apply_combine(data, header)
write_csv('nyt_agg.csv',content, header)

In [35]:
!head nyt_agg.csv

age,gender,signed_in,avg_click,avg_impressions,max_click,max_impressions
74,1,1,4,0,14,3
67,1,1,4,0,12,2
60,0,1,4,0,13,2
51,0,1,5,0,14,2
44,1,1,5,0,15,2
37,1,1,5,0,20,2
30,0,1,5,0,14,2
21,0,1,5,0,14,2
14,1,1,5,0,15,3
