# Developing Functions

This exercise will lead you through taking some common data processing steps and wrapping them up into a reusable function.

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
from pathlib import Path # handy for working with file paths, consistent across systems (windows, mac, unix)

In [None]:
pd.__version__ # check we're working with the same version of pandas

In [None]:
data_filepath = Path("../data/OxfordStreetAirQuality.csv")

In [None]:
output_folder = Path("../data/Processed")

In [None]:
# import a csv file
df = pd.read_csv(data_filepath)

In [None]:
# check what we have 
df.info()

In [None]:
# we have some rows where we don't have all data
df.count()

In [None]:
# drop every row (axis=0) that has a NaN in any column (how='any');
# inplace=True modifies df directly instead of returning a new frame
df.dropna(how='any', axis=0, inplace=True)

In [None]:
# convert the reading timestamps to pandas datetimes (nanosecond precision)
# NOTE(review): the date format used in the csv is not checked here; if the
# timestamps are day-first, confirm that they are being parsed as intended
df['ReadingDateTime'] = df['ReadingDateTime'].astype('datetime64[ns]')

In [None]:
# use the reading time as the row index; drop=True removes it from the
# columns, and inplace=True modifies df directly rather than returning a copy
df.set_index('ReadingDateTime', inplace=True, drop=True)

In [None]:
# reshape to one row per reading time and one column per species
# (readings sharing a time and species are averaged, pivot_table's default)
pivoted = df.pivot_table(
    values='Value',
    index=['ReadingDateTime'],
    columns=['Species'],
)

In [None]:
pivoted.head()

In [None]:
# resample into weekly bins ('W') and take the mean of every species column;
# label='left' labels each week with the left edge of its bin
weekly_mean = pivoted.resample('W', label='left').mean()

In [None]:
weekly_mean.plot()

In [None]:
# flag the weeks whose mean NOX reading is above 50
weekly_mean['Hazardous'] = weekly_mean['NOX'].gt(50.)

In [None]:
# make the processed data folder (and any missing parent folders);
# exist_ok=True makes this a no-op when the folder is already there,
# so no separate existence check is needed
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# output this to a 'processed files' folder
output_filepath = output_folder / 'WeeklyMeanAQ.csv'
weekly_mean.to_csv(output_filepath)

In [None]:
# Combine the data processing steps above into a reusable function.

def process_csv_file(
    filepath,
    output_folder=Path("../data/processed/"),
    fill_with=np.nan,
):
    """
    Prepare a raw csv file for exploratory data analysis.

    Exercise placeholder: the body is intentionally left empty for you to
    fill in, so until then a call does nothing and returns ``None``. The
    sections below describe the behaviour the finished function should have.

    Parameters
    ----------
    filepath
        Location of the csv file to read.
    output_folder
        Folder in which the processed csv file should be written.
    fill_with
        Replacement to use for zero values.

    Returns
    -------
    output_filepath
        Location of the csv file that was written.

    Notes
    -----
    The finished function is expected to convert data types and replace
    zeros with ``fill_with``.
    """
    # Nothing is implemented yet, so the call is a no-op.
    return None

Copy this function, together with the library imports it needs from above, into a separate file called `processor.py` inside a `process_pipeline` folder, so that the import below works.

Now when we want to use this function, we can import it:

In [None]:
from process_pipeline.processor import process_csv_file

If you want to check back to see what arguments the function takes, you can use the inline help:

In [None]:
help(process_csv_file)

In [None]:
process_csv_file(
    data_filepath,
    output_folder=Path("../data/another_processed_data_folder/"),
    fill_with=" ",
)