<h2>Preliminary analysis</h2>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype

Apply the datetime operations (derive `hour`, `day` and `month` from the `epoch` column) before computing the statistics below.

Stats: Range, median, average, Q1, Q3, histograms, density plot, percent missing data, total size of dataset

(For each year 2000-2017)

In [None]:
def find_xlim(series):
    """Return the upper x-axis limit to use when plotting ``series``.

    For a numeric series the limit is the median plus three standard
    deviations. For any other dtype it is the largest element as determined
    by the built-in ``max``.
    """
    if not is_numeric_dtype(series):
        return max(series)
    centre = series.median()
    spread = series.std()
    return centre + 3 * spread

In [None]:
def get_higho3_days(df, threshold=40):
    """Return every row of ``df`` that falls on a day with a high o3 reading.

    A (month, day) pair counts as a high-o3 day when at least one row for
    that pair has ``o3`` strictly greater than ``threshold``. All rows
    belonging to such a day are returned, not only the readings that
    exceeded the threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'o3'``, ``'day'`` and ``'month'``.
    threshold : float, optional
        Value a reading has to exceed for its day to qualify. Defaults to
        40, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df`` with their original index labels and
        columns. Empty when no reading exceeds ``threshold``.

    Notes
    -----
    Days are identified by (month, day) only, so ``df`` is assumed to cover
    a single year -- TODO confirm before passing multi-year data.
    """
    # One integer per calendar day (7 March -> 307); unambiguous because the
    # day of month never reaches 100.
    day_key = df['month'] * 100 + df['day']
    high_keys = day_key[df['o3'] > threshold].unique()
    # Index with a plain boolean array so the selection is positional and
    # does not depend on the frame's index labels being unique.
    return df[day_key.isin(high_keys).to_numpy()]

In [None]:
def analyze(df_dict, year, output_path):
    """Write descriptive statistics and plots for every frame in ``df_dict``.

    For each ``name -> DataFrame`` entry the following files are written,
    all named ``output_path + year + '-' + name + '-' + <suffix>``:

    * ``descr_stats.csv``      -- ``DataFrame.describe(include='all')``
    * ``hist.png``             -- histogram grid of the frame
    * ``percent_missing.csv``  -- percentage of missing values per column
    * ``<column>-density.png`` -- normalised histogram with a KDE overlay,
      one file per numeric column that is not entirely missing

    In addition one line with the row count is appended to the file
    ``output_path + name``.

    Parameters
    ----------
    df_dict : dict
        Maps a subset name (used in the file names) to a DataFrame.
    year : int or str
        Label placed in the file names; converted with ``str``.
    output_path : str
        Prefix that is concatenated directly in front of every file name,
        so a directory has to end with a path separator.
    """
    year = str(year)
    for df_name in df_dict:
        df = df_dict[df_name]
        prefix = output_path + year + '-' + df_name + '-'

        #documentation here: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html
        df.describe(include='all').to_csv(prefix + 'descr_stats.csv')

        #default is 10 bins for each plot
        df.hist(figsize=(72, 72))
        hist_fig = plt.gcf()
        hist_fig.savefig(prefix + "hist.png")
        # Release the large figure; otherwise every subset of every year
        # stays alive inside pyplot.
        plt.close(hist_fig)

        #To get amount of missing values in each column (do for each df)
        percent_missing = df.isna().mean().round(4) * 100
        percent_missing.to_csv(prefix + "percent_missing.csv")

        #size of dataset
        # Opened in append mode, so each entry ends with a newline to keep
        # the lines written for different years apart.
        with open(output_path + df_name, "a") as file:
            file.write("Number of rows in dataset " + df_name + "-" + year + ":" + str(len(df)) + "\n")

        numeric = df.dropna(axis=1, how='all').select_dtypes(['number'])
        for col in numeric:
            fig, ax = plt.subplots()
            try:
                numeric[col].plot.kde(ax=ax, legend=False, title=col)
            except Exception as err:
                # The KDE overlay is best effort: the histogram is still
                # saved when the estimate cannot be computed.
                print("KDE skipped for {}: {}".format(col, err))
            numeric[col].plot.hist(density=True, ax=ax)
            ax.set_title(str(col))
            upper = find_xlim(numeric[col])
            # A NaN or non-positive limit would make set_xlim raise or
            # produce a degenerate axis; keep matplotlib's autoscaling then.
            if np.isfinite(upper) and upper > 0:
                ax.set_xlim(left=0, right=upper)
            fig.savefig(prefix + str(col) + "-density.png")
            plt.close(fig)

<h2>Main</h2>

In [None]:
# NOTE(review): this decorator needs `import plac`, which is not part of the
# visible import cell -- confirm it is imported before this cell runs.
@plac.annotations(
    input_path=("Path containing the data files to ingest", "option", "p", str),
    input_prefix=("{$prefix}year.csv", "option", "P", str),
    input_suffix=("year{$suffix}.csv", "option", "S", str),
    output_path=("Path to write the resulting numpy sequences / transform cache", "option", "o", str),
    year_begin=("First year to process", "option", "b", int),
    year_end=("Year to stop before (exclusive)", "option", "e", int),
    aqsnumerical=("Convert AQS code to numerical", "flag", "A"),
    houston=("Only run for Houston sites", "flag", "H"),
    chunksize=("Process this many records at one time", "option", 'C', int)
)
def main(input_path: str = '/project/lindner/air-pollution/level3_data/',
         input_prefix: str = "Data_",
         input_suffix: str = "",
         output_path: str = '/project/lindner/air-pollution/current/2019/descriptive-output/',
         year_begin: int = 2000,
         year_end: int = 2018,
         aqsnumerical: bool = False,
         houston: bool = False,
         chunksize: int = 200000):
    """Run the descriptive analysis for every year in ``[year_begin, year_end)``.

    For each year the file
    ``input_path + input_prefix + <year> + input_suffix + '.csv'`` is read,
    ``hour``, ``day`` and ``month`` columns are derived from its ``epoch``
    column (seconds), and ``analyze`` is called on the full frame plus four
    subsets: daytime, nighttime, high-pollution months (April to October)
    and high-o3 days.

    Parameters
    ----------
    input_path, input_prefix, input_suffix : str
        Pieces concatenated around the year to build the input file name.
    output_path : str
        Prefix passed on to ``analyze`` for all output files.
    year_begin : int
        First year to process.
    year_end : int
        Year to stop before; it is not processed itself.
    aqsnumerical, houston, chunksize
        Accepted for the command-line interface but not used by this
        function.
    """
    # Years are handled one at a time so that only a single year's data is
    # held in memory.
    for year in range(year_begin, year_end):
        df = pd.read_csv(input_path + input_prefix + str(year) + input_suffix + ".csv")

        #Time series
        # NOTE(review): the epoch is converted without any timezone handling;
        # if the day/night split is meant in local time, convert first --
        # confirm.
        timestamps = pd.to_datetime(df['epoch'], unit='s')
        df['hour'] = timestamps.dt.hour
        df['day'] = timestamps.dt.day
        df['month'] = timestamps.dt.month

        daytime = df[(df['hour'] > 6) & (df['hour'] < 20)]
        # Exact complement of the daytime window, so hour 20 is no longer
        # missing from both subsets.
        nighttime = df[(df['hour'] < 7) | (df['hour'] >= 20)]
        highpol_months = df[(df['month'] > 3) & (df['month'] < 11)]
        higho3_days = get_higho3_days(df)

        subsets = {"df": df, "daytime": daytime, "nighttime": nighttime, "highpol_months": highpol_months, "higho3_days": higho3_days}
        # Label the output with the calendar year rather than the loop position.
        analyze(subsets, year, output_path)

<h2>Site-wise analysis</h2>

To get site-wise analysis, run the same code on each generated site data file (each will contain all years). 

In [89]:
def transform(df: pd.DataFrame, year: int, fillgps: bool = False, naninvalid: bool = False, dropnan: bool = False, masknan: float = None, fillnan: float = None, aqsnumerical: bool = False, sites = []) -> pd.DataFrame:
    """Restrict ``df`` to the requested sites and return it.

    When ``sites`` is non-empty, every row whose ``'AQS_Code'`` is not in
    ``sites`` is dropped from ``df`` in place. With an empty ``sites`` the
    frame is left untouched. The same ``df`` object is returned in both
    cases.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; needs an ``'AQS_Code'`` column when ``sites`` is
        given. It is modified in place.
    sites : list-like, optional
        AQS codes to keep. The default list is only read, never mutated.
    year, fillgps, naninvalid, dropnan, masknan, fillnan, aqsnumerical
        Accepted but not used by the code in this function.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself.
    """
    if len(sites) > 0:
        #drop all sites other than the one requested
        # The drop is label based, so this relies on df having unique index
        # labels.
        df.drop(df[~df['AQS_Code'].isin(sites)].index, inplace=True)
    return df

NameError: name 'dfdfd' is not defined