In [28]:
import pandas as pd
import numpy as np
import calendar

import os
import re
import string

from datetime import datetime, date

In [30]:
from download_data import all_data
file_format = os.path.join("data", "{}_{}_{}.xls")
files = [file_format.format(m, y, t) for m, y, t in all_data if os.path.exists(file_format.format(m, y, t))]

In [40]:
pollen_sheets = [pd.read_excel(file, skiprows=4, skip_footer=1).dropna(axis=(0,1), how="all").fillna(0) for file in files if file.endswith("pollen.xls")]
pollen_sheets[0].head()

Unnamed: 0,DATE,Ash,Ashe Juniper / Bald Cypress,Elm,Pine,Tree Total,Grass Total,Tree & Grass Total,Weed Total,POLLEN TOTAL,Tech.
0,1,0.0,0.0,0.0,0.0,0,0.0,0,0,0,NC
1,2,0.0,6.0,0.0,0.0,6,0.0,6,0,6,GG
2,3,0.0,2.0,0.0,0.0,2,0.0,2,0,2,GG
3,4,0.0,0.0,2.0,0.0,2,0.0,2,0,2,GG
4,5,0.0,0.0,0.0,0.0,0,0.0,0,0,0,Weekend


In [41]:
# Load mold sheets, and transpose because the header is by row
mold_sheets = [pd.read_excel(file, skiprows=5, skip_footer=7).transpose().dropna(axis=(0,1), how="all").fillna(0) for file in files if file.endswith("mold.xls")]

# we have to make the first row the columns because we transposed
def fix_header_and_drop(sheet):
    header = sheet.iloc[0]
    sheet.columns = header
    return sheet.drop(sheet.index[0])
mold_sheets = list(map(fix_header_and_drop, mold_sheets))
mold_sheets[0].head()

DATE,Acrodictys,Agrocybe,Algae,Alternaria,Arthimium,Ascomycetes,Asperisporium,Basidiomycetes,Beltrania,Botrytis,...,Pithomyces,Powdery Mildew,Pseudocercospora,Puccinia,Rust,Spegazinia,Stemphyllium,Tetrapola,Tilletia,Torula
2,0,0,10,24,0,488,0,223,0,0,...,6,0,0,0,0,0,18,0,0,0
3,0,0,0,6,0,136,0,43,0,0,...,4,0,0,0,0,0,4,0,0,0
4,0,0,16,20,0,142,0,89,0,0,...,2,0,0,0,0,2,6,0,0,0
7,0,0,0,0,0,194,0,130,0,0,...,0,0,0,0,0,0,6,0,0,0
8,0,0,4,28,0,701,0,215,0,0,...,4,0,0,0,0,0,24,0,0,0


In [33]:
# Cleaning up and unifying column names
unify_pollen = {
    "Other Tree Pollen": "Other Tree",
    "Other Tree/Unidentified": "Other Tree",
    "Other Weed Pollen": "Other Weed",
    "Other Weed/Unidentified": "Other Weed"
}
from name_conversions import common_to_scientific
def pollen_column_mapper(column_name):
    # This deletes parens and contents
    result = re.sub(r"\(.+\)", "", column_name)
    result = string.capwords(result)
    result = result.strip()
    result = unify_pollen.get(result, result)
    return result

for sheet in pollen_sheets:
    sheet.rename(columns=pollen_column_mapper, inplace=True)

set(np.concatenate([s.columns.values for s in pollen_sheets]))

{'Alnus',
 'Amaranth',
 'Ash',
 'Ashe Juniper / Bald Cypress',
 'Birch',
 'Black Gum',
 'Black Walnut',
 'Burweed / Marshelder',
 'Bushes',
 'Cattail',
 'Cedar',
 'Cotton Wood',
 'Date',
 'Dog Fennel',
 'Dogwood',
 'Elm',
 'Gingko Biloba',
 'Glandular Mesquite',
 'Grass Total',
 'Hackberry',
 'Hickory',
 "Lamb's Quarters",
 'Magnolia',
 'Maple',
 'Mulberry',
 'Oak',
 'Osage Orange',
 'Other Tree',
 'Other Tree/unidentified',
 'Other Weed',
 'Partridge Pea',
 'Pigweed',
 'Pine',
 'Plantago',
 'Plum Grannet',
 'Pollen Total',
 'Privet',
 'Ragweed',
 'Rumex',
 'Sagebrush',
 'Saltbrush',
 'Sedge',
 'Sneezeweed',
 'Sweet Gum',
 'Sycamore',
 'Tech.',
 'Tree & Grass Total',
 'Tree Total',
 'Walnut',
 'Weed Total',
 'Wild Carrot',
 'Willow'}

In [42]:
unify_mold = {
    "Misc. Fungus (Hyaline)": "Hyaline",
    "*D. Conidia/Hyphae": "Dematiaceous"
}
def mold_column_mapper(column_name):
    result = re.sub(r"\(.+\)", "", column_name)
    result = result.title()
    result = result.strip()
    result = unify_mold.get(result, result)
    return result
set(np.concatenate([s.rename(columns=mold_column_mapper).columns.values for s in mold_sheets]))

{'*D. Conidia/Hyphae',
 'Acrodictys',
 'Agrocybe',
 'Algae',
 'Alternaria',
 'Arthimium',
 'Ascomycetes',
 'Asperisporium',
 'Basidiomycetes',
 'Beltrania',
 'Botrytis',
 'Cercospora',
 'Cladosporium',
 'Curvularia',
 'Dendryphiella',
 'Dichotomophthora',
 'Diplococcum',
 'Drechslera/Helmintho.',
 'Epicoccum',
 'Fusariella',
 'Ganoderma',
 'Helicomina',
 'Microsporum',
 'Misc. Fungus',
 'Monodictys',
 'Myxomycete/Smut',
 'Nigrospora',
 'Penicillium/Aspergillus',
 'Periconia',
 'Pestalotiopsis',
 'Pithomyces',
 'Pleospora',
 'Polythrincium',
 'Powdery Mildew',
 'Pseudocercospora',
 'Puccinia',
 'Rust',
 'Spegazinia',
 'Speggazinia',
 'Stemphyllium',
 'Tetrapola',
 'Tilletia',
 'Torula'}

In [35]:
def drop_total_tech_entries(sheet):
    result = sheet.drop(columns=list(sheet.filter(regex="(Total|Tech)")))
    if any(result['Date'].astype(str) == "Total"):
        result = result[sheet['Date'].astype(str) != "Total"]
    if any(result['Date'].astype(str) == "TOTAL"):
        result = result[result['Date'].astype(str) != "TOTAL"]
    return result
pollen_sheets = list(map(drop_total_tech_entries, pollen_sheets))

In [36]:
month_name_to_number = {v.lower(): k for k,v in enumerate(calendar.month_name)}
def convert_dates(metadata, sheet):
    month = month_name_to_number[metadata[0]]
    year = metadata[1]
    num_days = calendar.monthrange(year, month)[1]
    sheet = sheet[sheet['Date'].astype(str).apply(lambda d: d.isdecimal())]
    sheet = sheet[sheet['Date'].astype(int) <= num_days]
    sheet['Date'] = sheet['Date'].apply(lambda day: date(year, month, int(day)))
    return sheet
pollen_sheets = list(map(lambda a: convert_dates(a[0], a[1]), zip([a for a in all_data if a[2] == "pollen"], pollen_sheets)))

In [37]:
[i for i, a in enumerate(pollen_sheets) if any(a['Date'].astype(str) == "sdsfe")]

[]

In [38]:
pollen_sheets[0]

Unnamed: 0,Date,Ash,Ashe Juniper / Bald Cypress,Elm,Pine
0,2013-01-01,,,,
1,2013-01-02,,6.0,,
2,2013-01-03,,2.0,,
3,2013-01-04,,,2.0,
4,2013-01-05,,,,
5,2013-01-06,,,,
6,2013-01-07,,,,
7,2013-01-08,,10.0,2.0,
8,2013-01-09,,,,
9,2013-01-10,,,,
