In [135]:
import pandas as pd
import numpy as np
import calendar

import os
import re

from datetime import datetime

In [30]:
# Read the excel file, excluding the metadata at the beginning, and totals at the end
# Also drop all rows that are NaN
sheet = pd.read_excel("data/september_2013_mold.xls", skiprows=4, skip_footer=1).dropna(axis=0, how='all')
sheet.head()

Unnamed: 0,Acrodictys,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32
0,Agrocybe,,,,,,,,,,...,,,,,,,,,,
1,Algae,,,20.0,8.0,,42.0,,,12.0,...,,36.0,146.0,,,,,16.0,,
2,Alternaria,,,14.0,42.0,28.0,16.0,,,54.0,...,24.0,52.0,170.0,87.0,,,,,,
3,Arthimium,,,,,,,,,,...,,,,,,,,,,
4,Ascomycetes,,,684.0,458.0,275.0,928.0,,,1676.0,...,3645.0,4246.0,2089.0,1792.0,,,,4970.0,,


In [3]:
from download_data import all_data
file_format = os.path.join("data", "{}_{}_{}.xls")
files = [file_format.format(m, y, t) for m, y, t in all_data if os.path.exists(file_format.format(m, y, t))]

In [131]:
pollen_sheets = [pd.read_excel(file, skiprows=4, skip_footer=1).dropna(axis=(0,1), how="all") for file in files if file.endswith("pollen.xls")]
pollen_sheets[0].head()

Unnamed: 0,DATE,Ash,Ashe Juniper / Bald Cypress,Elm,Pine,Tree Total,Grass Total,Tree & Grass Total,Weed Total,POLLEN TOTAL,Tech.
0,1,,,,,0,,0,0,0,NC
1,2,,6.0,,,6,,6,0,6,GG
2,3,,2.0,,,2,,2,0,2,GG
3,4,,,2.0,,2,,2,0,2,GG
4,5,,,,,0,,0,0,0,Weekend


In [136]:
# Load mold sheets, and transpose because the header is by row
mold_sheets = [pd.read_excel(file, skiprows=5, skip_footer=7).transpose().dropna(axis=(0,1), how="all") for file in files if file.endswith("mold.xls")]
def fix_header_and_drop(sheet):
    header = sheet.iloc[0]
    sheet.columns = header
    return sheet.drop(sheet.index[0])
mold_sheets = list(map(fix_header_and_drop, mold_sheets))
mold_sheets[0].head()

DATE,Acrodictys,Agrocybe,Algae,Alternaria,Arthimium,Ascomycetes,Asperisporium,Basidiomycetes,Beltrania,Botrytis,...,Pithomyces,Powdery Mildew,Pseudocercospora,Puccinia,Rust,Spegazinia,Stemphyllium,Tetrapola,Tilletia,Torula
2,,,10.0,24.0,,488,,223,,,...,6.0,,,,,,18,,,
3,,,,6.0,,136,,43,,,...,4.0,,,,,,4,,,
4,,,16.0,20.0,,142,,89,,,...,2.0,,,,,2.0,6,,,
7,,,,,,194,,130,,,...,,,,,,,6,,,
8,,,4.0,28.0,,701,,215,,,...,4.0,,,,,,24,,,


In [146]:
# Cleaning up and unifying column names
specific_pollen = {
    "Other Tree Pollen": "Other Tree",
    "Other Tree/Unidentified": "Other Tree",
    "Other Weed Pollen": "Other Weed",
    "Other Weed/Unidentified": "Other Weed"
}
def pollen_column_mapper(column_name):
    # This deletes parens and contents
    result = re.sub(r"\(.+\)", "", column_name)
    result = result.title()
    result = result.strip()
    try:
        result = specific_pollen[result]
        return result
    except KeyError:
        return result
set(np.concatenate([s.rename(columns=pollen_column_mapper).columns.values for s in pollen_sheets]))

{'Alnus',
 'Amaranth',
 'Ash',
 'Ashe Juniper / Bald Cypress',
 'Birch',
 'Black Gum',
 'Black Walnut',
 'Burweed / Marshelder',
 'Bushes',
 'Cattail',
 'Cedar',
 'Cotton Wood',
 'Date',
 'Dog Fennel',
 'Dogwood',
 'Elm',
 'Gingko Biloba',
 'Glandular Mesquite',
 'Grass Total',
 'Hackberry',
 'Hickory',
 "Lamb'S Quarters",
 'Magnolia',
 'Maple',
 'Mulberry',
 'Oak',
 'Osage Orange',
 'Other Tree',
 'Other Weed',
 'Partridge Pea',
 'Pigweed',
 'Pine',
 'Plantago',
 'Plum Grannet',
 'Pollen Total',
 'Privet',
 'Ragweed',
 'Rumex',
 'Sagebrush',
 'Saltbrush',
 'Sedge',
 'Sneezeweed',
 'Sweet Gum',
 'Sycamore',
 'Tech.',
 'Tree & Grass Total',
 'Tree Total',
 'Walnut',
 'Weed Total',
 'Wild Carrot',
 'Willow'}

In [113]:
def mold_column_mapper(column_name):
    return result
set(np.concatenate([s.rename(columns=mold_column_mapper).columns.values for s in mold_sheets]))

{'*D. conidia/hyphae',
 'Acrodictys',
 'Agrocybe',
 'Algae',
 'Alternaria',
 'Arthimium',
 'Ascomycetes',
 'Asperisporium',
 'Basidiomycetes',
 'Beltrania',
 'Botrytis',
 'Cercospora',
 'Cladosporium',
 'Curvularia',
 'Dendryphiella',
 'Dichotomophthora',
 'Diplococcum',
 'Drechslera/Helmintho.',
 'Epicoccum',
 'Fusariella',
 'Ganoderma',
 'Helicomina',
 'Microsporum',
 'Misc. Fungus (Hyaline)',
 'Monodictys',
 'Myxomycete/Smut',
 'Nigrospora',
 'Penicillium/Aspergillus',
 'Periconia',
 'Pestalotiopsis',
 'Pithomyces',
 'Pleospora',
 'Polythrincium',
 'Powdery Mildew',
 'Powdery Mildew(Oidium/Erysiphe)',
 'Pseudocercospora ',
 'Puccinia',
 'Rust',
 'Spegazinia',
 'Speggazinia',
 'Stemphyllium',
 'Tetrapola',
 'Tilletia',
 'Torula'}