In [44]:
import pandas as pd
import numpy as np
import os
from ddf_utils.str import to_concept_id
from ddf_utils.index import create_index_file

### Settings

In [37]:
# Working directories, both located next to the current directory's parent:
#   out_dir - where the generated csv files are written
#   src     - where the raw input files are read from
out_dir, src = (os.path.join(os.pardir, folder) for folder in ("output", "src"))

# Raw input workbook holding the life expectancy figures
lex_file = os.path.join(src, "000000NH.xlsx")

### Helpers

In [40]:
def calculate_average_years(columns):
    """Map year-interval labels to the middle year of each interval.

    Each label is split on "-" and its first two parts are read as the
    inclusive start and end year (e.g. "1998-2002" -> 2000). For an interval
    with an even number of years the later of the two middle years is used
    (e.g. "2000-2001" -> 2001).

    Parameters
    ----------
    columns : iterable of str
        Labels of the form "<start>-<end>".

    Returns
    -------
    list of int
        One middle year per label, in input order.

    Raises
    ------
    ValueError
        If a part is not an integer or the end year precedes the start year.
    IndexError
        If a label contains no "-" separator.
    """
    average_years = []
    for col in columns:
        bounds = col.split("-")
        start, end = int(bounds[0]), int(bounds[1])
        if end < start:
            raise ValueError("Interval end precedes start: %r" % (col,))
        # Floor division keeps the result an int on both Python 2 and 3;
        # this equals the middle element of range(start, end + 1).
        average_years.append(start + (end - start + 1) // 2)

    return average_years

In [41]:
def calculate_average_lex(data):
    """Average the value columns of each consecutive pair of rows.

    The first two columns of ``data`` are skipped; every remaining column is
    treated as a value column. Rows are consumed two at a time (rows 0 and 1,
    rows 2 and 3, ...) and the result holds the mean of each pair.

    Cells equal to the marker ".." are treated as missing. A pair with a
    missing value yields NaN for that column, rather than averaging in a
    zero, which would halve the reported figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows come in consecutive pairs.

    Returns
    -------
    pandas.DataFrame
        One row per pair, indexed 0..n/2-1, with the value columns of ``data``.

    Raises
    ------
    ValueError
        If ``data`` has an odd number of rows, or a value cell holds text
        other than the ".." marker.
    """
    values = data[data.columns[2:]].replace("..", np.nan).astype(float)

    if len(values.index) % 2 != 0:
        raise ValueError(
            "Expected an even number of rows (consecutive pairs), got %d"
            % len(values.index)
        )

    # Work on the raw arrays so the two halves are paired by position,
    # not by index label.
    first_of_pair = values.iloc[0::2].values
    second_of_pair = values.iloc[1::2].values

    return pd.DataFrame((first_of_pair + second_of_pair) / 2.0, columns=values.columns)

In [1]:
def checkDir(directory):
    """Make sure ``directory`` exists, creating it (and parents) if needed.

    A short status message is printed in either case.

    Parameters
    ----------
    directory : str
        Path of the directory to check.

    Returns
    -------
    None
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    if os.path.exists(directory):
        print(directory + " already exists. No further action.")
    else:
        print(directory + " did not exist. Creating it...")
        os.makedirs(directory)
    return

### Extract entities - municipalities

In [38]:
def extract_entities_municipalities(data):
    """Build the municipality entity table from the raw sheet.

    Takes the columns "Unnamed: 0" and "Unnamed: 1" of ``data``, renames them
    to "municipality" and "name", and drops every row with a missing value in
    either column. The original row index of the kept rows is preserved.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing the columns "Unnamed: 0" and "Unnamed: 1".

    Returns
    -------
    pandas.DataFrame
        Columns "municipality" (id passed through ``to_concept_id``), "name",
        "county" (first two characters of the municipality id) and
        "is--municipality" (the constant string "True").
    """
    text_type = type(u"")  # `unicode` on Python 2, `str` on Python 3

    muni = (
        data[["Unnamed: 0", "Unnamed: 1"]]
        .rename(columns={"Unnamed: 0": "municipality", "Unnamed: 1": "name"})
        .dropna()
        .copy()  # own the data so the column assignments below are unambiguous
    )

    # Make the id string alphanumeric
    muni["municipality"] = muni["municipality"].astype(text_type).map(to_concept_id)
    # County is taken from the leading two characters of the municipality id
    muni["county"] = muni["municipality"].map(lambda code: code[:2])
    muni["is--municipality"] = "True"

    return muni

### Extract concepts

In [39]:
def extract_concepts(out_dir):
    """Build the concepts table, merging with an existing concepts file.

    The table has the columns "concept", "name" and "concept_type". The
    concept id is derived from the name via ``to_concept_id``.

    If ``<out_dir>/ddf--concepts.csv`` already exists it is read and the new
    rows are added after the existing ones. For a concept id that appears
    more than once only the first occurrence (the existing row) is kept.
    This function does not write anything to disk.

    Parameters
    ----------
    out_dir : str
        Directory that may contain an existing "ddf--concepts.csv".

    Returns
    -------
    pandas.DataFrame
        The (possibly merged) concepts table.
    """
    concept_file = os.path.join(out_dir, "ddf--concepts.csv")
    text_type = type(u"")  # `unicode` on Python 2, `str` on Python 3

    # (display name, concept type) -- stated together so the type of each
    # concept does not depend on its position in a list.
    concept_specs = [
        ("Name", "string"),
        ("Life expectancy", "measure"),
        ("Year", "time"),
        ("Municipality", "entity_domain"),
        ("County", "entity_domain"),
    ]

    df_con = pd.DataFrame(concept_specs, columns=["name", "concept_type"])
    df_con["concept"] = df_con["name"].astype(text_type).map(to_concept_id)
    df_con = df_con[["concept", "name", "concept_type"]]

    # If not exist, create new
    # Else, add to existing file
    if os.path.isfile(concept_file):
        print(concept_file + " exists. Adding new entries.")
        data = pd.read_csv(concept_file, encoding="utf-8")
        df_con = pd.concat([data, df_con])
        df_con = df_con.drop_duplicates(subset=["concept"])

    return df_con

### Extract datapoints

In [42]:
def extract_datapoints(data):
    """Build the long-format life expectancy datapoints table.

    Steps:
      1. extract the municipality entities from ``data``;
      2. average each consecutive pair of rows (``calculate_average_lex``);
      3. relabel the interval columns with their middle year
         (e.g. 1998-2002 -> 2000);
      4. emit one row per (municipality, year).

    Entities and averaged rows are matched by position: the n-th entity gets
    the n-th averaged row.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; its first two columns identify the municipality and the
        remaining columns are year intervals.

    Returns
    -------
    pandas.DataFrame
        Columns "municipality", "name", "year" (as string) and
        "life_expectancy", sorted by municipality and year. Empty (with these
        columns) when ``data`` has no year-interval columns.
    """
    measure = "life_expectancy"
    out_columns = ["municipality", "name", "year", measure]
    text_type = type(u"")  # `unicode` on Python 2, `str` on Python 3

    # Extract entities; renumber rows 0..n-1 so they line up with the
    # averaged rows, which are numbered the same way.
    entities = extract_entities_municipalities(data).reset_index(drop=True)

    # Calculate average lex for females and males
    lex = calculate_average_lex(data)

    # Change from year interval to average year (e.g. 1998-2002 -> 2000)
    lex.columns = calculate_average_years(data.columns[2:])

    # Collect one frame per year and concatenate once at the end.
    per_year = []
    for position, year in enumerate(lex.columns):
        frame = entities[["municipality", "name"]].copy()
        frame["year"] = str(year)
        # Selected by position, so repeated year labels cannot turn this
        # into a multi-column selection.
        frame[measure] = lex.iloc[:, position]
        per_year.append(frame)

    if not per_year:
        return pd.DataFrame([], columns=out_columns)

    datapoints = pd.concat(per_year)
    datapoints["municipality"] = datapoints["municipality"].astype(text_type).map(to_concept_id)

    return datapoints[out_columns].sort_values(by=["municipality", "year"])

### Main script

In [43]:
if __name__ == "__main__":

    # Read the raw workbook: skip the first two rows and the last 76 rows,
    # keep columns A, B and E..R, and read the first column as text
    # (presumably so the municipality codes are not parsed as numbers).
    # NOTE(review): `parse_cols` and `skip_footer` are the legacy keyword
    # names; newer pandas releases call them `usecols` and `skipfooter`.
    # Confirm against the pandas version this notebook is pinned to.
    data = pd.read_excel(lex_file, skiprows=[0,1], parse_cols="A,B,E:R", skip_footer=76,
                         converters={'Unnamed: 0': str})

    # Check if output dir exists, otherwise create it
    checkDir(out_dir)

    # Extract municipality entities
    muni = extract_entities_municipalities(data)
    path = os.path.join(out_dir, "ddf--entities--municipality.csv")
    print("Printing " + path)
    muni.to_csv(path, index=False, encoding="utf-8")

    # Extract concepts (merged with any concepts file already in out_dir)
    concepts = extract_concepts(out_dir)
    path = os.path.join(out_dir, "ddf--concepts.csv")
    print("Printing " + path)
    concepts.to_csv(path, index=False, encoding="utf-8")

    # Extract datapoints
    datapoints = extract_datapoints(data)
    path = os.path.join(out_dir, "ddf--datapoints--life_expectancy--by--municipality--year.csv")
    print("Printing " + path)
    datapoints.to_csv(path, index=False, encoding="utf-8")

    # Create index file
    print("Creating index files...")
    create_index_file(out_dir)

    # Drop the frames from the notebook namespace once everything is written
    del data, muni, concepts, datapoints

Printing ../output/ddf--entities--municipality.csv
../output/ddf--concepts.csv exists. Adding new entries.
Printing ../output/ddf--concepts.csv
Printing ../output/ddf--datapoints--life_expectancy--by--municipality--year.csv
