In [7]:
import pandas as pd
import numpy as np
import os
from ddf_utils.str import to_concept_id
from ddf_utils.index import create_index_file

In [8]:
# Output and source locations (relative paths).
# Unicode literals are used instead of the ``unicode()`` builtin: the value is
# the same on Python 2, and the literal form also runs on Python 3.3+.
out_dir = u"../output"
src = u"../src"
pathsep = os.path.sep

# Raw data
pop_file = os.path.join(src, "BE0101N1.xlsx")  # Population

### Extract entities - municipalities

In [9]:
def extract_entities_municipalities(data):
    """Build the municipality entity table from the raw sheet.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns ``"Unnamed: 0"`` (municipality code) and
        ``"Unnamed: 1"`` (municipality name).

    Returns
    -------
    pandas.DataFrame
        Columns ``municipality``, ``name``, ``county``, ``is--municipality``.
        Rows with a missing code or name are dropped; the index of the
        remaining rows is the one they had in ``data``.
    """
    entities = (
        data[["Unnamed: 0", "Unnamed: 1"]]
        .rename(columns={"Unnamed: 0": "municipality", "Unnamed: 1": "name"})
        .dropna()
    )

    # Normalise the code once and reuse it for both derived columns.
    ids = entities["municipality"].map(to_concept_id)

    return entities.assign(
        municipality=ids,
        county=ids.map(lambda code: code[:2]),  # first two characters of the id
        **{"is--municipality": "True"}
    )

### Extract concepts

In [10]:
def extract_concepts(out_dir):
    """Build the concepts table, merging with an existing concepts file.

    Parameters
    ----------
    out_dir : str
        Directory in which ``ddf--concepts.csv`` is looked up.

    Returns
    -------
    pandas.DataFrame
        For a fresh table, the columns are ``concept``, ``name`` and
        ``concept_type``. If ``ddf--concepts.csv`` already exists in
        ``out_dir``, its rows come first and the concepts defined here are
        appended; on a duplicate ``concept`` the row from the file is kept.

    Notes
    -----
    This function only reads the existing file; writing is left to the caller.
    """
    concept_file = os.path.join(out_dir, "ddf--concepts.csv")

    # (name, concept_type) pairs. Building the table from complete rows avoids
    # chained assignment (``df[col].iloc[i] = value``), which is not guaranteed
    # to write through to the frame and is a no-op under pandas copy-on-write.
    concept_types = [
        ("Name", "string"),
        ("Population", "measure"),
        ("Year", "time"),
        ("Municipality", "entity_domain"),
        ("County", "entity_domain"),
    ]
    df_con = pd.DataFrame(concept_types, columns=["name", "concept_type"])
    df_con["concept"] = df_con["name"].map(to_concept_id)
    df_con = df_con[["concept", "name", "concept_type"]]

    # If not exist, create new
    # Else, add to existing file
    if os.path.isfile(concept_file):
        print(concept_file + " exists. Adding new entries.")
        data = pd.read_csv(concept_file, encoding="utf-8")
        df_con = pd.concat([data, df_con])
        # Existing rows come first, so they win over the ones defined here.
        df_con = df_con.drop_duplicates(subset=["concept"])

    return df_con

### Extract datapoints

In [11]:
def extract_datapoints(data):
    """Reshape the wide population sheet into one row per municipality/year.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw sheet: the first two columns identify the municipality (see
        ``extract_entities_municipalities``); every following column holds
        the population for one year, the column label being the year.

    Returns
    -------
    pandas.DataFrame
        Columns ``municipality``, ``name``, ``year`` (as a string) and
        ``population``, sorted by ``municipality`` then ``year``. If ``data``
        has no year columns, an empty frame with those columns is returned.
    """
    measure = "population"
    columns = ["municipality", "name", "year", measure]

    # Extract entities
    entities = extract_entities_municipalities(data)
    keys = entities[["municipality", "name"]]

    # Extract measures: everything after the two entity columns
    pop = data[data.columns[2:]]

    # Collect one frame per year and concatenate once, rather than growing a
    # DataFrame inside the loop (quadratic copying).
    frames = []
    for year in pop.columns:
        frame = keys.copy()
        frame["year"] = str(year)
        # Assignment aligns on the index, so rows dropped from the entity
        # table stay dropped here.
        frame[measure] = pop[year]
        frames.append(frame)

    if frames:
        datapoints = pd.concat(frames)
    else:
        # pd.concat raises on an empty list; keep the empty-but-typed result.
        datapoints = pd.DataFrame([], columns=columns)

    # NOTE(review): the ids were already passed through to_concept_id in
    # extract_entities_municipalities; this second pass is kept to preserve
    # the output -- presumably a no-op, confirm before removing.
    datapoints["municipality"] = datapoints["municipality"].map(to_concept_id)

    return datapoints.sort_values(by=["municipality", "year"])

### Main script

In [12]:
if __name__ == "__main__":

    # Load the sheet: skip the two title rows and the 50 footer rows, read
    # columns A..AX, and force the municipality code column to text.
    # NOTE(review): parse_cols / skip_footer look like the older pandas
    # spellings of usecols / skipfooter -- confirm the pandas version in use
    # before renaming them.
    data = pd.read_excel(
        pop_file,
        skiprows=[0, 1],
        parse_cols="A:AX",
        skip_footer=50,
        converters={'Unnamed: 0': str},
    )

    # Entities
    path = os.path.join(out_dir, "ddf--entities--municipality.csv")
    muni = extract_entities_municipalities(data)
    print("Printing " + path)
    muni.to_csv(path, index=False, encoding="utf-8")

    # Concepts (extract_concepts reads any existing file before it is rewritten)
    path = os.path.join(out_dir, "ddf--concepts.csv")
    concepts = extract_concepts(out_dir)
    print("Printing " + path)
    concepts.to_csv(path, index=False, encoding="utf-8")

    # Datapoints
    path = os.path.join(out_dir, "ddf--datapoints--population--by--municipality--year.csv")
    datapoints = extract_datapoints(data)
    print("Printing " + path)
    datapoints.to_csv(path, index=False, encoding="utf-8")

    # Release the frames so they do not linger in the kernel namespace.
    del data, muni, concepts, datapoints

Printing ../output/ddf--entities--municipality.csv
../output/ddf--concepts.csv exists. Adding new entries.
Printing ../output/ddf--concepts.csv
Printing ../output/ddf--datapoints--population--by--municipality--year.csv
