In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Load the raw WFP market price export. pandas infers the zip compression
# from the file extension; the file is decoded as ISO-8859-1 (Latin-1).
raw_prices_path = 'wfp_market_food_prices.zip'
raw_prices_encoding = 'ISO-8859-1'
df = pd.read_csv(raw_prices_path, encoding=raw_prices_encoding)

In [3]:
# Translate the terse source column codes into descriptive names.
readable_column_names = {
    'adm0_id': 'country_id',
    'adm0_name': 'country_name',
    'adm1_id': 'locality_id',
    'adm1_name': 'locality_name',
    'mkt_id': 'market_id',
    'mkt_name': 'market_name',
    'cm_id': 'commodity_id',
    'cm_name': 'commodity_name',
    'cur_id': 'currency_id',
    'cur_name': 'currency_name',
    'pt_id': 'market_type_id',
    'pt_name': 'market_type',
    'um_id': 'measurement_id',
    'um_name': 'unit_of_goods_measurement',
    'mp_month': 'month_recorded',
    'mp_year': 'year_recorded',
    'mp_commoditysource': 'source',
}

# The numeric id columns and the source column are discarded.
useless_columns = [
    'country_id',
    'locality_id',
    'market_id',
    'commodity_id',
    'currency_id',
    'market_type_id',
    'measurement_id',
    'source',
]

df = (
    df
    .rename(columns=readable_column_names)
    .drop(columns=useless_columns)
)

In [4]:
# For a given country, locality, market and commodity, keep only the most
# recent price record.
#
# Recency is established explicitly from the recorded year and month rather
# than by trusting the row order of the input file.
#
# drop_duplicates(keep='last') is used instead of groupby(...).last() because:
#   * GroupBy.last() returns the last *non-null* value of each column
#     separately, so it can combine fields from different records into one row;
#   * groupby() by default silently drops every row with a missing value in a
#     key column (e.g. a market with no locality recorded), whereas
#     drop_duplicates treats missing keys as equal and keeps such rows.
key_columns = ['country_name', 'locality_name', 'market_name', 'commodity_name']
date_columns = ['year_recorded', 'month_recorded']

df = (
    df
    # A record without a commodity name cannot be classified; the later
    # name-cleaning step also requires string values in this column.
    .dropna(subset=['commodity_name'])
    .sort_values(date_columns)
    .drop_duplicates(subset=key_columns, keep='last')
    # Same row order as a grouped result: sorted by the key columns.
    .sort_values(key_columns)
    .reset_index(drop=True)
)

# Key columns first, then the remaining columns in their existing order.
other_columns = [col for col in df.columns if col not in key_columns]
df = df[key_columns + other_columns]

# The recording date has served its purpose once the latest row is selected.
df = df.drop(columns=date_columns)

In [5]:
# Remove parenthesised qualifiers, which merely differentiate between kinds of
# the same food (imported or not, method of cooking, etc.), then trim the
# surrounding whitespace.
#
# The pattern is a raw string so that `\(` and `\)` reach the regex engine
# as written instead of being treated as (invalid) string escapes. It is greedy:
# everything from the first '(' to the last ')' in a name is removed.
# The vectorised .str methods leave missing values as missing instead of raising.
df['commodity_name'] = (
    df['commodity_name']
    .str.replace(r'\(.*\)', '', regex=True)
    .str.strip()
)

In [6]:
# Drop rows whose commodity name exactly matches one of the non-food entries.
non_foods = [
    'Exchange rate',
    'Fuel',
    'Wage',
    'Charcoal',
    'Oil',
]
is_non_food = df['commodity_name'].isin(non_foods)
df = df[~is_non_food]

In [7]:
# Renumber rows: filtering leaves gaps in the index, so replace it with a
# fresh 0..n-1 range. drop=True discards the old index instead of keeping it
# as a column.
df = df.reset_index(drop=True)

In [8]:
# Persist the cleaned table. With the default settings the row index is
# written as the first, unnamed column of the CSV.
processed_data_path = 'processed_data.csv'
df.to_csv(processed_data_path)