# Merge Data
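This notebook reads every cleaned CSV in the intermediate-data folder (`DATA_DIR`), merges them on `iso` and `year`, and writes two files: `merged_data.csv` (the merged table) and `variables.csv` (a catalogue of the merged variables). To run it on your own machine, update the path variables (`DATA_DIR` and `ADB_MEMBERS_DIR`) to match your local folder layout.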

In [31]:
import pandas as pd
import os
import pycountry

### Check required columns

In [32]:
# Path to the ADB membership table. Note: this is a CSV *file*, despite the
# "_DIR" suffix in the name.
ADB_MEMBERS_DIR = 'data/adb-members.csv'
# Identifier/metadata columns that every intermediate CSV is expected to have.
GENERIC_COLS = ['iso', 'year', 'data_source', 'country_name']
# Folder holding the cleaned, per-source CSV files that get merged below.
DATA_DIR = 'data/cleaned-datasets/inter'

# For each CSV in DATA_DIR, print whether each required column is present.
for csv in [x for x in os.listdir(DATA_DIR) if x.endswith('.csv')]:
    csv_path = os.path.join(DATA_DIR, csv)
    # nrows=0 parses only the header row; the column names are all this check
    # needs, so there is no reason to load the full file.
    header_df = pd.read_csv(csv_path, nrows=0)
    for col in GENERIC_COLS:
        print(csv, col + ':', col in header_df.columns)

wwgi_clean.csv iso: True
wwgi_clean.csv year: True
wwgi_clean.csv data_source: True
wwgi_clean.csv country_name: True
World Bank Climate Knowledge Portal.csv iso: True
World Bank Climate Knowledge Portal.csv year: True
World Bank Climate Knowledge Portal.csv data_source: True
World Bank Climate Knowledge Portal.csv country_name: True
IDMC_Conflict and Disaster Total.csv iso: True
IDMC_Conflict and Disaster Total.csv year: True
IDMC_Conflict and Disaster Total.csv data_source: True
IDMC_Conflict and Disaster Total.csv country_name: True
UN HDI, Environment Pillar.csv iso: True
UN HDI, Environment Pillar.csv year: True
UN HDI, Environment Pillar.csv data_source: True
UN HDI, Environment Pillar.csv country_name: True
epi.csv iso: True
epi.csv year: True
epi.csv data_source: True
epi.csv country_name: True
UNDESA_clean.csv iso: True
UNDESA_clean.csv year: True
UNDESA_clean.csv data_source: True
UNDESA_clean.csv country_name: True
V-Dem.csv iso: True
V-Dem.csv year: True
V-Dem.csv data_sour

### Get variables

In [33]:
def get_variables(data_dir=None, generic_cols=None):
    """Catalogue every non-generic column found in the intermediate CSV files.

    Each ``.csv`` file in ``data_dir`` is read, and one record is produced per
    column that is not listed in ``generic_cols``.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Folder containing the CSV files. Defaults to the notebook-level
        ``DATA_DIR``.
    generic_cols : iterable of str, optional
        Column names to skip (identifier/metadata columns). Defaults to the
        notebook-level ``GENERIC_COLS``.

    Returns
    -------
    pandas.DataFrame
        Columns ``file_name``, ``data_source``, ``variable``, ``min_year`` and
        ``max_year``. ``min_year``/``max_year`` are the smallest/largest
        ``year`` among rows where the variable is not missing (NaN when the
        variable has no values). ``data_source`` is taken from the first row of
        the file and is ``None`` for a file with no data rows.

    Raises
    ------
    KeyError
        If a non-empty file lacks a ``data_source`` column, or a file with
        variable columns lacks a ``year`` column.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    if generic_cols is None:
        generic_cols = GENERIC_COLS
    skip_cols = frozenset(generic_cols)

    variables = {
        'file_name': [],
        'data_source': [],
        'variable': [],
        'min_year': [],
        'max_year': [],
    }
    # os.listdir returns names in arbitrary order; sorting keeps the catalogue
    # identical across machines and runs.
    csv_names = sorted(x for x in os.listdir(data_dir) if x.endswith('.csv'))
    for csv_name in csv_names:
        source_df = pd.read_csv(os.path.join(data_dir, csv_name))
        # A header-only file has no row 0 to read the source label from.
        if len(source_df):
            data_source = source_df['data_source'].iloc[0]
        else:
            data_source = None
        for col in source_df.columns:
            if col in skip_cols:
                continue
            # Years for which this variable actually has a value; computed
            # once and reused for both bounds.
            years = source_df.loc[source_df[col].notna(), 'year']
            variables['file_name'].append(csv_name)
            variables['data_source'].append(data_source)
            variables['variable'].append(col)
            variables['min_year'].append(years.min())
            variables['max_year'].append(years.max())

    return pd.DataFrame(variables)

### Merge data

In [34]:
def get_country_name(iso):
    """Return the pycountry name for an ISO 3166-1 alpha-3 code.

    Parameters
    ----------
    iso : str
        Three-letter country code, passed to pycountry as ``alpha_3``.

    Returns
    -------
    str or None
        The country's ``name`` attribute, or ``None`` when pycountry has no
        entry for ``iso``.
    """
    try:
        country = pycountry.countries.get(alpha_3=iso)
    except (AttributeError, LookupError):
        # Treat a failed lookup as "no name" instead of aborting the whole
        # .apply() over the iso column. LookupError also covers KeyError.
        # NOTE(review): which pycountry versions raise here rather than
        # returning None has not been verified - confirm against the
        # installed release.
        return None
    # A missing entry comes back as None; map that to None explicitly instead
    # of relying on an AttributeError from `None.name`.
    return getattr(country, 'name', None)

In [35]:
def merge_data(data_dir=None, adb_members_path=None, min_year=2010):
    """Merge all intermediate CSV files into one country-year table.

    Every ``.csv`` file in ``data_dir`` is read, its ``country_name`` and
    ``data_source`` columns are dropped, and the files are outer-merged on
    ``iso`` and ``year``. A fresh ``country_name`` column is then derived from
    ``iso`` via ``get_country_name`` and the result is restricted to regional
    ADB members and to years ``>= min_year``.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Folder containing the CSV files. Defaults to the notebook-level
        ``DATA_DIR``.
    adb_members_path : str or path-like, optional
        CSV with at least ``iso`` and ``membership_type`` columns. Defaults to
        the notebook-level ``ADB_MEMBERS_DIR``.
    min_year : int, optional
        Earliest year to keep (inclusive). Defaults to 2010.

    Returns
    -------
    pandas.DataFrame
        Merged table with ``iso``, ``country_name``, an integer ``year`` and
        one column per variable, indexed 0..n-1.

    Raises
    ------
    FileNotFoundError
        If ``data_dir`` contains no ``.csv`` files.
    KeyError
        If a file lacks the ``country_name``/``data_source`` columns or the
        ``iso``/``year`` merge keys.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    if adb_members_path is None:
        adb_members_path = ADB_MEMBERS_DIR

    # os.listdir returns names in arbitrary order; sorting makes the merge
    # order (and therefore the output column order) reproducible.
    csv_names = sorted(x for x in os.listdir(data_dir) if x.endswith('.csv'))
    if not csv_names:
        raise FileNotFoundError('No .csv files found in {!r}'.format(data_dir))

    merged = None
    for csv_name in csv_names:
        frame = pd.read_csv(os.path.join(data_dir, csv_name)).drop(
            columns=['country_name', 'data_source'])
        if merged is None:
            merged = frame
        else:
            # NOTE(review): if two files share a variable name, pandas will
            # suffix the duplicates with _x/_y - confirm names are unique.
            merged = merged.merge(frame, how='outer', on=['iso', 'year'])

    # Keyword `axis` is required: pandas 2.0 made dropna's arguments
    # keyword-only, so the former positional `dropna(0, ...)` raises TypeError.
    merged = merged.dropna(axis=0, subset=['iso', 'year'], how='any')
    merged.insert(1, 'country_name', merged['iso'].apply(get_country_name))

    adb = pd.read_csv(adb_members_path)
    regional_isos = adb.loc[adb['membership_type'] == 'Regional', 'iso']
    keep = merged['iso'].isin(regional_isos) & (merged['year'] >= min_year)
    # .copy() so the dtype assignment below writes to an independent frame
    # rather than a filtered view (avoids SettingWithCopyWarning).
    merged = merged.loc[keep].copy()
    merged['year'] = merged['year'].astype(int)

    return merged.reset_index(drop=True)

In [36]:
# Build the merged country-year table from the CSV files in DATA_DIR.
df = merge_data()
# Build the per-variable catalogue (file, data source, min/max year).
variables = get_variables()

In [37]:
# Write both result tables to disk; index=False keeps the pandas row index
# out of the CSV files.
for _out_path, _frame in (
    ('data/merged_data.csv', df),
    ('data/variables.csv', variables),
):
    _frame.to_csv(_out_path, index=False)