# Data Preparation and Exploration
## Download Data
Before running this, download 

http://s3.kiva.org/snapshots/kiva_ds_csv.zip

to the `Downloads` subdirectory of your home directory. Then extract the zip file in that same directory.

This script should work for both Windows and Mac OS, as long as you follow the above instructions.
# Create the SQLite database
The tables below are written to `db/kiva_data.sqlite` (a path relative to the current working directory).

In [3]:
import numpy as np
import pandas as pd
from os import path
from pathlib import Path
import sqlalchemy as sa
from datetime import datetime
from sqlalchemy import create_engine

# SQLite creates the database file on first use, but not its parent
# directory; make sure "db/" exists so the first write does not fail with
# "unable to open database file".
Path('db').mkdir(parents=True, exist_ok=True)

# Engine shared by every to_sql() call below; the database path is relative
# to the current working directory.
engine = create_engine("sqlite:///db/kiva_data.sqlite")

# Root of the extracted Kiva snapshot: <home>/Downloads/kiva_ds_csv
HOME_DIR = str(Path.home())

KIVA_PATH = path.join(HOME_DIR, 'Downloads', 'kiva_ds_csv')

## Load lenders into DB

In [5]:
# Read the lender snapshot and normalise the header names to lower case.
lender_df = pd.read_csv(path.join(KIVA_PATH, 'lenders.csv'))
lender_df.columns = lender_df.columns.str.lower()

# member_since holds Unix epoch seconds (UTC); convert it to naive UTC
# datetimes in one vectorised call. This avoids the per-row
# datetime.utcfromtimestamp() (deprecated since Python 3.12, raises on NaN);
# missing values become NaT. Plain column assignment replaces the column
# outright instead of writing datetimes into the existing integer column.
lender_df['member_since'] = pd.to_datetime(lender_df['member_since'], unit='s')

# Assign numeric ID to lender to reduce space needed for loan_lender table
lender_df = lender_df.assign(lender_id=np.arange(len(lender_df)))

# Keep only the columns that are persisted, with lender_id first.
lender_df = lender_df[['lender_id', 'permanent_name', 'display_name', 'city', 'state', 'country_code', 'member_since']]
lender_df.to_sql('lender', engine, if_exists='replace')

## Load loans into DB

In [6]:
# Read the loans snapshot, then lower-case every header so the columns can
# be addressed by the lower-case names used in the rest of this cell.
loans_csv = path.join(KIVA_PATH, 'loans.csv')
loan_df = pd.read_csv(loans_csv)
loan_df.columns = [column.lower() for column in loan_df.columns]

## Convert gender to be F (female), M (male) or U (unknown)
def map_gender(x):
    """Return a one-letter gender code for a raw ``borrower_genders`` value.

    Non-string values (for example the NaN pandas uses for an empty cell)
    map to 'U'. A string that starts with 'female' maps to 'F'; every other
    string, including the empty string, maps to 'M'. Only the start of the
    string is inspected.
    """
    if not isinstance(x, str):
        return 'U'
    if x.startswith('female'):
        return 'F'
    return 'M'
    
# Derive the one-letter gender code from the raw borrower_genders text.
gender_codes = loan_df['borrower_genders'].map(map_gender)
loan_df = loan_df.assign(gender=gender_codes)

# The number of rows is large, so only the columns that are actually needed
# are written to the database.
loan_columns = ['loan_id', 'loan_name', 'loan_amount', 'gender', 'country_code', 'posted_time']
loan_df = loan_df[loan_columns]
loan_df.to_sql('loan', engine, if_exists='replace')

## Load loan lender links to DB

In [7]:
# Read the loan -> lenders snapshot and lower-case the header names.
loan_lender_df = pd.read_csv(path.join(KIVA_PATH, 'loans_lenders.csv'))
loan_lender_df.columns = [name.lower() for name in loan_lender_df.columns]


def _split_lender_names(names):
    """Split one ', '-separated 'lenders' cell into a list of names."""
    return names.split(', ')


# break lenders into lender_list
loan_lender_df['lender_list'] = loan_lender_df['lenders'].map(_split_lender_names)

# Build a quick lookup from lender name to ID.
# zip() over the two columns yields the same mapping as walking the frame
# with iterrows() (a later duplicate name still overwrites an earlier one)
# without building a Series object for every row.
lender_id_lookup = dict(
    zip(lender_df['permanent_name'], lender_df['lender_id'])
)

# Convert lender names to lender ID's.
# Names that are missing from the lender table get UNKNOWN_LENDER_ID rather
# than 0: lender IDs are assigned from 0 upwards, so a default of 0 would
# silently credit every unmatched name to the first real lender.
UNKNOWN_LENDER_ID = -1
loan_lender_df['lender_id_list'] = loan_lender_df['lender_list'].map(
    lambda lenders: [
        lender_id_lookup.get(lender, UNKNOWN_LENDER_ID) for lender in lenders
    ]
)

# Create a new data frame holding loan_id and lender_id.
# .copy() makes it an independent frame, so adding a column below does not
# write onto a slice of loan_lender_df.
df = loan_lender_df[['loan_id', 'lender_id_list']].copy()

# Create loan_id_list same len as lender_id_list: each loan_id is repeated
# once per lender on that loan. Built with a comprehension because the
# `reduce` keyword of DataFrame.apply no longer exists in current pandas,
# and positional x[0] / x[1] access on a labelled row is deprecated.
df['loan_id_list'] = [
    [loan_id] * len(lender_ids)
    for loan_id, lender_ids in zip(df['loan_id'], df['lender_id_list'])
]

# Ref: https://stackoverflow.com/questions/35004945/python-pandas-reduce-function-for-series
# (per that answer, itertools.chain is far faster than reduce for flattening)
from itertools import chain

# Flatten the per-loan lists into two parallel columns: one row per
# (lender, loan) pair, in the original loan order.
flat_lender_ids = list(chain.from_iterable(df['lender_id_list']))
flat_loan_ids = list(chain.from_iterable(df['loan_id_list']))

combo_df = pd.DataFrame({
    'lender_id': flat_lender_ids,
    'loan_id': flat_loan_ids,
})

# Write to database table
combo_df.to_sql('loan_lender', engine, if_exists='replace')