# Database Load

In [1]:
from flask import Flask, jsonify
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy.dialects.postgresql import JSON
from flask import render_template
import csv
import json
import glob
import psycopg2

app = Flask(__name__)

app.config['DEBUG'] = True
DB_URL = 'postgresql+psycopg2://{user}:{pw}@{url}/{db}'.format(user='postgres',pw='********',url='localhost',db='country_test')
'postgresql+psycopg2://postgres:Virginis1212@localhost/country_test'
app.config['SQLALCHEMY_DATABASE_URI'] = DB_URL
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False

db = SQLAlchemy(app)

In [2]:
class BaseModel(db.Model):
    """Base data model for all objects"""
    __abstract__ = True

    def __repr__(self):
        """Define a base way to print models"""
        return '%s(%s)' % (self.__class__.__name__, {
            column: value
            for column, value in self._to_dict().items()
        })


class Countries(BaseModel, db.Model):
    """Model for the countries table"""
    __tablename__ = 'countries'

    id = db.Column(db.Integer, primary_key = True)
    iso_a3 = db.Column(db.String())
    country_name = db.Column(db.String())
    indicator_code = db.Column(db.String())
    year = db.Column(db.Integer)
    value = db.Column(db.String())
    
    def __init__(self, id, iso_a3, country_name, indicator_code, year, value):
        self.id = id
        self.iso_a3 = iso_a3
        self.country_name = country_name
        self.indicator_code = indicator_code
        self.year = year
        self.value = value

In [3]:
db.drop_all()
db.create_all()

In [2]:
# Get all .csv files in ../Output_Data directory
csvlist = [f for f in glob.glob("../Output_data/*.csv")]
csvlist

['../Output_data\\AccessToElectricity.csv',
 '../Output_data\\Births attended by skilled health staff.csv',
 '../Output_data\\Children out of school, primary, female.csv',
 '../Output_data\\corruption_final_version.csv',
 '../Output_data\\country_codes.csv',
 '../Output_data\\Crude birth rate.csv',
 '../Output_data\\Crude Death rate.csv',
 '../Output_data\\Current health expenditure (% of GDP).csv',
 '../Output_data\\Current health expenditure per capita, PPP (current international $).csv',
 '../Output_data\\econ_freedom.csv',
 '../Output_data\\Fertility rate, total (births per woman).csv',
 '../Output_data\\Fixed broadband subscriptions (per 100 people).csv',
 '../Output_data\\healthcare_coverage.csv',
 '../Output_data\\Hospital beds (per 1,000 people).csv',
 '../Output_data\\human-rights-scores.csv',
 '../Output_data\\InfantMortalityRate.csv',
 '../Output_data\\LifeExpectancy.csv',
 '../Output_data\\LifeExpectancyMale.csv',
 '../Output_data\\LiteracyAdultFemale.csv',
 '../Output_data

In [3]:
alldata = []
for csvfile in csvlist:
    with open(csvfile, encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        newdata = [row for row in reader]
        alldata.append(newdata)
len(alldata)

24

In [4]:
alldata[2][0][0]

'Country Name'

In [5]:
def get_grid_type(grid):
    if 'Year' in grid[0]:
        return 'row_by_year'
    else:
        for year_to_check in range(1960, 2999):
            if str(year_to_check) in grid[0]:
                return 'col_by_year'
    return 'undetermined'

In [6]:
def find_key_cols_for_by_year(grid):
    started_flag = False
    country_name_ix = -1
    country_code_ix = -1
    indicator_code_ix = -1
    first_year_ix = -1
    last_year_ix = -1
    num_cols = len(grid[0])
    for col_ix in range(num_cols):
        if grid[0][col_ix] == 'Country_Name':
            country_name_ix = col_ix
        elif grid[0][col_ix] == 'Country Name':
            country_name_ix = col_ix
        elif grid[0][col_ix] == 'Country_Code':
            country_code_ix = col_ix
        elif grid[0][col_ix] == 'Country Code':
            country_code_ix = col_ix
        elif grid[0][col_ix] == 'Indicator_Code':
            indicator_code_ix = col_ix
        elif grid[0][col_ix] == 'Indicator Code':
            indicator_code_ix = col_ix
        else:
            try:
                year_col = int(grid[0][col_ix])
            except ValueError:
                pass
            else:
                if (year_col > 1960) and (year_col < 2999):
                    if started_flag:
                        last_year_ix = col_ix
                    else:
                        first_year_ix = col_ix
                        last_year_ix = col_ix
                        started_flag = True
    return country_name_ix, country_code_ix, indicator_code_ix, first_year_ix, last_year_ix

In [7]:
def find_key_cols_for_by_row(grid):
    country_name_ix = -1
    country_code_ix = -1
    indicator_code_ix = -1
    year_ix = -1
    num_cols = len(grid[0])
    for col_ix in range(num_cols):
        if grid[0][col_ix] == 'Country_Name':
            country_name_ix = col_ix
        elif grid[0][col_ix] == 'Country Name':
            country_name_ix = col_ix
        elif grid[0][col_ix] == 'Country_Code':
            country_code_ix = col_ix
        elif grid[0][col_ix] == 'Country Code':
            country_code_ix = col_ix
        elif grid[0][col_ix] == 'Indicator_Code':
            indicator_code_ix = col_ix
        elif grid[0][col_ix] == 'Indicator Code':
            indicator_code_ix = col_ix
        elif grid[0][col_ix] == 'Year':
            year_ix = col_ix
    return country_name_ix, country_code_ix, indicator_code_ix, year_ix

In [21]:
def process_by_year(grid, global_id):
    added_count = 0
    data_to_add = set()
    cname_col, ccode_col, icode_col, firstyr_col, lastyr_col = find_key_cols_for_by_year(grid)
    if (cname_col < 0) or (ccode_col < 0) or (icode_col < 0) or (firstyr_col < 0) or (lastyr_col < 0):
        print('Unable to parse csv data')
        return set()
    at_header = True
    for row in grid:
        if at_header:
            at_header = False
            continue
        else:
            ccode = row[ccode_col]
            cname = row[cname_col]
            icode = row[icode_col]
            for year_ix in range(firstyr_col, lastyr_col + 1):
                iyear = int(grid[0][year_ix])
                ivalue = row[year_ix]
                if ivalue:
                    data_to_add.add(icode)
#                    db.session.add(data_to_add)
#                    db.session.commit
                    global_id += 1
                    added_count += 1
    return data_to_add

In [22]:
def process_by_row(grid, global_id):
    added_count = 0
    data_to_add = set()
    cname_col, ccode_col, icode_col, yr_col = find_key_cols_for_by_row(grid)
    if (cname_col < 0) or (ccode_col < 0) or (icode_col < 0) or (yr_col < 0):
        print('Unable to parse csv data')
        return set()
    at_header = True
    for row in grid:
        if at_header:
            at_header = False
            continue
        else:
            ccode = row[ccode_col]
            cname = row[cname_col]
            icode = row[icode_col]
            iyear = int(row[yr_col])
            ivalue = row[yr_col + 1]
            if ivalue:
                data_to_add.add(icode)
#                db.session.add(data_to_add)
#               db.session.commit
                global_id += 1
                added_count += 1
    return data_to_add

In [23]:
global_id = 0
icodes = set()
num_added = 0
for two_d_grid in alldata:
    grid_type = get_grid_type(two_d_grid)
    if grid_type == 'col_by_year':
        dataset = process_by_year(two_d_grid, global_id)
        icodes = icodes|dataset
        print('Grid with multiple years processed: added ' + str(num_added))
        global_id += num_added
    elif grid_type == 'row_by_year':
        dataset = process_by_row(two_d_grid, global_id)
        icodes = icodes|dataset
        print('Grid with one year per row processed: added ' + str(num_added))
        global_id += num_added

Grid with multiple years processed: added 0
Grid with multiple years processed: added 0
Grid with multiple years processed: added 0
Unable to parse csv data
Grid with multiple years processed: added 0
Grid with multiple years processed: added 0
Grid with multiple years processed: added 0
Grid with multiple years processed: added 0
Unable to parse csv data
Grid with multiple years processed: added 0
Unable to parse csv data
Grid with one year per row processed: added 0
Grid with multiple years processed: added 0
Unable to parse csv data
Grid with one year per row processed: added 0
Grid with multiple years processed: added 0
Grid with multiple years processed: added 0
Grid with multiple years processed: added 0
Grid with multiple years processed: added 0
Unable to parse csv data
Grid with one year per row processed: added 0
Grid with multiple years processed: added 0
Grid with multiple years processed: added 0
Grid with multiple years processed: added 0


In [19]:
test_result = Countries.query.filter_by(indicator_code ='SH.STA.BRTC.ZS').order_by(Countries.iso_a3).all()

In [20]:
len(test_result)

2548

In [24]:
icodes

{'EG.ELC.ACCS.ZS',
 'SE.ADT.1524.LT.FE.ZS',
 'SE.ADT.LITR.FE.ZS',
 'SE.PRM.PRSL.FE.ZS',
 'SE.PRM.PRSL.MA.ZS',
 'SE.PRM.UNER.FE',
 'SH.MED.BEDS.ZS',
 'SH.STA.BRTC.ZS',
 'SH.XPD.CHEX.PP.CD',
 'SM.POP.NETM',
 'SP.DYN.CBRT.IN',
 'SP.DYN.CDRT.IN',
 'SP.DYN.LE00.IN',
 'SP.DYN.LE00.MA.IN'}