In [167]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sqlite3
import os
import re
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from gensim import corpora, models, similarities
output_notebook()
%matplotlib inline

import os
# Show which files are present in the input directory.
print(os.listdir("../input"))
# Open the SQLite database; `conn` is the connection handed to the pd.read_sql
# calls in the cells that follow.
conn = sqlite3.connect('../input/database.sqlite')
# Cursor on the same connection, for issuing raw SQL statements directly.
c = conn.cursor()

First, I want to see how many countries and how many features (indicator series) this dataset contains.

In [168]:
# Count the rows of the Series table and of the Country table and return both
# counts in a single-row DataFrame. Each subquery produces one row tagged with
# idx = 1, so the inner join on idx places the two counts side by side.
DataStats = pd.read_sql(
                       """
                        SELECT
                            Series.[Number of Indicies]
                            ,Country.[Number of Countries]
                        FROM (
                            SELECT 1 [idx]
                                ,count(*) [Number of Indicies]
                                ,NULL [Number of Countries]
                            FROM   Series  
                        ) Series
                        INNER JOIN (
                            SELECT 1 [idx]
                                ,NULL [Number of Indicies]
                                ,count(*) [Number of Countries]
                            FROM Country
                        ) Country
                            on Series.idx=Country.idx
                       """, con=conn)
print(DataStats)

## Data Integrity Checks
Next we'll check for data integrity.  By grouping on CountryCode and Year, we can count how many metrics were collected per year for each country.  This lets us build a histogram showing how complete our data is.  I included a filter of 1990 because, after reviewing the SeriesNotes table, quite a lot of the metrics before then were interpolated using other data.

### Metrics Per Year
First breakdown is number of metrics per year.  This will allow us to see the distribution of the number of metrics for every year and country that was gathered.  This will help us define how consistent the data collection process was that World Bank undertook.  From here, we can see a pretty clear peak around 550 metrics so finding the year where those metrics are collected might be the best set to do time series analysis on.  We also see a large peak close to zero as some metrics may be collected on a 5 year interval while others are collected on a 1 year interval.  This probably explains the significant amount of missing data.

In [169]:
# Count the indicator rows recorded for each (CountryCode, Year) group in the
# Indicators table, then plot the distribution of those counts as a histogram.
# NOTE(review): the accompanying text mentions a filter on 1990, but this query
# has no WHERE clause on Year -- confirm whether the filter should be applied.
metricsPerYear = pd.read_sql(
                       """
                        SELECT CountryName
                            ,count(IndicatorCode) [metricsPerYear]
                        FROM   Indicators
                        GROUP BY CountryCode, Year                  
                       """, con=conn)
metricsPerYear.hist(column='metricsPerYear', bins=50)

### Metrics per country
Next we'll look at how many metrics are collected per country.  This will help us decide how many metrics to use when analyzing a country.  Since the SQL query orders the countries by their count of metrics, we can just print the head or tail of the dataframe to determine which countries have the most complete data and which have the least.

In [170]:
# Count the distinct indicator codes recorded for each CountryCode, ordered from
# the most metrics to the fewest, then plot the distribution as a histogram.
metricsPerCountry = pd.read_sql(
                       """
                        SELECT CountryName
                            ,count(distinct(IndicatorCode)) [metricsPerCountry]
                        FROM   Indicators
                        GROUP BY CountryCode                  
                        ORDER BY [metricsPerCountry] desc
                       """, con=conn)
metricsPerCountry.hist(column='metricsPerCountry', bins=50)

In [171]:
# The query sorts descending, so the first 20 rows are the countries with the most metrics.
print(metricsPerCountry.head(20))

In [172]:
# The query sorts descending, so the last 20 rows are the countries with the fewest metrics.
print(metricsPerCountry.tail(20))

### Testing out the data
I wanted to peek into the data to see what it provides, so I plotted the time series of a specific metric for a specific country.  This gave good results, showing that there is enough data to plot and that it is easy enough to play around with.

In [173]:
# Pull the full time series of a single indicator (IndicatorCode 'SM.POP.NETM')
# for a single country code ('NAC'), joined to Series for the indicator name.
# The rows are ordered by year so the line is drawn chronologically.
PlayAround = pd.read_sql(
                       """
                        SELECT Ind.CountryName
                            ,Ind.Year
                            ,Ind.Value
                            ,Ser.IndicatorName
                        FROM   Indicators Ind
                        INNER JOIN Series Ser
                            on Ser.SeriesCode=Ind.IndicatorCode
                        WHERE Ind.IndicatorCode = 'SM.POP.NETM'
                        and Ind.CountryCode = 'NAC'
                        ORDER BY Ind.Year
                       """, con=conn)
print(PlayAround.head(5))
# Plot the value against the year (not the DataFrame's row index) so the x-axis
# is meaningful, and label both axes so the figure stands on its own.
plt.plot(PlayAround['Year'], PlayAround['Value'])
plt.xlabel('Year')
plt.ylabel('Value')

### Indicator Categorization Mapping
Utilizing code from Krishna Ravikumar's notebook, [Choosing Topics To Explore...](https://www.kaggle.com/kmravikumar/choosing-topics-to-explore), I can get a feel for the various types of metrics collected in this dataset based on their titles, and create a mapping table using Stored Procedures so that when users select a topic to explore, they are given a list of the metrics associated with that topic.  This could also be useful as a machine learning tool for drilling down into related indicators.

In [174]:

# Fetch each distinct (IndicatorName, IndicatorCode) pair. DISTINCT lets SQLite
# collapse the repeated pairs, so pandas only receives one row per pair instead
# of every row of the Indicators table.
# NOTE(review): result order is whatever SQLite returns for the DISTINCT scan;
# add an ORDER BY if a specific row order is required downstream.
IndicatorSQLResults = pd.read_sql(
                       """
                        SELECT DISTINCT IndicatorName
                            ,IndicatorCode
                        FROM   Indicators
                       """, con=conn)
Indicator_array =  IndicatorSQLResults[['IndicatorName','IndicatorCode']].drop_duplicates().values

# Normalize the indicator names, keeping the first name seen for each code.
modified_indicators = []
unique_indicator_codes = []      # codes in first-seen order
seen_indicator_codes = set()     # same codes, for O(1) membership checks
for indicator, raw_code in Indicator_array:
    indicator_code = raw_code.strip()
    if indicator_code in seen_indicator_codes:
        continue
    # delete , ( ) from the IndicatorNames and make all words lower case
    new_indicator = re.sub('[,()]',"",indicator).lower()
    # replace - with "to"
    new_indicator = re.sub('-'," to ",new_indicator)
    modified_indicators.append([new_indicator,indicator_code])
    unique_indicator_codes.append(indicator_code)
    seen_indicator_codes.add(indicator_code)

# One row per indicator code: [normalized name, code].
Indicators = pd.DataFrame(modified_indicators,columns=['IndicatorName','IndicatorCode'])
Indicators = Indicators.drop_duplicates()

# Topic -> list of keywords used to match indicator names to that topic.
# Keywords are matched as whole lower-case words, so each entry must be spelled
# exactly as it appears in the (lower-cased) indicator names.
key_word_dict = {
    'Demography': ['population','birth','death','fertility','mortality','expectancy'],
    'Food': ['food','grain','nutrition','calories'],
    'Trade': ['trade','import','export','good','shipping','shipment'],
    'Health': ['health','disease','hospital','mortality','doctor'],
    'Economy': ['income','gdp','gni','deficit','budget','market','stock','bond','infrastructure'],
    'Energy': ['fuel','energy','power','emission','electric','electricity'],
    'Education': ['education','literacy'],
    'Employment': ['employed','employment','unemployed','unemployment'],
    'Rural': ['rural','village'],
    'Urban': ['urban','city'],
}

In [175]:
# Print every indicator whose name contains one of the chosen topic's keywords
# (or the keyword with a trailing 's') as a whole word.
feature = 'Food'
keywords = key_word_dict[feature]
for indicator_ele in Indicators.values:
    # Split the name once per indicator rather than once per keyword.
    word_list = indicator_ele[0].split()
    if any(ele in word_list or ele+'s' in word_list for ele in keywords):
        print(indicator_ele)

### Latent Semantic Analysis of Metrics
Krishna Ravikumar's notebook is really helpful in getting the initial analysis started.  However, I wanted to drill down further into each category to see if we can get more metric concepts related to a category without direct key word searches.  To do so, I will need to train the set of metrics using LSI and Cosine similarity to define document similarity to a specific search term.

In [176]:
# set a list of stop words to remove from the corpus, can always add to this list manually
# set a list of stop words to remove from the corpus, can always add to this list manually
stoplist = set('for a of the and to in [ on from per'.split())

# Break each indicator name down into a list of words.
# Only the name (column 0 of each row) is tokenized: calling str() on the whole
# row would stringify the array itself, gluing "['" and "'" onto the first and
# last words and turning the indicator code into a token as well.
texts = [[word for word in str(document[0]).lower().split() if word not in stoplist]
         for document in Indicators.values]

#generate a word frequency count
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

#filters out words that only show up once in the whole corpus
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

# pretty-printer
from pprint import pprint  
pprint(texts[:5])



Having filtered out the words that appear only once, we can also look at the most frequently used words to see whether any of them do not belong and should be added to our stop-word list.

In [177]:
import operator
# Rank every (token, count) pair by its count, highest first, and show the top 20.
sorted_X = list(frequency.items())
sorted_X.sort(key=lambda token_count: token_count[1], reverse=True)
sorted_X[:20]

Now we can build out the dictionary that will be used for LSI and see which dictionary value is associated with each key

In [178]:
# Build the gensim dictionary (token <-> integer id mapping) from the tokenized texts.
dictionary = corpora.Dictionary(texts)
print(dictionary)

In [179]:
# Use this to check the id of a specific word in the dictionary.
# token2id is indexed directly here, so a word that is not in the dictionary
# raises a KeyError.
dictionary.token2id['working']

Printing out the indexing, we can compare it side by side with the document to see the dictionary value being stored into this matrix.

In [180]:
# Convert every tokenized text to its bag-of-words form using the dictionary,
# then preview the first ten entries.
corpus = [dictionary.doc2bow(text) for text in texts]
pprint(corpus[:10])

In [181]:
# The first ten indicator rows, for comparison with the corpus entries printed above.
pprint(Indicators.values[:10])

Now we start the LSI modeling of our data.  First we'll generate the model from our corpus.

In [182]:
# NOTE(review): common_dictionary is imported but not used in the cells visible
# here -- confirm whether it is still needed.
from gensim.test.utils import common_dictionary
from gensim.models import LsiModel

# Fit the LSI model on the bag-of-words corpus, using `dictionary` for the
# id -> word mapping, then apply the model to the same corpus.
model = LsiModel(corpus,id2word=dictionary)
vectorized_corpus = model[corpus]

Next we test this on a specific key word to see how well it performs.

In [206]:
# Query term to search for. Any keyword listed in key_word_dict (defined in an
# earlier cell) is a reasonable choice here.
doc = "Trade"

# Turn the query into a bag-of-words vector and apply the LSI model to it.
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)
vec_lsi = model[vec_bow]

# Build a similarity index over the model-transformed corpus and query it.
index = similarities.MatrixSimilarity(model[corpus])
sims = index[vec_lsi]

# Pair each similarity with its document position, highest similarity first.
sims = sorted(enumerate(sims), key=lambda position_score: -position_score[1])
pprint(sims[:10])

In [207]:
# Print the names of the indicators whose similarity to the query exceeds 0.1,
# walking `sims` in its existing order and stopping once 31 names are printed.
ct = 0
for doc_position, similarity in sims:
    if similarity > 0.1:
        pprint(Indicators.values[doc_position][0])
        ct += 1
        if ct > 30:
            break