# Retrieve XML Documents from a SQL Database and Count Keywords

In [13]:
import pyodbc
import numpy as np
import pandas as pd

# Connection target handed to pyodbc.connect() by get_xmls().
# NOTE(review): the ",5876" suffix is presumably the TCP port of the SQL Server instance - confirm.
SERVER_NAME = 'TSTWEOSQL,5876'
DATABASE_NAME = 'IMF_EPUBS'

# Selects every PUBLICATION and DOCUMENT column, joining the two tables on SeriesNumber,
# restricted to category 'IMF Staff Country Reports', ProjectedYear 2018 and a
# Description containing 'Article IV'.
# The trailing backslashes continue the string literal itself, so the leading spaces of
# each continued line are part of the SQL text sent to the server.
QUERY_STRING = "SELECT dbo.PUBLICATION.*, dbo.DOCUMENT.* \
                FROM [dbo].[PUBLICATION] \
                INNER JOIN dbo.DOCUMENT ON dbo.PUBLICATION.SeriesNumber = dbo.DOCUMENT.SeriesNumber \
                WHERE dbo.PUBLICATION.ImfCategoryDesc = 'IMF Staff Country Reports' \
                AND dbo.PUBLICATION.ProjectedYear in(2018)\
                AND dbo.PUBLICATION.Description like '%Article IV%'" 

# Terms whose occurrences are counted in the Content of each retrieved document;
# they also become the column labels of the per-document count table.
searchList = ['FinTech','TechFin','digital','technology','Blockchain','innovation',
              'distributed ledger','Bitcoin','ICO','cryptocurrency','mobile','online','cyber','decentralization',
              'international payment','correspondent bank','financial inclusion','InsurTech','RegTech','micro lending',
              'machine learning','big data', 'artificial intelligence','analytics']

## Load XML documents from the IMF database over ODBC using a SQL query

In [14]:
def get_xmls(server_name, database_name, query_string):
    """Run a query against a SQL Server database and return all result rows.

    Opens a pyodbc connection with the ``{SQL Server}`` driver and
    ``Trusted_Connection=yes``, executes ``query_string`` unchanged (no
    parameters are bound) and fetches the complete result set.

    Parameters
    ----------
    server_name : str
        Value passed as the ``server`` connection keyword.
    database_name : str
        Value passed as the ``database`` connection keyword.
    query_string : str
        SQL text to execute.

    Returns
    -------
    list
        Every row of the result set, as returned by ``cursor.fetchall()``.
    """
    connection = pyodbc.connect('Trusted_Connection=yes',
                                driver='{SQL Server}',
                                server=server_name,
                                database=database_name)
    try:
        # fetchall() materialises all rows in memory, so they remain usable
        # after the connection is closed below.
        return connection.cursor().execute(query_string).fetchall()
    finally:
        # Always release the connection (and its cursor), even if the query
        # fails; otherwise every call leaks an open database connection.
        connection.close()


# Execute the configured query once and keep every returned row for the cells below.
results = get_xmls(
    server_name=SERVER_NAME,
    database_name=DATABASE_NAME,
    query_string=QUERY_STRING,
)

In [15]:
# Number of rows (one per matching document) returned by the query.
print(len(results))

# Show only the beginning of the second document's XML: the full Content is a
# whole report and would flood the cell output. The guard avoids an IndexError
# when the query returns fewer than two rows.
PREVIEW_CHARS = 1000
if len(results) > 1:
    print(results[1].Content[:PREVIEW_CHARS])

9
<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//IMF//IMF DTD//EN" "../../../../IMF_DTDs_XSLs/journal-dtd-3.0/3.0/journalpublishing3.dtd"[]><?xml-stylesheet type="text/xsl" href="../../../../IMF_DTDs_XSLs/journal-dtd-3.0/journal_ViewIMF-v1.0.xsl"?><article article-type="002" dtd-version="3.0" xml:lang="en" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML"><front><journal-meta><journal-id journal-id-type="publisher-id">002</journal-id><journal-title-group><journal-title>IMF Staff Country Reports</journal-title></journal-title-group><issn>1934-7685</issn><isbn>9781484309100</isbn><publisher><publisher-name>International Monetary Fund</publisher-name><publisher-loc>Washington, D.C.</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="doi">10.5089/9781484309100.002.A001</article-id><article-id pub-id-type="publisher-id">002A001</article-id><article-categories><subj-group subj-group-type="heading"><subje

## Count the frequency of each search-list term

In [4]:
def count_words(word, document, ignore_case=False):
    """Return how many times ``word`` occurs in ``document``.

    Occurrences are counted with ``str.count``, i.e. as non-overlapping
    substring matches; ``'mobile'`` is therefore also counted inside
    ``'automobile'``.

    Parameters
    ----------
    word : str
        Term to look for.
    document : str
        Text to search.
    ignore_case : bool, optional
        When False (the default) matching is case-sensitive, exactly as
        before. When True both ``word`` and ``document`` are case-folded
        first, so ``'FinTech'`` also matches ``'fintech'`` and ``'Fintech'``.

    Returns
    -------
    int
        Number of non-overlapping occurrences of ``word`` in ``document``.
    """
    if ignore_case:
        return document.casefold().count(word.casefold())
    return document.count(word)


# Element-wise wrapper: broadcasts count_words over array-like arguments, e.g. a
# list of search terms against a single document, and returns an array of counts.
# Keyword arguments such as ignore_case are forwarded to count_words.
v_count_words = np.vectorize(count_words)

In [5]:
# One array of per-term counts for every retrieved document.
count_all = []
for doc in results:
    count_all.append(v_count_words(searchList, doc.Content))

# Rows are documents, columns are the search terms.
df = pd.DataFrame(count_all, columns=searchList)

# Total occurrences of each term across all documents, as a one-column frame.
output = df.sum().to_frame(name='Freq')

# Display the terms from most to least frequent (output itself stays unsorted).
output.sort_values(by='Freq', ascending=False)

Unnamed: 0,Freq
mobile,7
correspondent bank,4
digital,2
innovation,2
technology,1
financial inclusion,1
decentralization,1
FinTech,0
artificial intelligence,0
big data,0


In [8]:
# Display the first row returned by the query, i.e. all PUBLICATION and DOCUMENT
# columns selected by QUERY_STRING for that document.
results[0]

(90, 24436, 45065, 9781484309100, 9781484309131, 'Somalia : Second and Final Review Under the Staff-Monitored Program and Request for a New Staff-Monitored Program-Press Release and Staff Report', 'International Monetary Fund. Middle East and Central Asia Dept.', 'INTERNATIONAL MONETARY FUND', datetime.datetime(2017, 7, 11, 0, 0), 2018, 'Recent developments are broadly in line with the 2016\nArticle IV Consultation Staff Report. The 2016 electoral cycle was completed on February\n8, with the election of Mohamed Abdullahi Mohamed as President, which provides a\nfresh mandate for stronger reforms in the next four years and continued donor support.\nProgress is being made in improving the security situation, developing institutional\ncapacity, and state-building. On May 16, 2016, the IMF Managing Director approved a\n12-month Staff-Monitored Program (SMP) (May 2016?April 2017) and the first review\nunder the SMP was completed on February 3, 2017. This is the second and last review\nunder 