# Homework 3 -- Choropleth map of grants in Swiss universities

In [None]:
# imports and all that
import os.path
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Data wrangling

In [None]:
# Load the grant export: the file is ';'-separated and its first column becomes the index.
grants = pd.read_csv("data/P3_GrantExport.csv", sep=';', index_col=0)
# Only the last expression of a cell is echoed, so show the shape alone
# (the element count is simply rows * columns).
grants.shape

We'll reduce the dataset by dropping irrelevant columns and the entries that are not in Switzerland (CH).
We also remove rows whose location cannot be inferred, i.e. rows that are missing both the institution and the university.

These changes can be safely persisted to the file, as they do not influence the data we work with.

In [None]:
# Keep only the rows that name a university.
grants = grants[grants.University.notnull()]
# Remove columns that are not used further down; errors='ignore' keeps the cell
# re-runnable on data from which they were already dropped.
grants = grants.drop(axis=1, errors='ignore', labels=['Project Title', 'Project Title English',
            'Responsible Applicant', 'Funding Instrument', 'Funding Instrument Hierarchy', 'Keywords'])

# drop rows where location cannot be inferred: a row is kept when its university
# is assignable or, failing that, when at least an institution is given
filter_no_location = (grants.University != "Nicht zuteilbar - NA") | grants.Institution.notnull()
grants = grants[filter_no_location]
# Only a cell's last expression is echoed, so the dtypes are printed explicitly.
print(grants.dtypes)
grants.shape

In [None]:
# Some entries in the 'Approved Amount' column are not numeric: coerce them to NaN
# and drop the affected rows.
grants['Approved Amount'] = pd.to_numeric(grants['Approved Amount'], errors='coerce')
grants = grants[grants['Approved Amount'].notnull()]
grants.shape

In [None]:
# Optional: uncomment to persist the reduced frame. The target is the same file the
# notebook loads its data from, so the original export gets overwritten.
#grants.to_csv("data/P3_GrantExport.csv", sep=';') # keep the same sep

In [None]:
import re
def getLocationTagFromUni(s):
    """Return the last whitespace-delimited token of ``s``.

    Raises IndexError when ``s`` is empty or contains only whitespace.
    """
    tokens = s.split()
    return tokens[-1]

In [None]:
# Tag every grant with the trailing token of its university name.
grants['Location Tag'] = grants['University'].map(getLocationTagFromUni)
grants.head()

In [None]:
# Total approved amount per (location tag, university) pair, with the group
# keys turned back into ordinary columns.
grants_grouped_by_uni = (
    grants
    .groupby(['Location Tag', 'University'])
    .agg({'Approved Amount':'sum'})
    .reset_index()
)
grants_grouped_by_uni.shape

Command-line 'voodoo' magic to get the list of cantons from the JSON file.

In [None]:
# IPython shell capture: keep the lines of the TopoJSON file that contain `id":` and take
# the 4th double-quote-delimited field of each one; the output lines end up in `cantons`.
# Needs the Unix tools cat/grep/cut. NOTE(review): presumably relies on the file having
# one "id" entry per line -- confirm against the actual file.
cantons =! cat "ch-cantons.topojson.json" | grep "id\":" | cut -d\" -f4

In [None]:
# Sanity check on the captured ids; the notes at the end of the notebook expect 26 cantons.
len(cantons)

### Find location of universities

Use the Google API to find the location corresponding to each university. The requests are only sent once; their results are saved to a file and reused on later runs.

In [None]:
# The API key is read from the GOOGLE_API_KEY environment variable so it does not have
# to be written into the notebook; "0" is the original placeholder value.
google_key = os.environ.get("GOOGLE_API_KEY", "0")
data = 'data/requests.txt'  # cache file that collects the raw XML replies
url = "https://maps.googleapis.com/maps/api/place/textsearch/xml?key="+google_key+"&query="

def get_city(row):
    """Run a text search for ``row.University`` and append the raw XML reply to ``data``.

    The university name is URL-encoded before it is appended to the query string, so
    names containing reserved characters such as '&', '#' or '+' are sent intact.

    Raises:
        requests.HTTPError: if the service answers with an HTTP error status.
        requests.Timeout: if no answer arrives within 30 seconds.
    """
    r = requests.get(url + quote_plus(row.University), timeout=30)
    r.raise_for_status()
    with open(data, "a") as myfile:
        myfile.write(r.text)

if not os.path.isfile(data):
    cache_complete = False
    try:
        with open(data, "a") as myfile:
            myfile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?><results>")
        # Explicit loop instead of DataFrame.apply: get_city is called for its side
        # effect only, and older pandas releases may invoke an applied function twice
        # on the first row, which would duplicate a reply in the cache.
        for _, uni_row in grants_grouped_by_uni.iterrows():
            get_city(uni_row)
        with open(data, "a") as myfile:
            myfile.write("</results>")
        cache_complete = True
    finally:
        # A failed or interrupted run must not leave a truncated cache behind:
        # the isfile() guard above would otherwise skip the download forever.
        if not cache_complete and os.path.isfile(data):
            os.remove(data)

Load the results of the Google requests and extract the postal code of each university's location. This will later be used to find the canton.

In [None]:
# Parse the cached replies; the context manager closes the file once parsing is done.
with open(data) as cache_file:
    results = BeautifulSoup(cache_file, "xml")

# Work on an explicit copy so that the column writes below do not target a slice of
# grants_grouped_by_uni.
# NOTE(review): 77 is hard-coded -- presumably the number of rows in
# grants_grouped_by_uni; confirm before the data changes.
frame = grants_grouped_by_uni.iloc[0:77].copy()
frame['postalcode'] = 'None'  # string placeholder: no Swiss address found

# The replies were appended to the cache in row order, so the i-th reply is written
# to the row labelled i.
for i, response in enumerate(results.find_all('PlaceSearchResponse')):
    result = response.find('result')
    if result is None:
        continue
    city = result.find('formatted_address').text
    if "Switzerland" not in city:
        continue
    # The postal code candidate is the four characters that follow the first ", ".
    # An address without any comma has no candidate and is stored as-is, just like
    # a candidate that is not purely numeric.
    comma = city.find(',')
    pc = city[comma + 2:comma + 6] if comma != -1 else ''
    # .at is the label-based scalar setter; DataFrame.set_value no longer exists in
    # current pandas.
    frame.at[i, 'postalcode'] = pc if pc.isdigit() else city
frame.head(10)

Show the universities that were not found.

In [None]:
# Rows that still carry the 'None' placeholder in the postalcode column.
missing_postalcode = frame[frame['postalcode'] == 'None']
print(missing_postalcode.shape)
missing_postalcode

For the universities that were found, we will use the GeoNames API to find the canton based on the postal code.

In [None]:
# GeoNames account name, read from the GEONAMES_USERNAME environment variable;
# "demo" is the original placeholder value.
username = os.environ.get("GEONAMES_USERNAME", "demo")
# NOTE: this rebinds the module-level `url` that get_city() also reads, so get_city()
# must not be called again after this cell has run.
url = "http://api.geonames.org/postalCodeSearch?maxRows=1&username="+username+"&country=ch&postalcode="

def get_canton(row):
    """Return the GeoNames ``adminCode1`` (used here as the canton) for ``row.postalcode``.

    Returns the string 'None' when the postal code is not purely numeric or when the
    reply contains no ``adminCode1`` element, so a single failed lookup does not
    abort the whole ``apply``.

    Raises:
        requests.Timeout: if no answer arrives within 30 seconds.
    """
    if not row.postalcode.isdigit():
        return 'None'
    r = requests.get(url+row.postalcode, timeout=30)
    soup = BeautifulSoup(r.text, "xml")
    admin_code = soup.find('adminCode1')
    if admin_code is None:
        return 'None'
    return admin_code.text

frame['canton'] = frame.apply(get_canton, axis=1)
frame.head()

TODO:

* we got 77 location tags and 26 cantons
* so the non-canton locations need to be mapped to cantons
* identify the 'how'
* also split NPO