# Homework 3 -- Choropleth map of grants in Swiss universities

In [1]:
# imports and all that
import os
import os.path
import random
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Data wrangling

In [2]:
# Load the grant export (semicolon-separated); the first column becomes the index.
grants = pd.read_csv("data/P3_GrantExport.csv", sep=';', index_col=0)
# Only the last expression of a cell is rendered, so display the shape alone.
grants.shape

(48464, 8)

We'll reduce the dataset by removing the entries that are not in Switzerland (CH) and by dropping irrelevant columns. Foreign institutes don't have a valid entry in the `University` field.

These changes can be safely persisted to the file, as they will not influence the data we work with.

In [3]:
# Keep only rows with a usable University entry: drop missing values and the
# "Nicht zuteilbar - NA" placeholder.
has_university = grants.University.notnull() & (grants.University != "Nicht zuteilbar - NA")
grants = grants[has_university]

# Columns that the rest of the analysis does not use. errors='ignore' keeps the
# cell re-runnable when some of them are already absent from the frame.
unused_columns = ['Project Title', 'Project Title English', 'Responsible Applicant',
                  'Funding Instrument', 'Funding Instrument Hierarchy', 'Keywords']
grants = grants.drop(columns=unused_columns, errors='ignore')

# Only the last expression of a cell is rendered, so display the shape alone.
grants.shape

(48283, 8)

Some entries in the Approved Amount column are not numeric so they are being dropped

In [4]:
# Coerce the amount column to numbers; anything unparsable becomes NaN ...
approved_numeric = pd.to_numeric(grants['Approved Amount'], errors='coerce')
grants['Approved Amount'] = approved_numeric
# ... and rows whose amount could not be parsed are discarded.
grants = grants.dropna(subset=['Approved Amount'])
grants.shape

(48283, 8)

In [5]:
# Run if you want to update the csv
#grants.to_csv("data/P3_GrantExport.csv", sep=';') # keep the same sep

We will group the results by university and combine those with the entries of the NPO category, which cannot be grouped by university and are grouped by institution instead.
This will allow us to look up either the university or the institute to find a location.

In [47]:
# Total approved amount per university, leaving out the "NPO..." categories
# (those are aggregated per institution in a separate cell). The grouping
# column is renamed so both aggregates share the same column names.
universities = (
    grants
    .groupby('University')
    .agg({'Approved Amount': 'sum'})
    .reset_index()
    .loc[lambda totals: ~totals.University.str.startswith("NPO")]
    .rename(columns={'University': 'Institution'})
)
universities.shape

(75, 2)

In [48]:
# Rows whose University value starts with "NPO" are summed per Institution instead.
is_npo_row = grants.University.str.startswith("NPO")
institutes = (
    grants[is_npo_row]
    .groupby('Institution')
    .agg({'Approved Amount': 'sum'})
    .reset_index()
)
institutes.shape

(450, 2)

Combine the two dataframes

In [50]:
# Stack both aggregates. Each input carries its own integer index, so without
# ignore_index the result would contain duplicate labels and label-based
# access (.loc / .at) would be ambiguous.
frame = pd.concat([universities, institutes], ignore_index=True)

### Find location of institutes

Use Google API to find the location corresponding to the university. This will only run once and save the results of the requests into a file.

In [7]:
# Google Places text-search settings.
# The key is taken from the GOOGLE_API_KEY environment variable so it does not
# have to be written into the notebook; "0" stays as the placeholder default.
google_key = os.environ.get("GOOGLE_API_KEY", "0")
# File in which the raw XML responses are accumulated.
data = 'data/requests.txt'
# Request prefix; the search text is appended after "query=".
url = "https://maps.googleapis.com/maps/api/place/textsearch/xml?key="+google_key+"&query="

def get_city(row):
    """Look up one row's university and append the raw reply to the cache file.

    Parameters
    ----------
    row
        Object exposing a ``University`` attribute; its value is used as the
        search text.

    Side effects
    ------------
    Sends an HTTP GET to the module-level ``url`` and appends the response
    text to the file named by the module-level ``data``. Returns ``None``.
    """
    # Percent-encode the name so that spaces, accented letters and reserved
    # characters such as '&' or '#' cannot cut off or corrupt the query string.
    query = quote_plus(row.University)
    # A timeout prevents one stalled request from blocking the whole run.
    r = requests.get(url + query, timeout=30)
    # NOTE(review): the file is opened with the platform default encoding while
    # the XML header written by the caller declares UTF-8 -- confirm they match.
    with open(data, "a") as myfile:
        myfile.write(r.text)
                
# Query the API only when no cache file exists yet. The responses are wrapped
# in a single <results> root element.
# NOTE(review): grants_grouped_by_uni is not defined in the visible cells --
# confirm which cell creates it before running on a fresh kernel.
if not os.path.isfile(data):
    try:
        with open(data, "a") as myfile:
            myfile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?><results>")
        grants_grouped_by_uni.apply(get_city, axis=1)
        with open(data, "a") as myfile:
            myfile.write("</results>")
    except BaseException:
        # A partially written file would pass the isfile() check on the next
        # run and be treated as a complete cache, so remove it before failing.
        if os.path.isfile(data):
            os.remove(data)
        raise

Load the results of the Google requests and extract the postal code of each university's location. This will later be used to find the canton.

In [8]:
# Parse the cached responses; the context manager closes the file handle
# instead of leaking it.
with open(data) as responses_file:
    results = BeautifulSoup(responses_file, "xml")

frame = grants_grouped_by_uni.copy()
frame['postalcode'] = 'None'

# One response was appended per row, in row order, so the i-th response is
# matched to row label i.
# NOTE(review): assumes grants_grouped_by_uni has a default 0..n-1 index -- confirm.
for i, response in enumerate(results.find_all('PlaceSearchResponse')):
    result = response.find('result')
    if result is None:
        continue  # the search returned no hit for this row
    address_tag = result.find('formatted_address')
    if address_tag is None:
        continue
    city = address_tag.text
    if "Switzerland" not in city:
        continue
    # The postal code is taken as the four characters that follow the first
    # ", " of the address. An address without a comma has no such part.
    comma = city.find(',')
    pc = city[comma + 2:comma + 6] if comma != -1 else ''
    # Keep the full address when no numeric postal code could be extracted.
    # .at replaces DataFrame.set_value, which newer pandas versions no longer have.
    frame.at[i, 'postalcode'] = pc if pc.isdigit() else city
frame.head(10)

Unnamed: 0,University,Approved Amount,postalcode
0,AO Research Institute - AORI,3435621.0,
1,Allergie- und Asthmaforschung - SIAF,19169960.0,
2,Berner Fachhochschule - BFH,31028700.0,
3,Biotechnologie Institut Thurgau - BITG,2492535.0,
4,Centre de rech. sur l'environnement alpin - CR...,1567678.0,3012.0
5,EPF Lausanne - EPFL,1175316000.0,3401.0
6,ETH Zürich - ETHZ,1635597000.0,8280.0
7,Eidg. Anstalt für Wasserversorgung - EAWAG,74619220.0,4003.0
8,"Eidg. Forschungsanstalt für Wald,Schnee,Land -...",48360390.0,1950.0
9,Eidg. Hochschulinstitut für Berufsbildung - EHB,2086572.0,


For the universities that were found, we will use the GeoNames API to find the canton based on the postal code.

In [10]:
# GeoNames postal-code search settings.
# The account name is taken from the GEONAMES_USERNAME environment variable so
# it does not have to be written into the notebook; "demo" stays as the default.
username = os.environ.get("GEONAMES_USERNAME", "demo")
# Request prefix; the postal code is appended after "postalcode=".
url = "http://api.geonames.org/postalCodeSearch?maxRows=1&username="+username+"&country=ch&postalcode="

def get_canton(row):
    """Return the ``adminCode1`` value GeoNames reports for a row's postal code.

    Parameters
    ----------
    row
        Object exposing a ``postalcode`` attribute.

    Returns
    -------
    str
        Text of the first ``adminCode1`` element in the reply, or the string
        ``'None'`` when the postal code is not an all-digit string or the
        reply contains no such element.
    """
    postalcode = row.postalcode
    # Missing or non-string values (NaN, None, numbers) have no .isdigit();
    # treat them like any other unusable postal code instead of raising.
    if not isinstance(postalcode, str) or not postalcode.isdigit():
        return 'None'
    # A timeout prevents one stalled request from blocking the whole apply().
    r = requests.get(url + postalcode, timeout=30)
    soup = BeautifulSoup(r.text, "xml")
    admin_code = soup.find('adminCode1')  # look the element up only once
    if admin_code is None:
        return 'None'
    return admin_code.text

# One lookup per row; rows without an all-digit postal code yield the string 'None'.
canton_by_row = frame.apply(get_canton, axis=1)
frame['canton'] = canton_by_row
frame.head(10)

Unnamed: 0,University,Approved Amount,postalcode,canton
0,AO Research Institute - AORI,3435621.0,,
1,Allergie- und Asthmaforschung - SIAF,19169965.0,,
2,Berner Fachhochschule - BFH,31028695.0,,
3,Biotechnologie Institut Thurgau - BITG,2492535.0,,
4,Centre de rech. sur l'environnement alpin - CR...,1567678.0,3012.0,BE


We save the information found to a file in order to manually add some missing locations

In [54]:
# The CSV is written only when it does not exist yet, so a copy that was
# edited by hand is never overwritten; the frame is then always (re)loaded
# from that file.
universities_data = "data/universities.csv"
csv_options = {'sep': ',', 'encoding': 'latin_1'}
cache_present = os.path.isfile(universities_data)
if not cache_present:
    frame.to_csv(universities_data, **csv_options)
# NOTE(review): to_csv writes the index as a first column and read_csv is not
# told to use it as the index, so the reloaded frame presumably gains an
# extra unnamed column -- confirm whether that is intended.
frame = pd.read_csv(universities_data, **csv_options)

In [53]:
# Sum the approved amounts per canton.
df = (
    frame
    .groupby(['canton'])
    .agg({'Approved Amount': 'sum'})
    .reset_index()
)
# Drop the bucket of rows whose canton is the placeholder string 'None'.
has_canton = df.canton != 'None'
df = df[has_canton]
# Persist the per-canton totals (semicolon-separated).
df.to_csv("data/grants_per_canton.csv", sep=';')
df

Unnamed: 0,canton,Approved Amount
0,AG,11131770.0
1,BE,1278820000.0
2,BL,425498.0
3,BS,88064970.0
4,FR,1495811.0
5,GE,1832912000.0
6,GR,71206630.0
7,JU,29065840.0
8,LU,1847691000.0
9,NE,936551.0
