# Homework 3 -- Choropleth map of grants in Swiss universities

## Part 1: data processing
For the visualisation, see the other notebook or, even better, its `HTML` version.

In [1]:
# imports and all that
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os.path
import os, random
import time

### Data wrangling

In [2]:
# Load the grant export (semicolon-separated); the first column becomes the index.
grants = pd.read_csv("data/P3_GrantExport.csv", sep=';', index_col=0)
# Only the last expression of a cell is displayed, so show the shape alone.
grants.shape

(48464, 8)

We'll try to reduce the dataset by removing the entries that are not in CH and the irrelevant columns. The foreign institutes don't have a valid entry in the `University` field.

These changes can be safely persisted in the file, as they will not influence the data we work with.

In [3]:
# Keep only the grants with a usable `University` entry: drop missing values
# and the "Nicht zuteilbar - NA" placeholder.
grants = grants[(grants.University.notnull()) & (grants.University != "Nicht zuteilbar - NA")]

# Drop the columns that are not needed for the per-canton totals.
# errors='ignore' keeps the cell re-runnable when the columns are already gone
# (presumably because the trimmed CSV was persisted -- see the cell that can rewrite it).
grants = grants.drop(
    columns=['Project Title', 'Project Title English',
             'Responsible Applicant', 'Funding Instrument', 'Funding Instrument Hierarchy', 'Keywords'],
    errors='ignore',
)

grants.shape

(48283, 8)

Some entries in the `Approved Amount` column are not numeric, so they are dropped.

In [4]:
# Coerce the amounts to numbers; anything unparsable becomes NaN ...
approved = pd.to_numeric(grants['Approved Amount'], errors='coerce')
grants['Approved Amount'] = approved
# ... and the rows that ended up as NaN are filtered out.
grants = grants[approved.notnull()]
grants.shape

(48283, 8)

In [5]:
# Run if you want to update the csv
#grants.to_csv("data/P3_GrantExport.csv", sep=';') # keep the same sep

We will group the results on `university` and combine them with **N**on **P**ublic **O**rganisations (**NPO**s), which don't belong to any university.
This will allow us to use either the university or the institute to find a location.


Also, we get an idea of the location of a university from the last part of its name, which is usually the canton. This improves the results given by Google Maps API

In [6]:
import re
def getLocationTagFromUni(s):
    """Return the last whitespace-separated token of a university name.

    For a name such as "Universität Bern - BE" this is the trailing tag "BE".
    Raises IndexError when `s` contains no non-whitespace characters.
    """
    tokens = re.findall(r'\S+', s)
    return tokens[-1]

In [7]:
# Tag every grant with the last token of its university name (e.g. "BE").
# `grants` is the result of boolean filtering, so assigning a column on it in
# place triggers pandas' SettingWithCopyWarning; assign() returns a new frame
# instead and keeps the cell safe to re-run.
grants = grants.assign(**{'Location Tag': grants['University'].apply(getLocationTagFromUni)})

# Total approved amount per (location tag, university).
universities = (
    grants.groupby(['Location Tag', 'University'])
    .agg({'Approved Amount': 'sum'})
    .reset_index()
)

# remove the entries of the NPOs
universities = universities[~universities.University.str.startswith("NPO")]

# rename 'University' -> 'Institution' to be the same as institutes
# (explicit rename instead of overwriting all column labels by position)
universities = universities.rename(columns={'University': 'Institution'})
universities.shape

(75, 3)

In [8]:
# Grants whose `University` starts with "NPO" are summed per institution instead.
institutes = (
    grants[grants.University.str.startswith("NPO")]
    .groupby('Institution')
    .agg({'Approved Amount': 'sum'})
    .reset_index()
)
institutes.shape

(450, 2)

Combine the two dataframes

In [9]:
# Stack the universities on top of the NPO institutes. The previous row labels
# are kept as an 'index' column and the rows are renumbered from 0.
uni_inst = pd.concat([universities, institutes], axis=0).reset_index(drop=False)

### Find location of institutes

Use Google API to find the location corresponding to the university. This will only run once and save the results of the requests into a file.

In [10]:
google_key = os.environ["GOOGLE_API_KEY"] # provide your own
filename = 'data/requests.txt'
# Endpoint without a query string: get_city passes the key and the search text
# through `params`, so requests URL-encodes them.
places_endpoint = "https://maps.googleapis.com/maps/api/place/textsearch/xml"
# Kept so that existing references to `url` still work. get_city no longer
# reads it, because the name `url` is rebound later in the notebook.
url = places_endpoint+"?key="+google_key+"&query="

def get_city(row):
    """Query the Places text search for one institution and append the raw reply to the cache.

    Parameters
    ----------
    row : row of `uni_inst`; only `row.Institution` is read and used as the search text.

    Returns
    -------
    None. The XML text of the response is appended to `filename`.

    The institution name is sent via `params` rather than concatenated into the
    URL, so characters such as '&', '#' or '+' inside a name cannot break or
    truncate the query string.
    """
    r = requests.get(places_endpoint, params={"key": google_key, "query": row.Institution})
    with open(filename, "a") as myfile:
        myfile.write(r.text)

In [11]:
# Build the response cache only once: the mere existence of the file is what
# marks the cache as complete on later runs.
if not os.path.isfile(filename):
    print("Creating file %s" % filename)
    try:
        # Wrap the individual responses in one root element.
        with open(filename, "a") as myfile:
            myfile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?><results>")
        # One request per row; get_city appends each response to the file in row order.
        uni_inst.apply(get_city, axis=1)
        with open(filename, "a") as myfile:
            myfile.write("</results>")
    except BaseException:
        # A failure or an interrupt would leave a truncated file that the next
        # run would accept as a finished cache, so remove it and re-raise.
        if os.path.isfile(filename):
            os.remove(filename)
        raise
else:
    print("File already exists %s" %filename)

File already exists data/requests.txt


Load the results of the Google requests and extract the postal code of each institution's location. This will later be used to find the canton.

In [12]:
# Parse the cached responses; the context manager closes the file handle.
with open(filename, encoding="latin_1") as cached_responses:
    results = BeautifulSoup(cached_responses, "xml")

# 'None' (the string) marks rows for which no Swiss address was found.
uni_inst['postalcode'] = 'None'

# Responses were appended in row order, so the i-th <PlaceSearchResponse>
# belongs to the row labelled i in uni_inst.
for i, response in enumerate(results.find_all('PlaceSearchResponse')):
    result = response.find('result')
    if result is None:
        continue
    address = result.find('formatted_address')
    if address is None:
        continue
    city = address.text
    if "Switzerland" not in city:
        continue
    # The four characters after the first ", " are taken as the postal code.
    # An address without a comma has no such part and is stored as-is.
    comma = city.find(',')
    pc = city[comma + 2:comma + 6] if comma != -1 else ''
    # .loc replaces .ix, which was removed from pandas; the row labels are
    # 0..n-1 here, so label-based access addresses the same rows.
    uni_inst.loc[i, 'postalcode'] = pc if pc.isdigit() else city
uni_inst.head(10)

Unnamed: 0,index,Approved Amount,Institution,Location Tag,postalcode
0,0,33115720.0,Forschungsanstalten Agroscope - AGS,AGS,
1,1,3435621.0,AO Research Institute - AORI,AORI,
2,2,159317.0,Pädag. Hochschule Tessin (Teilschule SUPSI) - ASP,ASP,
3,3,10749810.0,Weitere Spitäler - ASPIT,ASPIT,
4,4,1519373000.0,Universität Bern - BE,BE,3012.0
5,5,31028700.0,Berner Fachhochschule - BFH,BFH,3401.0
6,6,2492535.0,Biotechnologie Institut Thurgau - BITG,BITG,8280.0
7,7,1352251000.0,Universität Basel - BS,BS,4003.0
8,8,1567678.0,Centre de rech. sur l'environnement alpin - CR...,CREALP,1950.0
9,9,18068250.0,Swiss Center for Electronics and Microtech. - ...,CSEM,


For the universities that were found we will use the geonames api to find the canton based on the postalcode.

In [13]:
# GeoNames account name; "demo" is still the placeholder flagged by the TODO.
username = "demo"  # TODO: add username
# Postal-code search restricted to CH with a single result; the postal code is
# appended to this prefix by the caller.
url = (
    "http://api.geonames.org/postalCodeSearch"
    "?maxRows=1&username=" + username + "&country=ch&postalcode="
)

def get_canton(row):
    """Return the `adminCode1` value GeoNames reports for the row's postal code.

    `row.postalcode` must be a string. When it is not purely numeric no
    request is made and the string 'None' is returned; 'None' is also
    returned when the response contains no `adminCode1` element.
    """
    code = row.postalcode
    if not code.isdigit():
        return 'None'

    reply = requests.get(url+code)
    admin_code = BeautifulSoup(reply.text, "xml").find('adminCode1')
    return admin_code.text if admin_code else 'None'

# One lookup per row; rows without a numeric postal code get the string 'None'.
cantons = uni_inst.apply(get_canton, axis=1)
uni_inst['canton'] = cantons
uni_inst.head()

Unnamed: 0,index,Approved Amount,Institution,Location Tag,postalcode,canton
0,0,33115720.0,Forschungsanstalten Agroscope - AGS,AGS,,
1,1,3435621.0,AO Research Institute - AORI,AORI,,
2,2,159317.0,Pädag. Hochschule Tessin (Teilschule SUPSI) - ASP,ASP,,
3,3,10749810.0,Weitere Spitäler - ASPIT,ASPIT,,
4,4,1519373000.0,Universität Bern - BE,BE,3012.0,


We save the information found to a file in order to manually add some missing locations

In [14]:
universities_data = "data/universities.csv"

# Write the file only the first time, so manual additions to it are never overwritten.
already_saved = os.path.isfile(universities_data)
if not already_saved:
    uni_inst.to_csv(universities_data, sep=',', encoding='latin_1')

# Always continue from the file on disk, whether freshly written or hand-edited.
uni_inst = pd.read_csv(universities_data, sep=',', encoding='latin_1')

In [15]:
# Total approved amount per canton, without the 'None' placeholder group.
df = (
    uni_inst.groupby(['canton'])
    .agg({'Approved Amount': 'sum'})
    .reset_index()
    .loc[lambda totals: totals.canton != 'None']
)
df.to_csv("data/grants_per_canton.csv", sep=';')
df

Unnamed: 0,canton,Approved Amount
0,AG,139186400.0
1,BE,1560234000.0
2,BL,3476142.0
3,BS,1399188000.0
4,FR,459195500.0
5,GE,1852855000.0
6,GR,37225800.0
7,JU,34790350.0
8,LU,55024060.0
9,NE,401897600.0
