# Homework 3 -- Cloropleth map of grants in Swiss universities

In [145]:
# imports and all that
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os.path

### Data wrangling

In [146]:
grants = pd.read_csv("data/P3_GrantExport.csv", sep=';', index_col=0)
grants.size
grants.shape

(48464, 8)

We'll try to reduce the dataset by removing the entries that are not in CH and irrelevant columns.
We also remove data where location cannot be inferred, i.e. which are missing both the institute and the university

These changes can be safely persisted in the file, as it will not influence the data we work with.

In [147]:
grants = grants[grants.University.notnull()]
grants = grants.drop(axis=1, errors='ignore', labels=['Project Title', 'Project Title English',
            'Responsible Applicant', 'Funding Instrument', 'Funding Instrument Hierarchy', 'Keywords'])

# drop rows where location cannot be inferred
filter_no_location = ~((grants.University == "Nicht zuteilbar - NA") & (grants.Institution.isnull()))
grants = grants[filter_no_location]
grants.dtypes
grants.shape

(48464, 8)

In [148]:
# Sum entries in the Approved Amount column are not numeric and such are being dropped
grants['Approved Amount'] = pd.to_numeric(grants['Approved Amount'], errors='coerce')
grants = grants[grants['Approved Amount'].isnull() == False]
grants.shape

(48464, 8)

In [149]:
# Run if you want to update the csv
#grants.to_csv("data/P3_GrantExport.csv", sep=';') # keep the same sep

In [150]:
import re
def getLocationTagFromUni(s):
    return re.findall(r'\S+', s)[-1]

In [151]:
grants['Location Tag'] = grants['University'].apply(getLocationTagFromUni)
grants.head()

Unnamed: 0_level_0,Institution,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,Start Date,End Date,Approved Amount,Location Tag
"﻿""Project Number""",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4,Faculté de Psychologie et des Sciences de l'Ed...,Université de Genève - GE,10104,Educational science and Pedagogy,"Human and Social Sciences;Psychology, educatio...",01.10.1975,30.09.1976,41022.0,GE
5,Kommission für das Corpus philosophorum medii ...,"NPO (Biblioth., Museen, Verwalt.) - NPO",10101,Philosophy,Human and Social Sciences;Linguistics and lite...,01.03.1976,28.02.1985,79732.0,NPO
6,Abt. Handschriften und Alte Drucke Bibliothek ...,Universität Basel - BS,10302,Swiss history,Human and Social Sciences;Theology & religious...,01.10.1975,30.09.1976,52627.0,BS
7,Schweiz. Thesauruskommission,"NPO (Biblioth., Museen, Verwalt.) - NPO",10303,Ancient history and Classical studies,Human and Social Sciences;Theology & religious...,01.01.1976,30.04.1978,120042.0,NPO
8,"Séminaire de politique économique, d'économie ...",Université de Fribourg - FR,10203,Economics,"Human and Social Sciences;Economics, law",01.01.1976,31.12.1978,53009.0,FR


In [152]:
# Grouped by 'University' and the total funds each gets 
grants_grouped_by_uni = grants.groupby(['Location Tag', 'University'])
grants_grouped_by_uni = grants_grouped_by_uni.agg({'Approved Amount':'sum'}).reset_index()
grants_grouped_by_uni.shape

(77, 3)

Command line 'voodoo' magic to get the list of cantons from the json file 

In [153]:
cantons =! cat "ch-cantons.topojson.json" | grep "id\":" | cut -d\" -f4

In [154]:
len(cantons)

2

### Find location of universities

Use Google API to find the location corresponding to the university. This will only run once and save the results of the requests into a file.

In [155]:
google_key="0" #todo insert google api key
data = 'data/requests.txt'
url = "https://maps.googleapis.com/maps/api/place/textsearch/xml?key="+google_key+"&query="

def get_city(row):
    r = requests.get(url+row.University)
    with open(data, "a") as myfile:
        myfile.write(r.text)
                
if not os.path.isfile(data):
    with open(data, "a") as myfile:
        myfile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?><results>")
    grants_grouped_by_uni.apply(get_city, axis=1)
    with open(data, "a") as myfile:
        myfile.write("</results>")

Load the results for the google requests and filter out the postalcodes of the Universities location. This will later be used to find the canton.

In [156]:
results = BeautifulSoup(open(data), "xml")
frame = grants_grouped_by_uni.copy()
frame['postalcode'] = 'None'
i = 0
for response in results.find_all('PlaceSearchResponse'):
    result = response.find('result')
    if result != None:
        city = result.find('formatted_address').text
        if "Switzerland" in city:
            pc = str(city[city.index(',')+2:city.index(',')+6])
            if (pc.isdigit()):
                frame.set_value(i, 'postalcode', pc)
            else:
                frame.set_value(i, 'postalcode', city)
    i += 1
frame.head(10)

Unnamed: 0,Location Tag,University,Approved Amount,postalcode
0,AGS,Forschungsanstalten Agroscope - AGS,33115720.0,
1,AORI,AO Research Institute - AORI,3435621.0,
2,ASP,Pädag. Hochschule Tessin (Teilschule SUPSI) - ASP,159317.0,
3,ASPIT,Weitere Spitäler - ASPIT,10749810.0,
4,BE,Universität Bern - BE,1519373000.0,3012.0
5,BFH,Berner Fachhochschule - BFH,31028700.0,3401.0
6,BITG,Biotechnologie Institut Thurgau - BITG,2492535.0,8280.0
7,BS,Universität Basel - BS,1352251000.0,4003.0
8,CREALP,Centre de rech. sur l'environnement alpin - CR...,1567678.0,1950.0
9,CSEM,Swiss Center for Electronics and Microtech. - ...,18068250.0,


Show the universities that where not found

In [157]:
print(frame[frame.postalcode=='None'].shape)
frame[frame.postalcode=='None']

(20, 4)


Unnamed: 0,Location Tag,University,Approved Amount,postalcode
0,AGS,Forschungsanstalten Agroscope - AGS,33115720.0,
1,AORI,AO Research Institute - AORI,3435621.0,
2,ASP,Pädag. Hochschule Tessin (Teilschule SUPSI) - ASP,159317.0,
3,ASPIT,Weitere Spitäler - ASPIT,10749810.0,
9,CSEM,Swiss Center for Electronics and Microtech. - ...,18068250.0,
20,FIBL,Forschungsinstitut für biologischen Landbau - ...,7442410.0,
21,FINST,Weitere Institute - FINST,9256736.0,
23,FORS,Schweizer Kompetenzzentrum Sozialwissensch. - ...,34735820.0,
24,FP,Firmen/Privatwirtschaft - FP,111686700.0,
38,IRO,Forschungsinstitut für Opthalmologie - IRO,3478469.0,


For the universities that where found we will use the geonames api to find the canton based on the postalcode.

In [158]:
username = "juta" #todo add username
url = "http://api.geonames.org/postalCodeSearch?maxRows=1&username="+username+"&country=ch&postalcode="

def get_canton(row):
    if row.postalcode.isdigit():
        r = requests.get(url+row.postalcode)
        soup = BeautifulSoup(r.text, "xml")
        if soup.find('adminCode1'):
            return soup.find('adminCode1').text
        else:
            return 'None'
    else:
        return 'None'

frame['canton'] = frame.apply(get_canton, axis=1)
frame.head()

Unnamed: 0,Location Tag,University,Approved Amount,postalcode,canton
0,AGS,Forschungsanstalten Agroscope - AGS,33115720.0,,
1,AORI,AO Research Institute - AORI,3435621.0,,
2,ASP,Pädag. Hochschule Tessin (Teilschule SUPSI) - ASP,159317.0,,
3,ASPIT,Weitere Spitäler - ASPIT,10749810.0,,
4,BE,Universität Bern - BE,1519373000.0,3012.0,BE


In [159]:
df = frame.groupby(['canton'])
df = df.agg({'Approved Amount':'sum'}).reset_index()
df = df[df.canton != 'None']
df.to_csv("data/grants_per_canton.csv", sep=';')
df

Unnamed: 0,canton,Approved Amount
0,BE,1555048000.0
1,BL,3476142.0
2,BS,1392481000.0
3,FR,459073700.0
4,GE,11583220.0
5,GR,21004260.0
6,JU,34790350.0
7,LU,54673290.0
8,NE,383204600.0
10,SG,86757050.0


TODO:

* we got 77 location tags and 26 cantons
* so the non-canton locations need to be mapped to cantons
* identify the 'how'
* also split NPO