# Homework 3 -- Cloropleth map of grants in Swiss universities

In [1]:
# imports and all that
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os.path
import os, random
import time

### Data wrangling

In [2]:
grants = pd.read_csv("data/P3_GrantExport.csv", sep=';', index_col=0)
grants.size
grants.shape

(48464, 8)

We'll try to reduce the dataset by removing the entries that are not in CH and irrelevant columns. The foreign institutes don't have a valid entry in `University` field

These changes can be safely persisted in the file, as it will not influence the data we work with.

In [3]:
grants = grants[(grants.University.notnull()) & (grants.University != "Nicht zuteilbar - NA")]
grants = grants.drop(axis=1, errors='ignore', labels=['Project Title', 'Project Title English',
            'Responsible Applicant', 'Funding Instrument', 'Funding Instrument Hierarchy', 'Keywords'])

grants.dtypes
grants.shape

(48283, 8)

Some entries in the Approved Amount column are not numeric so they are being dropped

In [4]:
grants['Approved Amount'] = pd.to_numeric(grants['Approved Amount'], errors='coerce')
grants = grants[grants['Approved Amount'].notnull()]
grants.shape

(48283, 8)

In [5]:
# Run if you want to update the csv
#grants.to_csv("data/P3_GrantExport.csv", sep=';') # keep the same sep

We will group the results on university and combinate those with the NPO category that cannot be grouped on university.
This will allow us to llopu either the university or the institute to find a location.

In [6]:
import re
def getLocationTagFromUni(s):
    return re.findall(r'\S+', s)[-1]

In [7]:
grants['Location Tag'] = grants['University'].apply(getLocationTagFromUni)
universities = grants.groupby(['Location Tag', 'University'])
universities = universities.agg({'Approved Amount':'sum'}).reset_index()
universities = universities[~universities.University.str.startswith("NPO")]
universities.columns = ['Location Tag', 'Institution', 'Approved Amount']
universities.shape

(75, 3)

In [8]:
institutes = grants[grants.University.str.startswith("NPO")]
institutes = institutes.groupby('Institution').agg({'Approved Amount':'sum'}).reset_index()
institutes.shape

(450, 2)

Combine the two dataframes

In [9]:
frame = pd.concat([universities, institutes])

### Find location of institutes

Use Google API to find the location corresponding to the university. This will only run once and save the results of the requests into a file.

In [10]:
google_key="0" #todo insert google api key
data = 'data/requests.txt'
url = "https://maps.googleapis.com/maps/api/place/textsearch/xml?key="+google_key+"&query="

def get_city(row):
    r = requests.get(url+row.Institution)
    with open(data, "a") as myfile:
        myfile.write(r.text)
                
if not os.path.isfile(data): 
    print("Creating file %s" % data)
    with open(data, "a") as myfile:
        myfile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?><results>")
    frame.apply(get_city, axis=1)
    with open(data, "a") as myfile:
        myfile.write("</results>")
else:
    print("File already exists %s" %data)

File already exists data/requests.txt


Load the results for the google requests and filter out the postalcodes of the Universities location. This will later be used to find the canton.

In [18]:
results = BeautifulSoup(open(data, encoding="latin_1"), "xml")
frame['postalcode'] = 'None'
i = 0
for response in results.find_all('PlaceSearchResponse'):
    if i > 75:
        print(frame.iloc[i].Institution)
        print(response)
    result = response.find('result')
    if result != None:
        city = result.find('formatted_address').text
        if "Switzerland" in city:
            pc = str(city[city.index(',')+2:city.index(',')+6])
            if (pc.isdigit()):
                frame.set_value(i, 'postalcode', pc)
            else:
                frame.set_value(i, 'postalcode', city)
    i += 1
frame.head()

AGRIDEA
<PlaceSearchResponse>
<status>ZERO_RESULTS</status>
</PlaceSearchResponse>
Aargauer Kantonsbibliothek
<PlaceSearchResponse>
<status>OK</status>
<result>
<name>AGRIDEA</name>
<type>point_of_interest</type>
<type>establishment</type>
<formatted_address>Eschikon 28, 8315 Lindau, Switzerland</formatted_address>
<geometry>
<location>
<lat>47.4479074</lat>
<lng>8.6829233</lng>
</location>
<viewport>
<southwest>
<lat>47.4475511</lat>
<lng>8.6824247</lng>
</southwest>
<northeast>
<lat>47.4489763</lat>
<lng>8.6830895</lng>
</northeast>
</viewport>
</geometry>
<icon>https://maps.gstatic.com/mapfiles/place_api/icons/generic_business-71.png</icon>
<reference>CmRSAAAA3iRwnt1u3FZfb3G-TbLe2dI7NBdNkSLsCGmXpVxT_qEUvkfBYIDLVhe5qnYQpV0zHK5hkLxXmoOnEDV2x_vnOt-tUJY1WuYdK1g-djtAKsjWkmWZC_Wgb5qwwkIXhOwPEhD2axgGkK9s5Ehj9BNOAIDkGhTAXX7-i84aK_eFgAoNVoHuevvCrQ</reference>
<id>5f7b7dec370addb43220cccdf4d7d0995a3c04f6</id>
<opening_hours>
<open_now>false</open_now>
</opening_hours>
<photo>
<photo_reference

Unnamed: 0.1,Unnamed: 0,Approved Amount,Institution,Location Tag,postalcode,canton
0,0,33115720.0,Forschungsanstalten Agroscope - AGS,AGS,,
1,1,3435621.0,AO Research Institute - AORI,AORI,,
2,2,159317.0,Pädag. Hochschule Tessin (Teilschule SUPSI) - ASP,ASP,,
3,3,10749810.0,Weitere Spitäler - ASPIT,ASPIT,,
4,4,1519373000.0,Universität Bern - BE,BE,3012.0,BE


For the universities that were found we will use the geonames api to find the canton based on the postalcode.

In [15]:
username = "juta" #todo add username
url = "http://api.geonames.org/postalCodeSearch?maxRows=1&username="+username+"&country=ch&postalcode="

def get_canton(row):
    if row.postalcode.isdigit():
        r = requests.get(url+row.postalcode)
        soup = BeautifulSoup(r.text, "xml")
        if soup.find('adminCode1'):
            return soup.find('adminCode1').text
        else:
            return 'None'
    else:
        return 'None'

frame['canton'] = frame.apply(get_canton, axis=1)
frame.head(10)

Unnamed: 0.1,Unnamed: 0,Approved Amount,Institution,Location Tag,postalcode,canton
75,0,663264.0,AGGS Staatsarchiv,,,
76,1,911248.0,AGRIDEA,,,
77,2,5553504.0,Aargauer Kantonsbibliothek,,,
78,3,12380.0,Aargauer Kantonsschule Baden,,,
79,4,1000122.0,Abteilung Biochemie Biozentrum Universität Basel,,3012.0,BE
80,5,230218.0,Abteilung Geowissenschaften Naturhistorisches ...,,3401.0,BE
81,6,188000.0,Abteilung Mikrobiologie Biozentrum Universität...,,8280.0,TG
82,7,180269.0,Abwasserverband Altenrhein,,4003.0,BS
83,8,504497.0,Addiction Info Suisse,,1950.0,VS
84,9,4000.0,Agrogen-Stiftung,,,


We save the information found to a file in order to manually add some missing locations

In [13]:
universities_data = "data/universities.csv"
if not os.path.isfile(universities_data):
    frame.to_csv(universities_data, sep=',', encoding='latin_1')
frame = pd.read_csv(universities_data, sep=',', encoding='latin_1')

In [14]:
df = frame.groupby(['canton'])
df = df.agg({'Approved Amount':'sum'}).reset_index()
df = df[df.canton != 'None']
df.to_csv("data/grants_per_canton.csv", sep=';')
df

Unnamed: 0,canton,Approved Amount
0,AG,1197269.0
1,BE,1579795000.0
2,BL,681613.0
3,BS,1399702000.0
4,FR,460107100.0
5,GE,13394950.0
6,GR,18031260.0
7,JU,34873550.0
8,LU,57114230.0
9,NE,301722.0
