# Homework 3 -- Choropleth map of grants in Swiss universities

In [1]:
# imports and all that
import os.path
import random
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Data wrangling

In [2]:
# Load the grant export; the first CSV column becomes the index.
grants = pd.read_csv("data/P3_GrantExport.csv", sep=';', index_col=0)
# Only the last expression of a cell is displayed, so the former bare
# `grants.size` line had no effect; report (rows, columns) instead.
grants.shape

(48464, 8)

We'll reduce the dataset by removing the entries that are not in Switzerland (CH) and by dropping irrelevant columns. Foreign institutes don't have a valid entry in the `University` field.

These changes can safely be persisted to the file, as they will not influence the data we work with.

In [3]:
# Keep only rows with a usable University value: drop missing entries and the
# "Nicht zuteilbar - NA" placeholder.
has_university = grants['University'].notnull() & (grants['University'] != "Nicht zuteilbar - NA")
grants = grants[has_university]

# Drop the columns that are not needed for the map. errors='ignore' keeps the
# cell re-runnable when the columns are already gone.
unused_columns = ['Project Title', 'Project Title English', 'Responsible Applicant',
                  'Funding Instrument', 'Funding Instrument Hierarchy', 'Keywords']
grants = grants.drop(labels=unused_columns, axis=1, errors='ignore')

# The former bare `grants.dtypes` line was never displayed (only the last
# expression of a cell is), so it has been removed.
grants.shape

(48283, 8)

Some entries in the `Approved Amount` column are not numeric, so those rows are dropped.

In [4]:
# Coerce non-numeric amounts to NaN, then drop those rows.
# `assign` builds a new frame instead of writing a column into a frame that
# came out of boolean indexing, which can trigger pandas'
# SettingWithCopyWarning (notably when this cell is re-run).
grants = grants.assign(**{
    'Approved Amount': pd.to_numeric(grants['Approved Amount'], errors='coerce')
})
grants = grants.dropna(subset=['Approved Amount'])
grants.shape

(48283, 8)

In [None]:
# Run if you want to update the csv
#grants.to_csv("data/P3_GrantExport.csv", sep=';') # keep the same sep

In [5]:
# Total approved funding per university, as a flat two-column frame
# (University, Approved Amount).
grants_grouped_by_uni = (
    grants
    .groupby('University')['Approved Amount']
    .sum()
    .reset_index()
)
grants_grouped_by_uni.shape

(76, 2)

### Find location of universities

Use the Google Places API to find the location corresponding to each university. The requests are only sent once; their results are saved to a file.

In [17]:
# Google Places API key: taken from the GOOGLE_API_KEY environment variable
# (the same variable the institutes section reads) so the key never has to be
# written into the notebook. Falls back to the former "0" placeholder when the
# variable is not set.
google_key = os.environ.get("GOOGLE_API_KEY", "0")
# Cache file that accumulates the raw XML responses.
data = 'data/requests.txt'
# Text-search endpoint; the search term is appended after "query=".
url = "https://maps.googleapis.com/maps/api/place/textsearch/xml?key="+google_key+"&limit=1&query="

def get_city(row):
    """Search the Places API for ``row.University`` and cache the raw response.

    The XML text of the response is appended to the file named by the
    module-level ``data``; the request URL is built from the module-level
    ``url``. Returns None.
    """
    # Percent-encode the name before appending it: characters such as '&',
    # '#' or '+' would otherwise be interpreted as part of the query string.
    r = requests.get(url + quote_plus(row.University))
    with open(data, "a") as myfile:
        myfile.write(r.text)
                
# Download only once: the existence of the cache file is the "already fetched"
# marker.
if not os.path.isfile(data):
    try:
        # Wrap all responses in a single <results> root element.
        with open(data, "a") as myfile:
            myfile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?><results>")
        grants_grouped_by_uni.apply(get_city, axis=1)
        with open(data, "a") as myfile:
            myfile.write("</results>")
    except BaseException:
        # An interrupted or failed run would leave a truncated file that the
        # guard above treats as a complete cache on the next run. Remove it so
        # the download starts over, then re-raise the original error.
        if os.path.isfile(data):
            os.remove(data)
        raise

Load the results of the Google requests and extract the postal code of each university's location. This will later be used to find the canton.

In [None]:
# Parse the cached responses. The context manager closes the file handle that
# a bare open() call would leave open.
with open(data) as cache_file:
    results = BeautifulSoup(cache_file, "xml")

frame = grants_grouped_by_uni.copy()
# The string 'None' is the sentinel for "no Swiss address found"; later cells
# filter on it.
frame['postalcode'] = 'None'

# Responses are matched to rows purely by position: the i-th response in the
# file is written to index label i of `frame`.
for i, response in enumerate(results.find_all('PlaceSearchResponse')):
    result = response.find('result')
    if result is None:
        continue
    city = result.find('formatted_address').text
    if "Switzerland" not in city:
        continue
    # Take the four characters that follow the first ", " as the postal code.
    # partition() yields an empty remainder when the address has no comma,
    # where str.index() would raise ValueError.
    pc = city.partition(',')[2][1:5]
    # DataFrame.set_value is gone from current pandas; .at is the label-based
    # scalar setter. Non-numeric candidates fall back to the full address.
    frame.at[i, 'postalcode'] = pc if pc.isdigit() else city
frame.head(10)

Show the universities that were not found

In [None]:
# Rows still carrying the 'None' sentinel, i.e. no Swiss address was matched.
print(frame.loc[frame['postalcode'] == 'None'].shape)
frame.loc[frame['postalcode'] == 'None']

For the universities that were found we will use the geonames api to find the canton based on the postalcode.

In [None]:
# GeoNames account name: read from the GEONAMES_USERNAME environment variable
# when it is set, so a personal account does not have to be written into the
# notebook; otherwise the previous value is used.
username = os.environ.get("GEONAMES_USERNAME", "juta")
# NOTE: this rebinds the module-level `url` that the Google lookup used earlier.
url = "http://api.geonames.org/postalCodeSearch?maxRows=1&username="+username+"&country=ch&postalcode="

def get_canton(row):
    """Return the ``adminCode1`` text GeoNames reports for ``row.postalcode``.

    The string 'None' is returned when the postal code is not purely numeric
    or when the response contains no ``adminCode1`` element. The request URL
    is built from the module-level ``url``.
    """
    postal_code = row.postalcode
    # Non-numeric values (the 'None' sentinel or a full address) are not looked up.
    if not postal_code.isdigit():
        return 'None'

    response = requests.get(url + postal_code)
    admin_code = BeautifulSoup(response.text, "xml").find('adminCode1')
    return admin_code.text if admin_code else 'None'

# Resolve the canton row by row; get_canton only sends a request for rows
# whose postal code is numeric.
frame['canton'] = frame.apply(get_canton, axis='columns')
frame.head(5)

In [None]:
# Sum the approved amounts per canton, discard the 'None' bucket (rows without
# a resolved canton) and persist the result.
df = (
    frame
    .groupby('canton')['Approved Amount']
    .sum()
    .reset_index()
)
df = df.loc[df['canton'] != 'None']
df.to_csv("data/grants_per_canton.csv", sep=';')
df

### Institutes only
These are in Switzerland (mostly), but their University is `NPO` (presumably **N**on-**P**rofit **O**rganisation).

In [9]:
# Rows whose University label starts with "NPO", aggregated per institution.
institutes = grants[grants.University.str.startswith("NPO")]
institutes = institutes.groupby('Institution').agg({'Approved Amount':'sum'}).reset_index()
# The former `institutes['canton'] = institutes.apply()` raised a TypeError
# because DataFrame.apply requires a function argument.
# TODO: add the canton column once the institute lookups below are available.
institutes.head()

Unnamed: 0,Institution,Approved Amount
0,AGGS Staatsarchiv,663264.0
1,AGRIDEA,911248.0
2,Aargauer Kantonsbibliothek,5553504.0
3,Aargauer Kantonsschule Baden,12380.0
4,Abteilung Biochemie Biozentrum Universität Basel,1000122.0


In [16]:
# Spot check: institutions whose name contains "Arche".
institutes.loc[institutes['Institution'].str.contains("Arche")]

Unnamed: 0,Institution,Approved Amount
23,Arche Verlag AG,192273.0


In [45]:
# `os` and `random` are imported in the notebook's first cell, together with
# every other import, instead of in the middle of the notebook.

# The API key must be provided via the environment; a missing variable raises
# KeyError.
google_key = os.environ["GOOGLE_API_KEY"]
# Separate cache file for the institute lookups.
# NOTE: `data` and `url` rebind the names used by the university lookup.
data = 'data/institutes.txt'
url = "https://maps.googleapis.com/maps/api/place/textsearch/xml?key="+google_key+"&limit=1&query="

In [47]:
# Four-row sample (positions 31 to 34) used to try out the lookup.
inst2 = institutes.iloc[31:35, :]
inst2

Unnamed: 0,Institution,Approved Amount
31,Archivio di Stato del Canton Ticino,2469847.0
32,Archäologie Schweiz,68200.0
33,Archäologie Schweizerisches Landesmuseum,520594.0
34,Archäologie und Kantonsmuseum Kanton Basel-Lan...,725233.0


In [46]:
def get_city(row):
    """Search the Places API for ``row.Institution`` and cache the raw response.

    The XML text of the response is appended to the file named by the
    module-level ``data``; the request URL is built from the module-level
    ``url``. Returns None.

    Note: this redefines the earlier ``get_city``, which reads
    ``row.University`` instead.
    """
    # Pause 0.7 to 1.5 s before each request (presumably to stay under the API
    # rate limit). `time` comes from the notebook's import cell; it was
    # previously never imported, so this line raised NameError.
    time.sleep(random.uniform(0.7, 1.5))
    # Percent-encode the name before appending it: characters such as '&',
    # '#' or '+' would otherwise be interpreted as part of the query string.
    r = requests.get(url + quote_plus(row.Institution))
    with open(data, "a") as myfile:
        myfile.write(r.text)

In [44]:
                
# Fetch only once: the existence of the cache file marks the lookups as done.
# With this guard commented out, every run appended further responses and
# another "</results>" to the same file.
if not os.path.isfile(data):
    print("Creating new file %s" % data)
    # Open the <results> root element; without it the closing tag written
    # below had no matching opening tag.
    with open(data, "a") as myfile:
        myfile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?><results>")
    inst2.apply(get_city, axis=1)
    with open(data, "a") as myfile:
        myfile.write("</results>")
else:
    print("File already exists %s" % data)