In [1]:
# imports (may be more than needed)
import pandas as pd
import numpy as np
import glob # to find all files in folder
from datetime import datetime
from datetime import date, time
from dateutil.parser import parse
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show
import itertools
import requests
from bs4 import BeautifulSoup
import json

%matplotlib inline
sns.set_context('notebook')
pd.options.mode.chained_assignment = None  # default='warn'


In [2]:
data = pd.read_csv('data/GrantExport.csv', delimiter=';')
data.dtypes

﻿"Project Number"                int64
Project Title                   object
Project Title English           object
Responsible Applicant           object
Funding Instrument              object
Funding Instrument Hierarchy    object
Institution                     object
University                      object
Discipline Number                int64
Discipline Name                 object
Discipline Name Hierarchy       object
Start Date                      object
End Date                        object
Approved Amount                 object
Keywords                        object
dtype: object

In [3]:
data.head()

Unnamed: 0,"﻿""Project Number""",Project Title,Project Title English,Responsible Applicant,Funding Instrument,Funding Instrument Hierarchy,Institution,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,Start Date,End Date,Approved Amount,Keywords
0,1,Schlussband (Bd. VI) der Jacob Burckhardt-Biog...,,Kaegi Werner,Project funding (Div. I-III),Project funding,,Nicht zuteilbar - NA,10302,Swiss history,Human and Social Sciences;Theology & religious...,01.10.1975,30.09.1976,11619.0,
1,4,Batterie de tests à l'usage des enseignants po...,,Massarenti Léonard,Project funding (Div. I-III),Project funding,Faculté de Psychologie et des Sciences de l'Ed...,Université de Genève - GE,10104,Educational science and Pedagogy,"Human and Social Sciences;Psychology, educatio...",01.10.1975,30.09.1976,41022.0,
2,5,"Kritische Erstausgabe der ""Evidentiae contra D...",,Kommission für das Corpus philosophorum medii ...,Project funding (Div. I-III),Project funding,Kommission für das Corpus philosophorum medii ...,"NPO (Biblioth., Museen, Verwalt.) - NPO",10101,Philosophy,Human and Social Sciences;Linguistics and lite...,01.03.1976,28.02.1985,79732.0,
3,6,Katalog der datierten Handschriften in der Sch...,,Burckhardt Max,Project funding (Div. I-III),Project funding,Abt. Handschriften und Alte Drucke Bibliothek ...,Universität Basel - BS,10302,Swiss history,Human and Social Sciences;Theology & religious...,01.10.1975,30.09.1976,52627.0,
4,7,Wissenschaftliche Mitarbeit am Thesaurus Lingu...,,Schweiz. Thesauruskommission,Project funding (Div. I-III),Project funding,Schweiz. Thesauruskommission,"NPO (Biblioth., Museen, Verwalt.) - NPO",10303,Ancient history and Classical studies,Human and Social Sciences;Theology & religious...,01.01.1976,30.04.1978,120042.0,


In [4]:
# take only the relevant cols and give them nicer names
grants = data[['University', 'Approved Amount']]
grants.rename(columns={'University': 'university', 'Approved Amount' : 'amount'}, inplace=True)
grants.dtypes

university    object
amount        object
dtype: object

In [5]:
# drop nulls
len(grants)

63969

In [6]:
grants = grants.replace(to_replace='Nicht zuteilbar - NA', value=np.nan)
grants = grants.dropna()
len(grants)

48393

In [7]:
grants['amount'] = pd.to_numeric(grants.amount, errors='coerce')
grants.dtypes


university     object
amount        float64
dtype: object

In [8]:
len(grants.university.unique())

76

In [9]:
grants.university.unique()

array(['Université de Genève - GE',
       'NPO (Biblioth., Museen, Verwalt.) - NPO', 'Universität Basel - BS',
       'Université de Fribourg - FR', 'Universität Zürich - ZH',
       'Université de Lausanne - LA', 'Universität Bern - BE',
       'Eidg. Forschungsanstalt für Wald,Schnee,Land - WSL',
       'Université de Neuchâtel - NE', 'ETH Zürich - ETHZ',
       'Inst. de Hautes Etudes Internat. et du Dév - IHEID',
       'Universität St. Gallen - SG', 'Weitere Institute - FINST',
       'Firmen/Privatwirtschaft - FP',
       'Pädagogische Hochschule Graubünden - PHGR', 'EPF Lausanne - EPFL',
       'Pädagogische Hochschule Zürich - PHZFH', 'Universität Luzern - LU',
       'Schweiz. Institut für Kunstwissenschaft - SIK-ISEA',
       'SUP della Svizzera italiana - SUPSI',
       'HES de Suisse occidentale - HES-SO',
       'Robert Walser-Stiftung Bern - RWS', 'Paul Scherrer Institut - PSI',
       'Pädagogische Hochschule St. Gallen - PHSG',
       'Eidg. Anstalt für Wasserversorgun

In [10]:
# make the universities data frame
universities_array = grants.university.unique()
universities = pd.DataFrame(universities_array, columns=['full_name'])

universities['university_name'] = [fn.split('-')[0].strip() for fn in universities['full_name'].values]
universities['abbrev'] = [fn.split('-')[1].strip() if len(fn.split('-')) > 1 else np.nan for fn in universities['full_name'].values]
universities.head()

Unnamed: 0,full_name,university_name,abbrev
0,Université de Genève - GE,Université de Genève,GE
1,"NPO (Biblioth., Museen, Verwalt.) - NPO","NPO (Biblioth., Museen, Verwalt.)",NPO
2,Universität Basel - BS,Universität Basel,BS
3,Université de Fribourg - FR,Université de Fribourg,FR
4,Universität Zürich - ZH,Universität Zürich,ZH


In [11]:
# try the geonames REST service
def find_canton_geonames(name):    
    url_search = 'http://api.geonames.org/searchJSON'
    params = {
        'name': name,
        'country': 'CH',
        'username': 'ada_account',
        'formatted': 'true',
        'type': 'json',
        'style': 'FULL'
    }

    r = requests.get(url_search, params=params)
    answer = json.loads(r.text)
    ret = np.nan
    if answer['totalResultsCount'] != 0:
        #take the first since they are ordered by relevance
        ret = answer['geonames'][0]['adminCode1']
    print('geoname --> '+str(ret))
    return ret
    
    
    

In [12]:
cantons = {
    'AG': ['Aargau'],
    'AR': ['Appenzell Ausserrhoden'], 
    'AI': ['Appenzell Innerrhoden'], 
    'BL': ['Basel-Land', 'Basel Land'], 
    'BS': ['Basel-Stadt', 'Basel Stadt'], 
    'BE': ['Bern'], 
    'FR': ['Fribourg', 'Freiburg'] ,  
    'GE': ['Genève', 'Genf'], 
    'GL': ['Glarus'], 
    'GR': ['Graubünden', 'Grischuns', 'Grigioni'],  
    'JU': ['Jura'],  
    'LU': ['Luzern'], 
    'NE': ['Neuchâtel', 'Neuenburg'], 
    'NW': ['Nidwalden'], 
    'OW': ['Obwalden'], 
    'SG': ['St.Gallen', 'St. Gallen'], 
    'SH': ['Schaffhausen'], 
    'SZ': ['Schwyz'], 
    'SO': ['Solothurn'], 
    'TG': ['Thurgau'], 
    'TI': ['Ticino', 'Tessin'], 
    'UR': ['Uri'], 
    'VD': ['Vaud', 'Waadt'], 
    'VS': ['Valais', 'Wallis'], 
    'ZG': ['Zug'],
    'ZH': ['Zürich']
    }

In [13]:
# Check if the uni name contains the canton name -> big chance that the uni is in that canton
def find_canton_substring(name):
    for (canton_abbrev, canton_names) in cantons.items():
        for cn in canton_names:
            if cn in name:
                print('substring --> '+str(canton_abbrev))
                return canton_abbrev
    print('substring --> '+str(np.nan))
    return np.nan

In [14]:
# combine the two methods
def find_canton(name):
    print('***********************************')
    print('searching for: "'+name+'"')
    geonames = find_canton_geonames(name)
    substr = find_canton_substring(name)
    if pd.isnull(geonames):
        return substr
    elif pd.isnull(substr):
        return geonames
    else:
        #both are non null
        return geonames # I trust geonames more than the substr method. Is this OK?
    

In [15]:
# create a new column with the canton in it.
universities['canton'] = universities.apply(lambda uni: find_canton(uni.university_name), axis=1)
len(universities['canton']) - sum(pd.isnull(universities['canton']))

***********************************
searching for: "Université de Genève"
geoname --> nan
substring --> GE
***********************************
searching for: "NPO (Biblioth., Museen, Verwalt.)"
geoname --> nan
substring --> nan
***********************************
searching for: "Universität Basel"
geoname --> BS
substring --> nan
***********************************
searching for: "Université de Fribourg"
geoname --> nan
substring --> FR
***********************************
searching for: "Universität Zürich"
geoname --> ZH
substring --> ZH
***********************************
searching for: "Université de Lausanne"
geoname --> VD
substring --> nan
***********************************
searching for: "Universität Bern"
geoname --> BE
substring --> BE
***********************************
searching for: "Eidg. Forschungsanstalt für Wald,Schnee,Land"
geoname --> nan
substring --> nan
***********************************
searching for: "Université de Neuchâtel"
geoname --> nan
substring --> NE
**

27

In [16]:
universities[pd.isnull(universities['canton'])]

Unnamed: 0,full_name,university_name,abbrev,canton
1,"NPO (Biblioth., Museen, Verwalt.) - NPO","NPO (Biblioth., Museen, Verwalt.)",NPO,
7,"Eidg. Forschungsanstalt für Wald,Schnee,Land -...","Eidg. Forschungsanstalt für Wald,Schnee,Land",WSL,
10,Inst. de Hautes Etudes Internat. et du Dév - I...,Inst. de Hautes Etudes Internat. et du Dév,IHEID,
12,Weitere Institute - FINST,Weitere Institute,FINST,
13,Firmen/Privatwirtschaft - FP,Firmen/Privatwirtschaft,FP,
15,EPF Lausanne - EPFL,EPF Lausanne,EPFL,
18,Schweiz. Institut für Kunstwissenschaft - SIK-...,Schweiz. Institut für Kunstwissenschaft,SIK,
19,SUP della Svizzera italiana - SUPSI,SUP della Svizzera italiana,SUPSI,
20,HES de Suisse occidentale - HES-SO,HES de Suisse occidentale,HES,
21,Robert Walser-Stiftung Bern - RWS,Robert Walser,Stiftung Bern,
