In [1]:
# imports (may be more than needed)
import pandas as pd
import numpy as np
import glob # to find all files in folder
from datetime import datetime
from datetime import date, time
from dateutil.parser import parse
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show
import itertools
import requests
from bs4 import BeautifulSoup
import json

%matplotlib inline
sns.set_context('notebook')
pd.options.mode.chained_assignment = None  # default='warn'


In [2]:
data = pd.read_csv('data/GrantExport.csv', delimiter=';')
data.dtypes

﻿"Project Number"                int64
Project Title                   object
Project Title English           object
Responsible Applicant           object
Funding Instrument              object
Funding Instrument Hierarchy    object
Institution                     object
University                      object
Discipline Number                int64
Discipline Name                 object
Discipline Name Hierarchy       object
Start Date                      object
End Date                        object
Approved Amount                 object
Keywords                        object
dtype: object

In [3]:
data.head()

Unnamed: 0,"﻿""Project Number""",Project Title,Project Title English,Responsible Applicant,Funding Instrument,Funding Instrument Hierarchy,Institution,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,Start Date,End Date,Approved Amount,Keywords
0,1,Schlussband (Bd. VI) der Jacob Burckhardt-Biog...,,Kaegi Werner,Project funding (Div. I-III),Project funding,,Nicht zuteilbar - NA,10302,Swiss history,Human and Social Sciences;Theology & religious...,01.10.1975,30.09.1976,11619.0,
1,4,Batterie de tests à l'usage des enseignants po...,,Massarenti Léonard,Project funding (Div. I-III),Project funding,Faculté de Psychologie et des Sciences de l'Ed...,Université de Genève - GE,10104,Educational science and Pedagogy,"Human and Social Sciences;Psychology, educatio...",01.10.1975,30.09.1976,41022.0,
2,5,"Kritische Erstausgabe der ""Evidentiae contra D...",,Kommission für das Corpus philosophorum medii ...,Project funding (Div. I-III),Project funding,Kommission für das Corpus philosophorum medii ...,"NPO (Biblioth., Museen, Verwalt.) - NPO",10101,Philosophy,Human and Social Sciences;Linguistics and lite...,01.03.1976,28.02.1985,79732.0,
3,6,Katalog der datierten Handschriften in der Sch...,,Burckhardt Max,Project funding (Div. I-III),Project funding,Abt. Handschriften und Alte Drucke Bibliothek ...,Universität Basel - BS,10302,Swiss history,Human and Social Sciences;Theology & religious...,01.10.1975,30.09.1976,52627.0,
4,7,Wissenschaftliche Mitarbeit am Thesaurus Lingu...,,Schweiz. Thesauruskommission,Project funding (Div. I-III),Project funding,Schweiz. Thesauruskommission,"NPO (Biblioth., Museen, Verwalt.) - NPO",10303,Ancient history and Classical studies,Human and Social Sciences;Theology & religious...,01.01.1976,30.04.1978,120042.0,


In [4]:
# take only the relevant cols and give them nicer names
grants = data[['University', 'Approved Amount']]
grants.rename(columns={'University': 'university', 'Approved Amount' : 'amount'}, inplace=True)
grants.dtypes

university    object
amount        object
dtype: object

In [5]:
# drop nulls
len(grants)

63969

In [6]:
grants = grants.replace(to_replace='Nicht zuteilbar - NA', value=np.nan)
grants = grants.dropna()
len(grants)

48393

In [7]:
grants['amount'] = pd.to_numeric(grants.amount, errors='coerce')
grants.dtypes


university     object
amount        float64
dtype: object

In [8]:
len(grants.university.unique())

76

In [9]:
# make the universities data frame
universities_array = grants.university.unique()
universities = pd.DataFrame(universities_array, columns=['full_name'])

delim = ' - '

universities['university_name'] = [fn.split(delim)[0].strip() for fn in universities['full_name'].values]
universities['abbrev'] = [fn.split(delim)[1].strip() if len(fn.split(delim)) > 1 else np.nan for fn in universities['full_name'].values]
universities

Unnamed: 0,full_name,university_name,abbrev
0,Université de Genève - GE,Université de Genève,GE
1,"NPO (Biblioth., Museen, Verwalt.) - NPO","NPO (Biblioth., Museen, Verwalt.)",NPO
2,Universität Basel - BS,Universität Basel,BS
3,Université de Fribourg - FR,Université de Fribourg,FR
4,Universität Zürich - ZH,Universität Zürich,ZH
5,Université de Lausanne - LA,Université de Lausanne,LA
6,Universität Bern - BE,Universität Bern,BE
7,"Eidg. Forschungsanstalt für Wald,Schnee,Land -...","Eidg. Forschungsanstalt für Wald,Schnee,Land",WSL
8,Université de Neuchâtel - NE,Université de Neuchâtel,NE
9,ETH Zürich - ETHZ,ETH Zürich,ETHZ


In [10]:
# run the python file that defines some functions
%run map_universities.py

In [11]:
# create a new column with the canton in it.
universities['canton'] = universities.apply(lambda uni: canton_for_university(uni.university_name), axis=1)
len(universities['canton']) - sum(pd.isnull(universities['canton']))

10

In [12]:
universities[pd.isnull(universities['canton'])]

Unnamed: 0,full_name,university_name,abbrev,canton
1,"NPO (Biblioth., Museen, Verwalt.) - NPO","NPO (Biblioth., Museen, Verwalt.)",NPO,
7,"Eidg. Forschungsanstalt für Wald,Schnee,Land -...","Eidg. Forschungsanstalt für Wald,Schnee,Land",WSL,
11,Universität St. Gallen - SG,Universität St. Gallen,SG,
12,Weitere Institute - FINST,Weitere Institute,FINST,
13,Firmen/Privatwirtschaft - FP,Firmen/Privatwirtschaft,FP,
14,Pädagogische Hochschule Graubünden - PHGR,Pädagogische Hochschule Graubünden,PHGR,
15,EPF Lausanne - EPFL,EPF Lausanne,EPFL,
16,Pädagogische Hochschule Zürich - PHZFH,Pädagogische Hochschule Zürich,PHZFH,
18,Schweiz. Institut für Kunstwissenschaft - SIK-...,Schweiz. Institut für Kunstwissenschaft,SIK-ISEA,
19,SUP della Svizzera italiana - SUPSI,SUP della Svizzera italiana,SUPSI,


In [14]:
#uni_canton_df = pd.DataFrame.from_dict(uni_canton_mapping, orient='index')
#uni_canton_df.rename(index=str, columns={0: 'canton'}, inplace=True)
#uni_canton_df

### map by hand (using google & wikipedia):
- Schweizer Kompetenzzentrum Sozialwissensch. -> lausanne -> VD
- Weitere Institute -> translates to 'other institutes' -> nan
- Forschungsanstalten Agroscope -> not in one place -> nan
- Haute école pédagogique BE, JU, NE -> situated in JU but belongs to BE, JU & NE -> JU or 1/3 for each?
- Swiss Institute of Bioinformatics -> all over the place -> nan
- Firmen/Privatwirtschaft -> similar to 'other institutions' -> nan
- Forschungsinstitut für Opthalmologie -> in Sitten -> VS
- Eidg. Forschungsanstalt für Wald,Schnee,Land -> all over the place -> nan
- Istituto Svizzero di Roma -> in ROM (italy) -> nan
- Pädag. Hochschule Tessin (Teilschule SUPSI) -> TI
- Pädagogische Hochschule Nordwestschweiz -> office in Windisch -> AG
- Physikal.-Meteorolog. Observatorium Davos -> GR
- Instituto Ricerche Solari Locarno -> TI
- Staatsunabh. Theologische Hochschule Basel -> BS
- Fachhochschule Nordwestschweiz (ohne PH) -> same as 'Pädagogische Hochschule Nordwestschweiz' -> AG
- Forschungskommission SAGW -> does not exist anymore -> nan
- NPO (Biblioth., Museen, Verwalt.) -> several institutions -> nan
- Swiss Center for Electronics and Microtech. -> Neuchâtel -> NE
- Eidg. Material und Prüfungsanstalt -> all over the place (BE, ZH, SG) -> nan
- Weitere Spitäler -> several hospitals -> nan