# Neuromap Project Neurohackademy 2019

In [1]:
# install libraries/ set up script
import pandas as pd
import numpy as np
import folium
import geopy
from geopy.geocoders import Nominatim
# Shared Nominatim geocoder client, identified by the "neuromap" user agent;
# the geocoding cells below call geolocator.geocode(...) on it.
geolocator = Nominatim(user_agent="neuromap")

In [2]:
# Widen pandas' display limits so whole frames are visible when inspected
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
# Cell values that should be read as missing data: a two-space blank or a lone dash
missing_values = ["  ", "-"]

# Load the survey responses; the file is decoded as latin-1
df = pd.read_csv(
    'NeuroMap-38responses.csv',
    encoding='latin-1',
    na_values=missing_values,
)

# Uncomment to preview the data
#df.head(20)

In [4]:
# Clean data
# Replace the long survey-question headers with short column names (assigned by position)
df.columns = [
    'timestamp', 'consent', 'name', 'birthplace', 'birth_year', 'gender',
    'undergrad_deg', 'undergrad_loc', 'undergrad_inst', 'undergrad_research',
    'ra_qual', 'ra_lm_loc', 'ra_lm_inst', 'ra_lm_research',
    'masters_qual', 'masters_loc', 'masters_inst', 'masters_research',
    'phd_qual', 'phd_loc', 'phd_inst', 'phd_research',
    'post_doc_qual', 'postdoc_loc', 'postdoc_inst', 'postdoc_research',
    'faculty_qual', 'faculty_loc', 'faculty_inst', 'faculty_research',
    'google_scholar',
]

# change white spaces (blanks) to NaN
#df.replace(r'^\s+$', np.nan, regex=True)

# Hand-written corrections for free-text answers, one mapping per location column:
# each key is the exact answer as it appears in the data, each value its replacement.
replace_dict_undergrad = {
    "Tijuana, Baja California, MÌ©xico": "Tijuana, Baja California, Mexico",
    'Raleigh & Chapel Hill, North Carolina, USA': 'Chapel Hill, North Carolina, USA',
}
replace_dict_research = {
    'Central Institute of Chemistry and Mechanics': 'Nagatinskaya, Moscow, Russia',
}
replace_dict_phd = {
    'Rio de Janeiro, Brazil AND Montreal, Canada': 'Rio de Janeiro, Brazil',
}

# Apply each mapping to its own column only
for column_name, corrections in (
    ('undergrad_loc', replace_dict_undergrad),
    ('ra_lm_loc', replace_dict_research),
    ('phd_loc', replace_dict_phd),
):
    df[column_name] = df[column_name].replace(corrections)

In [5]:
# Geocode every birthplace into latitude/longitude, keeping exactly one entry per row
# (be aware of missing data)
lat = []
long = []
for home_location in df.loc[:, "birthplace"]:
    if pd.isnull(home_location):
        # No birthplace recorded: store a placeholder so the lists stay aligned with df
        lat.append(None)
        long.append(None)
        continue

    location_1 = geolocator.geocode(home_location, timeout=10)
    if location_1 is None:
        # The geocoder found no match; fail explicitly instead of hitting an
        # AttributeError on `.latitude` below
        raise ValueError("Geocode failed")

    lat.append(location_1.latitude)
    long.append(location_1.longitude)

# Save latitude and longitude in separate float columns (None placeholders become NaN)
df['HometownLatitude'] = lat
df['HometownLatitude'] = df['HometownLatitude'].astype('float')

df['HometownLongitude'] = long
df['HometownLongitude'] = df['HometownLongitude'].astype('float')
#df.head(20)

In [6]:
# Geocode every undergrad location into latitude/longitude (be aware of missing data)
lat_list = []
long_list = []
for undergrad_location in df['undergrad_loc']:
    # Missing answers keep a None placeholder so both lists stay row-aligned with df
    lat2, long2 = None, None

    if not pd.isnull(undergrad_location):
        location_1 = geolocator.geocode(undergrad_location, timeout=10)
        if location_1 is None:
            raise ValueError("Geocode failed")
        lat2, long2 = location_1.latitude, location_1.longitude

    lat_list.append(lat2)
    long_list.append(long2)

# Store the coordinates as float columns; None placeholders become NaN
df['UndergradLatitude'] = pd.Series(lat_list, index=df.index, dtype='float')
df['UndergradLongitude'] = pd.Series(long_list, index=df.index, dtype='float')
#df.head(40)

In [7]:
# Prepare the undergrad data for mapping
# Coordinate-only subset of df, without the rows that have no undergrad coordinates
df_undergrad = df.loc[:, ["UndergradLatitude", "UndergradLongitude"]]
df_undergrad = df_undergrad.dropna()
#print(df_undergrad)

# Drop EVERY None placeholder. list.remove(None) only deletes the first one, which
# left the coordinate lists longer than (and misaligned with) undergrad_locs
# whenever more than one row had no undergrad location.
lat_list = [value for value in lat_list if value is not None]
long_list = [value for value in long_list if value is not None]

# Location names for the rows that actually have an answer
undergrad_locs = [x for x in df.loc[:, "undergrad_loc"] if pd.notna(x)]

In [8]:
# Create RA / lab-manager locations
# Geocode every research-position location into latitude/longitude (be aware of missing data)
lat_list = []
long_list = []
for research_location in df['ra_lm_loc']:
    # Start from "missing" and overwrite only when there is an answer to geocode
    lat, long = None, None

    if not pd.isnull(research_location):
        location_1 = geolocator.geocode(research_location, timeout=10)
        if location_1 is None:
            raise ValueError("Geocode failed")
        lat, long = location_1.latitude, location_1.longitude

    lat_list.append(lat)
    long_list.append(long)
    # Echo each result so the geocoded coordinates can be checked by eye
    print((lat,long,research_location))

# Save latitude and longitude in separate float columns (None becomes NaN)
df['RA_LM_Latitude'] = lat_list
df['RA_LM_Latitude'] = df['RA_LM_Latitude'].astype('float')

df['RA_LM_Longitude'] = long_list
df['RA_LM_Longitude'] = df['RA_LM_Longitude'].astype('float')
#df.head(40)


(41.3082138, -72.9250518, 'New Haven, CT, USA')
(None, None, nan)
(-37.8142176, 144.9631608, 'Melbourne, Victoria, Australia')
(52.4775396, -1.894053, 'Birmingham, UK')
(39.9527237, -75.1635262, 'Philadelphia, PA, USA')
(None, None, nan)
(-34.6075616, -58.437076, 'Buenos Aires, Argentina')
(55.6828925, 37.6223775, 'Nagatinskaya, Moscow, Russia')
(39.9527237, -75.1635262, 'Philadelphia, USA')
(40.7127281, -74.0060152, 'New York City, New York, USA')
(None, None, nan)
(None, None, nan)
(25.0375198, 121.5636796, 'Taipei, Taiwan')
(None, None, nan)
(40.7127281, -74.0060152, 'New York, NY')
(None, None, nan)
(None, None, nan)
(None, None, nan)
(None, None, nan)
(None, None, nan)
(None, None, nan)
(37.4443293, -122.1598465, 'Palo Alto, California, USA')
(45.5202471, -122.6741949, 'Portland, OR, USA')
(None, None, nan)
(None, None, nan)
(None, None, nan)
(None, None, nan)
(None, None, nan)
(37.8708393, -122.2728639, 'Berkeley, CA, USA')
(37.7792808, -122.4192363, 'San Francisco, CA, USA')
(35

In [9]:
# Prepare the research-position data for mapping
# Coordinate-only subset of df, without the rows that have no research coordinates
df_research = df.loc[:, ["RA_LM_Latitude", "RA_LM_Longitude"]]
df_research = df_research.dropna()
#print(df_research)

# Remove only the None placeholders. filter(None, ...) would also discard a
# legitimate coordinate of exactly 0.0 (it is falsy) and shift the lists out of
# step with research_locs.
lat_list = [value for value in lat_list if value is not None]
long_list = [value for value in long_list if value is not None]

# Location names for the rows that actually have an answer
research_locs = [x for x in df.loc[:, "ra_lm_loc"] if pd.notna(x)]

In [10]:
# Plot the research-position locations
research_map = folium.Map()

# One marker per location name; coordinates are looked up by the same position
for idx, place_name in enumerate(research_locs):
    marker = folium.Marker([lat_list[idx], long_list[idx]], popup=place_name)
    marker.add_to(research_map)

#display map
#research_map

In [11]:
# Create PhD/doctoral locations
# Geocode every PhD location into latitude/longitude (be aware of missing data)
lat_list = []
long_list = []
for phd_location in df['phd_loc']:
    if pd.isnull(phd_location):
        # No PhD location given: keep a placeholder so the lists stay row-aligned with df
        lat = None
        long = None
    else:
        location_1 = geolocator.geocode(phd_location, timeout=10)
        if location_1 is None:
            raise ValueError("Geocode failed")
        lat = location_1.latitude
        long = location_1.longitude

    lat_list.append(lat)
    long_list.append(long)
    # Echo each result so the geocoded coordinates can be checked by eye
    print((lat,long,phd_location))

# Save latitude and longitude in separate float columns (None becomes NaN)
df['PHD_Latitude'] = lat_list
df['PHD_Latitude'] = df['PHD_Latitude'].astype('float')

df['PHD_Longitude'] = long_list
df['PHD_Longitude'] = df['PHD_Longitude'].astype('float')
#df.head(40)

# Prepare the PhD data for mapping.
# Select the two columns by name: the label slice "PHD_Longitude":"PHD_Latitude"
# ran backwards relative to the column order (Latitude is created first) and
# therefore selected no columns at all.
df_phd = df.loc[:, ["PHD_Latitude", "PHD_Longitude"]]
df_phd = df_phd.dropna()
#print(df_phd)

# Remove only the None placeholders; filter(None, ...) would also discard a
# legitimate coordinate of exactly 0.0 and misalign the lists with phd_locs.
lat_list = [value for value in lat_list if value is not None]
long_list = [value for value in long_list if value is not None]
phd_locs = [x for x in df.loc[:, "phd_loc"] if pd.notna(x)]

# PhD locations plotting
phd_map = folium.Map()
# Add one marker per PhD location; the three lists are aligned by position
for marker_lat, marker_long, marker_name in zip(lat_list, long_list, phd_locs):
    folium.Marker([marker_lat, marker_long], popup=marker_name).add_to(phd_map)

#display map
#phd_map


(40.4416941, -79.9900861, 'Pittsburgh, PA, USA')
(45.886548, 11.0452369, 'Rovereto, Trentino, Italy')
(-37.8142176, 144.9631608, 'Melbourne, Victoria, Australia')
(52.4775396, -1.894053, 'Birmingham, UK')
(39.9527237, -75.1635262, 'Philadelphia, PA, USA')
(38.8950092, -77.0365625, 'Washington, DC')
(50.1106444, 8.6820917, 'Frankfurt, Germany')
(48.8566101, 2.3514992, 'Paris/France')
(32.7174209, -117.1627714, 'San Diego, CA, USA')
(40.3492744, -74.6592958, 'Princeton, NJ, USA')
(47.6038321, -122.3300624, 'Seattle, WA, United States')
(25.7742658, -80.1936589, 'Miami, FL, USA')
(53.9590555, -1.0815361, 'York, UK')
(53.550341, 10.000654, 'Hamburg, Germany')
(40.4416941, -79.9900861, 'Pittsburgh, PA')
(37.5666791, 126.9782914, 'Seoul, South Korea')
(None, None, nan)
(51.5073219, -0.1276474, 'London, United Kingdom')
(52.1518157, 4.48110886662043, 'Leiden, the Netherlands')
(42.3602534, -71.0582912, 'Boston, MA, USA')
(34.4221319, -119.7026673, 'Santa Barbara, California, USA')
(43.703622,

In [12]:
import ipyleaflet as lf
from ipyleaflet import (Map, basemaps, basemap_to_tiles, Circle, LayersControl, FullScreenControl)


def _build_circles(latitudes, longitudes, color, name):
    """Create one filled Circle per row that has both coordinates.

    Parameters
    ----------
    latitudes, longitudes : iterables of float
        Row-aligned coordinates; NaN marks a row with no location.
    color : str
        Value assigned to each circle's ``color``.
    name : str
        Value assigned to each circle's ``name``.

    Returns
    -------
    list
        The Circle layers, in row order, skipping rows with a missing coordinate.
    """
    circles = []
    for latitude, longitude in zip(latitudes, longitudes):
        # Rows without a location hold NaN; a circle at (NaN, NaN) is not a
        # valid map position, so leave those rows off the map.
        if pd.isnull(latitude) or pd.isnull(longitude):
            continue
        c = Circle()
        c.radius = 600
        c.location = (float(latitude), float(longitude))
        c.color = color
        c.fill = True
        c.name = name
        circles.append(c)
    return circles


m = Map(zoom=1)

# hometown
hometown = _build_circles(df['HometownLatitude'], df['HometownLongitude'],
                          'blue', "Hometown location")
ht = lf.LayerGroup(name='hometown', layers=hometown)
m.add_layer(ht)

# undergrad
undergrad = _build_circles(df['UndergradLatitude'], df['UndergradLongitude'],
                           'green', "Undergrad location")
ug = lf.LayerGroup(name='undergrad', layers=undergrad)
m.add_layer(ug)

# phd
phd = _build_circles(df['PHD_Latitude'], df['PHD_Longitude'],
                     'red', "PhD location")
p = lf.LayerGroup(name='phd', layers=phd)
m.add_layer(p)

# Layer control lets the viewer toggle each career stage on and off
m.add_control(LayersControl())
m

Map(basemap={'url': 'https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', 'max_zoom': 19, 'attribution': 'Map …