# Imports

In [1]:
import json # to parse the json file
import pandas as pd # because arrays are oldschool (to create dataframes)
from pandas.io.json import json_normalize # for handling nested json
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from geopy.exc import GeocoderTimedOut # geopy usually ends up throwing this exception
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm
import time # for delay
from IPython.display import clear_output # to clear notebook output cell via code

# Data Preparation

In [2]:
with open('collegeList.json') as jsonFile: # refers to the csv we created earlier
    raw_data = json.load(jsonFile) # load data to a python var
print('Data Imported.')

Data Imported.


In [3]:
df = json_normalize(raw_data['info']) # Nested json files need to be normalised

### Sub-Key 'venues' is nested inside of info, so we normalize it using json_normalize()

In [4]:
venues = json_normalize(data = raw_data['info'], record_path = 'venues')
venues.drop('display_area', axis = 1, inplace = True) # this column serves no purpose whatsoever

### Lets check if we successfully extracted the venues data

In [5]:
venues.head() 

Unnamed: 0,area,city,country
0,California,San Francisco,United States
1,"Washington, D.C.","Washington, D. C.",United States
2,New York,Buffalo,United States
3,England,London,United Kingdom
4,Wales,Pontypridd,United Kingdom


### Merging df and venues

In [6]:
df = pd.merge(df, venues, right_index=True, left_index=True)

<br><br><br><br>

# Data Cleaning

In [7]:
df.columns # lets revise the columns we have

Index(['degree', 'density.fulltime', 'density.parttime', 'enhanced',
       'fulltime_duration.unit', 'fulltime_duration.value', 'id', 'level',
       'listing_type', 'logo', 'methods.blended', 'methods.face2face',
       'methods.online', 'organisation', 'organisation_id',
       'parttime_duration.unit', 'parttime_duration.value', 'summary', 'title',
       'tuition_fee.currency', 'tuition_fee.unit', 'tuition_fee.value',
       'venues', 'area', 'city', 'country'],
      dtype='object')

In [8]:
# We seriously dont need these columns they are just clutter that we got from json we parsed
columns_to_drop = ['degree', 'density.fulltime', 'density.parttime', 
    'enhanced', 'id', 'level', 'listing_type', 'logo', 'methods.blended',
    'methods.face2face', 'methods.online','parttime_duration.unit', 
    'parttime_duration.value', 'summary', 'title', 'venues', 'fulltime_duration.value', 'fulltime_duration.unit']
df.drop(columns_to_drop, axis = 1,inplace = True)

In [9]:
df.isna().sum()

organisation             0
organisation_id          0
tuition_fee.currency    80
tuition_fee.unit        80
tuition_fee.value       80
area                     1
city                     0
country                  0
dtype: int64

In [10]:
df[df['tuition_fee.currency'] != 'EUR'].count()

organisation            80
organisation_id         80
tuition_fee.currency     0
tuition_fee.unit         0
tuition_fee.value        0
area                    80
city                    80
country                 80
dtype: int64

In [11]:
df = df[df['tuition_fee.currency'].notnull()] #new df from current df where tuition_fee.currency is not null 
df = df[df['area'].notnull()]

According to https://www.geteducated.com/career-center/detail/what-is-a-masters-degree,
To earn a master’s degree you usually need to complete from 36 to 54 semester credits of study (or 60 to 90 quarter-credits). This equals 12 to 18 college courses. 

45 is average of 36 ad 54!

In [12]:
df.loc[df['tuition_fee.unit'] == 'credit', 'tuition_fee.value'] = (df['tuition_fee.value']*45)/2 
# Multiplying tuition_fee.value by 45 when tuition_fee.unit is 'credit' 
# This gives us average per year fees, to get a uniform fee scale (all fees in per year format)

In [13]:
df.drop(['tuition_fee.currency', 'tuition_fee.unit'], axis=1, inplace=True) 
# since we have uniform values we dont need the currency and unit thus we will drrop them

In [14]:
# creating a column with full address and dropping the seperate columns
# df['location'] = df['area'].map(str)+', '+df['city']+', '+df['country']
# df.drop(['area', 'city', 'country'],axis=1, inplace=True)

In [15]:
df = df.rename(columns = {'tuition_fee.value': 'fees', 'organisation': 'college_name', 'organisation_id': 'id'})

In [16]:
# rearranging the columns
df = df[['id','college_name', 'fees', 'area', 'city', 'country']] # removed location from here on date 20191023

In [17]:
df.head()

Unnamed: 0,id,college_name,fees,area,city,country,location
0,16651,School of Nursing and Health Professions,29047.5,California,San Francisco,United States,"California, San Francisco, United States"
1,14338,"Kogod School of Business, American University ...",35730.0,"Washington, D.C.","Washington, D. C.",United States,"Washington, D.C., Washington, D. C., United St..."
2,14726,"University at Buffalo, The State University of...",20857.0,New York,Buffalo,United States,"New York, Buffalo, United States"
3,19026,School of Management,21419.0,England,London,United Kingdom,"England, London, United Kingdom"
4,194,University of South Wales,15514.0,Wales,Pontypridd,United Kingdom,"Wales, Pontypridd, United Kingdom"


In [None]:
def do_geocode(address):
    try:
        return geolocator.geocode(address)
    except GeocoderTimedOut:
        time.sleep(3)
        return do_geocode(address)
    
def get_latlong():
    latitude = []
    longitude = []
    collegeName = df['college_name'].to_list()
    loc = df['location'].to_list()
    
    for i in range(len(collegeName)): 
        address = collegeName[i] +', '+loc[i]
        geolocator = Nominatim(user_agent="foursquare_agent")
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
        new['location'] = df['name'].apply(geocode)
        location = do_geocode(address)
        try:# try with college name in address
            latitude.append(location.latitude)
            longitude.append(location.longitude)
        except AttributeError: # if location not found lat long are null 
            try: # try again without the college name in address
                address = loc[i]
                location = do_geocode(address)

                latitude.append(location.latitude)
                longitude.append(location.longitude)
            except: # if an address is not found at all simply write None to list
                latitude.append(None)
                longitude.append(None)
        clear_output() # if we dont too much space taken by output
        print('Obtained ' +str(i)+ ' latitude and logitudes.')  # Tests your patience

In [18]:
df['clg_city'] = df['college_name'].map(str)+' '+df['city']

In [19]:
tqdm.pandas()
def get_latlong():
    geolocator = Nominatim(user_agent="foursquare_agent")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
    df['address'] = df['clg_city'].progress_apply(geocode)

In [None]:
get_latlong() # if function gets stuck, manually stop kernel and execute the block below.

In [26]:
import gmplot

IndexError: list index out of range

<br>

In [25]:
df

Unnamed: 0,id,college_name,fees,area,city,country,location,clg_city,address
0,16651,School of Nursing and Health Professions,29047.5,California,San Francisco,United States,"California, San Francisco, United States",School of Nursing and Health Professions San F...,
1,14338,"Kogod School of Business, American University ...",35730.0,"Washington, D.C.","Washington, D. C.",United States,"Washington, D.C., Washington, D. C., United St...","Kogod School of Business, American University ...",
2,14726,"University at Buffalo, The State University of...",20857.0,New York,Buffalo,United States,"New York, Buffalo, United States","University at Buffalo, The State University of...",
3,19026,School of Management,21419.0,England,London,United Kingdom,"England, London, United Kingdom",School of Management London,
4,194,University of South Wales,15514.0,Wales,Pontypridd,United Kingdom,"Wales, Pontypridd, United Kingdom",University of South Wales Pontypridd,"(Bike Shed, Central Avenue, Pontypridd, Hawtho..."
6,228,Middlesex University,13338.0,England,London,United Kingdom,"England, London, United Kingdom",Middlesex University London,"(Middlesex University, Greyhound Hill, Hendon,..."
7,434,University of Northampton,14762.0,England,Northampton,United Kingdom,"England, Northampton, United Kingdom",University of Northampton Northampton,"(The Pavillion, Boughton Green Road, Hill Top,..."
8,11437,Gannon University,21645.0,Pennsylvania,Erie,United States,"Pennsylvania, Erie, United States",Gannon University Erie,"(Gannon University, West 7th Street, Erie, Eri..."
9,307,Royal Holloway University of London,21419.0,England,London,United Kingdom,"England, London, United Kingdom",Royal Holloway University of London London,"(Royal Holloway, Founder's Steam Tunnel, Riple..."
10,12124,Wright State University,22860.0,Ohio,Dayton,United States,"Ohio, Dayton, United States",Wright State University Dayton,


In [21]:
# execute this block if get_latlong() gets stuck
with open('latFile.txt','a+') as latFile:
    latFile.write(str(latitude)+',')
with open('longFile.txt','a+') as longFile:
    longFile.write(str(longitude)+',')

NameError: name 'latitude' is not defined

<br>

## Making new columns in df from latFile and longFile.

<br>

In [None]:
with open('latFile.txt') as latFile:
    latitude = latFile.read()
with open('longFile.txt') as longFile:
    longitude = longFile.read()
latitude= latitude.strip('][').split(', ') #converting latitude from string to float
longitude= longitude.strip('][').split(', ') #converting latitude from string to float

In [None]:
for i in range(len(latitude)):
    try:
        latitude[i] = float(latitude[i])
    except ValueError:
        latitude[i] = None
for i in range(len(longitude)):
    try:
        longitude[i] = float(longitude[i])
    except ValueError:
        longitude[i] = None

In [None]:
df = df[df['area'].notnull()]

In [None]:
df.head()

In [None]:
df.to_csv('college_dataset.csv')

<br>

## We have saved the data to college_dataset.csv 

<br>


In [None]:
df = pd.read_csv("college_dataset.csv") 

In [None]:
df = df[df['latitude'].notnull()] #new df from current df where tuition_fee.currency is not null
df = df[df['longitude'].notnull()]

In [None]:
df.head()

In [None]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
df.head()

In [None]:
df.shape

<br><br><br><br><br>


# Exploratory Data Analysis

So now that we have our data it's time to explore it. Lets see the number per country. The venues data frame would make it easy to do so.

In [None]:
df.head()

In [None]:
df['country'].value_counts()

## So, I have no clue but somehow all these countries crept in through our json, We don't need them since they are too less in numbers, we'll keep only US, UK and Canada

In [None]:
df = df[df['country'].str.contains("United States|United Kingdom|Canada")] 
# keeping only the locations with location-country as US, UK, Canada

In [None]:
df.shape

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
df.head()

In [None]:
import folium

In [None]:
df_usa = df[df['country'] == 'United States']
df_usa.shape

In [None]:
usa_coordinates = [37.0902, -95.7129]
USAmap = folium.Map(location=usa_coordinates, zoom_start = 4)

college_markers = folium.map.FeatureGroup()

for lat, lng, label in zip(df_usa.latitude, df_usa.longitude, df_usa.college_name):
    college_markers.add_child(
        folium.features.Marker(
            location=[lat, lng],
            tooltip=label,
        )
    )

# add incidents to map
USAmap.add_child(college_markers)

In [None]:
usa_map.choropleth(
    geo_data=world_geo,
    data=df_can,
    columns=['Country', 'Total'],
    key_on='feature.properties.name',
    fill_color='YlOrRd', 
    fill_opacity=0.7, 
    line_opacity=0.2,
    legend_name='Immigration to Canada'
)

# display map
world_map