In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import universities
import googlemaps
import pprint

In [5]:
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 999

# Importing data retrieved from Chronicle of Higher Education

In [6]:
df = pd.read_csv("che-0629.csv")
df.head()

Unnamed: 0,Institution,Control,State,X.1
0,Abilene Christian University,Private,TX,"<a href=""https://www.acu.edu/coronavirus/april-29-2020-news-release-about-plans-to-reopen-campus-in-fall-2020.html"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"
1,Academy of Art University,Private,CA,"<a href=""https://youtu.be/xxsrSGINzEU"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"
2,Adelphi University,Private,NY,"<a href=""https://news.adelphi.edu/au_news/adelphis-updated-plans-for-fall-2020/"" target=""_blank"" rel=""nofollow noopener noreferrer"">Proposing a hybrid model</a>"
3,Adrian College,Private,MI,"<a href=""http://adrian.edu/news/ac-president-docking-says-campus-will-be-open-in-fall"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"
4,Agnes Scott College,Private,GA,"<a href=""https://www.agnesscott.edu/coronavirus/updates/covid19-task-force-05-01.html"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"


# Inspecting the dataset

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1069 entries, 0 to 1068
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Institution  1068 non-null   object
 1   Control      1066 non-null   object
 2   State        1065 non-null   object
 3   X.1          1066 non-null   object
dtypes: object(4)
memory usage: 33.5+ KB


###    Dropping data with NaN values

In [8]:
df[df.isnull().any(axis = 1)]

Unnamed: 0,Institution,Control,State,X.1
388,"target=_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>",,,
510,"target=_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>",,,
520,"campus-college-experience-in-fall-2020"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>",,,
1044,,Private,,"<a href=""https://willamette.edu/news/library/2020/04/message-to-prospective-students.html"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"


In [9]:
df.dropna(how = "any", inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1065 entries, 0 to 1068
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Institution  1065 non-null   object
 1   Control      1065 non-null   object
 2   State        1065 non-null   object
 3   X.1          1065 non-null   object
dtypes: object(4)
memory usage: 41.6+ KB


### Inpsecting variable "Control"

    Variable "control" contains typo and missing value =>> action fixing the typo and delete missing value

In [10]:
df["Control"].unique()

array(['Private', 'Public', '#REF!', 'Public ', 'Private ', 'public'],
      dtype=object)

    Remove spacing in control duo to typo

In [14]:
df["Control"] = df["Control"].str.capitalize().str.strip()
df["Control"].unique()

array(['Private', 'Public', '#ref!'], dtype=object)

    Removing missing value/#REF!

In [16]:
df[df["Control"].isin([np.nan,"#ref!"])]

Unnamed: 0,Institution,Control,State,X.1
32,#REF!,#ref!,#REF!,#REF!
464,#REF!,#ref!,#REF!,#REF!
1047,#REF!,#ref!,#REF!,#REF!
1048,#REF!,#ref!,#REF!,#REF!
1049,#REF!,#ref!,#REF!,#REF!


In [17]:
control_href = df[df["Control"].isin([np.nan,"#ref!"])].index
df.drop(axis = 0, index = control_href, inplace = True)
df["Control"].unique()

array(['Private', 'Public'], dtype=object)

### Inspecting Duplication entries in "Institution"

In [18]:
df[df["Institution"].duplicated(keep = False)]

Unnamed: 0,Institution,Control,State,X.1
81,Bryn Mawr College,Private,PA,"<a href=""https://www.brynmawr.edu/news/presidents-bryn-mawr-and-haverford-announce-plan-fall-semester"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"
82,Bryn Mawr College,Private,PA,"<a href=""https://www.brynmawr.edu/news/presidents-bryn-mawr-and-haverford-announce-plan-fall-semester"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"
182,Columbia College,Private,SC,"<a href=""https://www.columbiasc.edu/president-mitchell-announces-fall-2020-opening"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"
183,Columbia College,Private,MO,"<a href=""https://www.columbiamissourian.com/news/covid19/columbia-college-to-resume-on-campus-classes-aug-31/article_abb52cdc-a1c1-11ea-a244-0711d29238c9.html"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"
251,Embry-Riddle Aeronautical University,Private,AZ,"<a href=""https://news.erau.edu/headlines/embry-riddle-aeronautical-university-to-resume-face-to-face-instruction-on-june-30"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"
252,Embry-Riddle Aeronautical University,Private,FL,"<a href=""https://news.erau.edu/headlines/embry-riddle-aeronautical-university-to-resume-face-to-face-instruction-on-june-30"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"
301,Graceland University,Private,IA,"<a href=""https://www.graceland.edu/covid-19"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"
302,Graceland University,Private,MO,"<a href=""https://www.graceland.edu/covid-19"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"
333,Hofstra University,Private,NY,"<a href=""https://news.hofstra.edu/2020/05/13/university-coronavirus-update/"" target=""_blank"" rel=""nofollow noopener noreferrer"">Considering a range of scenarios</a>"
334,Hofstra University,Private,NY,"<a href=""https://news.hofstra.edu/2020/06/05/university-coronavirus-update-7/"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>"


    There are institutions with multiple locations =>> Action: Keep
    For those entries sharing the same institution name and location =>> Action: Delete

In [29]:
duplication = df[df["Institution"].duplicated(keep = False)]
duplication[duplication.duplicated(keep=False)]
df.drop(axis = 0, index = duplication.duplicated(keep = "last").index,inplace = True)
df[df.duplicated(keep=False)]

Unnamed: 0,Institution,Control,State,X.1


### Converting X.1 into Plan, removing Links to get schools' plan

In [33]:
category = np.array([])
for i in df["X.1"]:
    if i.find("</a>")!=-1:
        category = np.append(category, i.split(">")[1][:-3])
# category = df.Category.str.split(">").str[1].str[:-3]
    else:
        category = np.append(category,i)
category
df["plan"] = category
system = df[df.plan.isin(["Link",np.nan])]
df.drop(system.index,axis = 0,inplace = True)
df.plan.unique()

array(['Planning for in-person', 'Proposing a hybrid model',
       'Planning for online', 'Considering a range of scenarios',
       'Waiting to decide',
       '<a href="https://www.ltu.edu/news/?_from=/news/index.asp&amp;_opt=detail&amp;_cid=f073ac3b-a1c4-4ee1-9777-7fd3299d40f1',
       '<a href="https://www.mcall.com/news/education/mc-nws-bethlehem-moravian-coronavirus-fall-semester-20200528-hryxysrsxzafvca7l7fgponile-story.html',
       '<a href="https://www2.naz.edu/news/archive/2020/May/592/nazareth-college-committed-to-full-on-'],
      dtype=object)

In [42]:
non_plan=df[~df.plan.isin(['Planning for in-person', 'Proposing a hybrid model',
       'Planning for online', 'Waiting to decide',
       'Considering a range of scenarios'])]
df.drop(axis = 0, index = non_plan.index, inplace = True)
df = df.rename(columns = {"X.1":"Link"})
df.plan.unique()

array(['Planning for in-person', 'Proposing a hybrid model',
       'Planning for online', 'Considering a range of scenarios',
       'Waiting to decide'], dtype=object)

In [43]:
df.head()

Unnamed: 0,Institution,Control,State,Link,plan
0,Abilene Christian University,Private,TX,"<a href=""https://www.acu.edu/coronavirus/april-29-2020-news-release-about-plans-to-reopen-campus-in-fall-2020.html"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>",Planning for in-person
1,Academy of Art University,Private,CA,"<a href=""https://youtu.be/xxsrSGINzEU"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>",Planning for in-person
2,Adelphi University,Private,NY,"<a href=""https://news.adelphi.edu/au_news/adelphis-updated-plans-for-fall-2020/"" target=""_blank"" rel=""nofollow noopener noreferrer"">Proposing a hybrid model</a>",Proposing a hybrid model
3,Adrian College,Private,MI,"<a href=""http://adrian.edu/news/ac-president-docking-says-campus-will-be-open-in-fall"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>",Planning for in-person
4,Agnes Scott College,Private,GA,"<a href=""https://www.agnesscott.edu/coronavirus/updates/covid19-task-force-05-01.html"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>",Planning for in-person


# Getting Longitute and latitude of school for Geospatial Analysis

In [44]:
def get_geo(data):
    ggmaps = googlemaps.Client(key ="AIzaSyAXZ3PAfgtLt5BR5Bn13ysNJHqDOo9Zd3w")
    place_result = ggmaps.find_place(input =data, input_type = "textquery",fields = ['geometry/location'])
    return place_result

In [45]:
def clean_result(data):
    lat = np.array([])
    long = np.array([])
    for i in data:
        if i.get("status") == "OK":
            geo =str(i.get("candidates")[0]).split(sep ="location': {")[1]
            lat = np.append(lat,geo.split(",")[0].split(sep = ":")[1].strip())
            long = np.append(long,geo.split(",")[1].split(sep = ":")[1].split("}")[0].strip())
        else:
            lat = np.append(lat,np.nan)
            long = np.append(long,np.nan)
    return lat,long

In [46]:
result = df.Institution.apply(get_geo)

In [47]:
result.head()

0     {'candidates': [{'geometry': {'location': {'lat': 32.469732, 'lng': -99.70809799999999}}}], 'status': 'OK'}
1          {'candidates': [{'geometry': {'location': {'lat': 37.7877247, 'lng': -122.4006621}}}], 'status': 'OK'}
2           {'candidates': [{'geometry': {'location': {'lat': 40.7198865, 'lng': -73.6522537}}}], 'status': 'OK'}
3    {'candidates': [{'geometry': {'location': {'lat': 41.8987054, 'lng': -84.05924139999999}}}], 'status': 'OK'}
4    {'candidates': [{'geometry': {'location': {'lat': 33.7685056, 'lng': -84.29453459999999}}}], 'status': 'OK'}
Name: Institution, dtype: object

In [48]:
lat, long = clean_result(result)

In [21]:
# api_key="AIzaSyAXZ3PAfgtLt5BR5Bn13ysNJHqDOo9Zd3w"
# ggmaps = googlemaps.Client(key=api_key)
# place_result= ggmaps.find_place(input = "College of Alameda", input_type = "textquery", fields = ['geometry/location'])
# print(place_result)

In [49]:
df["Lat"] = lat
df["Long"] = long
df.head()

Unnamed: 0,Institution,Control,State,Link,plan,Lat,Long
0,Abilene Christian University,Private,TX,"<a href=""https://www.acu.edu/coronavirus/april-29-2020-news-release-about-plans-to-reopen-campus-in-fall-2020.html"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>",Planning for in-person,32.469732,-99.708098
1,Academy of Art University,Private,CA,"<a href=""https://youtu.be/xxsrSGINzEU"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>",Planning for in-person,37.7877247,-122.4006621
2,Adelphi University,Private,NY,"<a href=""https://news.adelphi.edu/au_news/adelphis-updated-plans-for-fall-2020/"" target=""_blank"" rel=""nofollow noopener noreferrer"">Proposing a hybrid model</a>",Proposing a hybrid model,40.7198865,-73.6522537
3,Adrian College,Private,MI,"<a href=""http://adrian.edu/news/ac-president-docking-says-campus-will-be-open-in-fall"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>",Planning for in-person,41.8987054,-84.05924139999999
4,Agnes Scott College,Private,GA,"<a href=""https://www.agnesscott.edu/coronavirus/updates/covid19-task-force-05-01.html"" target=""_blank"" rel=""nofollow noopener noreferrer"">Planning for in-person</a>",Planning for in-person,33.7685056,-84.29453459999999


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1021 entries, 0 to 1068
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Institution  1021 non-null   object
 1   Control      1021 non-null   object
 2   State        1021 non-null   object
 3   Link         1021 non-null   object
 4   plan         1021 non-null   object
 5   Lat          1021 non-null   object
 6   Long         1021 non-null   object
dtypes: object(7)
memory usage: 63.8+ KB


In [51]:
df.to_csv("cleaned_che_0629.csv")