# You can find this on Backroads - Making Maps with Python NACIS 2019
#### This repository is an exploration of mapping capabilities in the Python ecosystem.

<details>
    <summary><strong>Goal</strong></summary>
    "The goal of this notebook is to ..."
    <ul>
        <li> measurable goals for this notebook </li>
        <li> Ex 1: Identify <i>variables of interest in dataset <strong>name.dta</strong></i> </li>
        <li> Ex 2: Create a <i>working dataset</i> from raw data.</li>
    </ul>
</details>

<details>
    <summary><strong>Context</strong></summary>
    Context at the moment (e.g. "We've downloaded raw data from <strong><i>website.com</i></strong> and are now in the process of creating a master file.")
    <ul>
        <li> What is the final goal of this project?</li>
        <li> What are we trying to recreate? Where are we in the process?</li>
        <li> Any links to documentation / figures.</li>
    </ul>
</details>

In [1]:
import pandas as pd
from zipfile import ZipFile
from pathlib import Path
from tools import tree
from datetime import datetime as dt
# Date stamp for this run, rendered as day-abbreviated month-two-digit year (e.g. '03-Oct-19').
today = format(dt.today(), "%d-%b-%y")
today

'03-Oct-19'

In [4]:
# Data folders used by this notebook, one per processing stage.
# All are relative paths that start one level above the working directory.
(
    RAW_DATA,
    INTERIM_DATA,
    PROCESSED_DATA,
    EXTERNAL_DATA,
    FINAL_DATA,
) = map(
    Path,
    (
        "../data/raw/",
        "../data/interim/",
        "../data/processed/",
        "../data/external/",
        "../data/final/",
    ),
)

In [5]:
# List the contents of the external data folder using the `tree` helper
# imported from the project-local `tools` module.
tree(EXTERNAL_DATA)

+ ..\data\external
    + Data_10-3-2019---695.csv
    + ValueLabels_10-3-2019---695.csv


In [43]:
# Read the institutions export found in the external data folder into a DataFrame.
institutions_csv = EXTERNAL_DATA / 'Data_10-3-2019---695.csv'
data = pd.read_csv(institutions_csv)

In [44]:
# Preview the first rows of the freshly loaded table. Column names still carry
# their survey suffixes, and a trailing 'Unnamed: 24' column is present.
data.head()

Unnamed: 0,UnitID,Institution Name,FIPS state code (HD2017),Sector of institution (HD2017),Degree-granting status (HD2017),Institution size category (HD2017),Institution (entity) name (HD2017),Institution name alias (HD2017),Street address or post office box (HD2017),City location of institution (HD2017),...,Undergraduate enrollment (DRVEF2017),Graduate enrollment (DRVEF2017),Number of students receiving a Doctor's degree (DRVC2017),Number of students receiving a Master's degree (DRVC2017),Number of students receiving a Bachelor's degree (DRVC2017),Number of students receiving an Associate's degree (DRVC2017),Number of students receiving a Postbaccalaureate or Post-master's certificate (DRVC2017),Number of students receiving a certificate of 1 but less than 4-years (DRVC2017),Number of students receiving a certificate of less than 1-year (DRVC2017),Unnamed: 24
0,177834,A T Still University of Health Sciences,29,2,1,2,A T Still University of Health Sciences,,800 W Jefferson,Kirksville,...,0.0,3723.0,763,283,0,0,120,0,0,
1,222178,Abilene Christian University,48,2,1,3,Abilene Christian University,,,Abilene,...,3666.0,1479.0,3,256,743,3,11,0,1,
2,138558,Abraham Baldwin Agricultural College,13,1,1,2,Abraham Baldwin Agricultural College,ABAC,2802 Moore Hwy,Tifton,...,3394.0,0.0,0,0,206,389,0,0,0,
3,488031,Abraham Lincoln University,6,3,1,1,Abraham Lincoln University,,3530 Wilshire Boulevard Suite 1430,Los Angeles,...,30.0,167.0,17,6,2,0,0,0,0,
4,172866,Academy College,27,3,1,1,Academy College,,1600 W. 82nd Street Suite 100,Bloomington,...,65.0,0.0,0,0,6,16,0,4,27,


In [45]:
# Take the current column labels as an array so they can be cleaned as plain strings.
# NOTE(review): `col_names` (raw labels) is easy to confuse with `cols_names`
# (suffix-stripped labels) which is derived from it.
col_names = data.columns.values

In [46]:
# Survey-table tags appended to the raw column labels.
survey_suffixes = (' (HD2017)', ' (DRVC2017)', ' (DRVEF2017)')


def _strip_survey_suffix(label):
    """Return `label` with every survey-table tag removed."""
    for suffix in survey_suffixes:
        label = label.replace(suffix, '')
    return label


cols_names = [_strip_survey_suffix(raw_label) for raw_label in col_names]

In [47]:
# Display the stripped labels to check that the survey suffixes are gone.
cols_names

['UnitID',
 'Institution Name',
 'FIPS state code',
 'Sector of institution',
 'Degree-granting status',
 'Institution size category',
 'Institution (entity) name',
 'Institution name alias',
 'Street address or post office box',
 'City location of institution',
 'State abbreviation',
 'Fips County code',
 'County name',
 'Longitude location of institution',
 'Latitude location of institution',
 'Undergraduate enrollment',
 'Graduate enrollment',
 "Number of students receiving a Doctor's degree",
 "Number of students receiving a Master's degree",
 "Number of students receiving a Bachelor's degree",
 "Number of students receiving an Associate's degree",
 "Number of students receiving a Postbaccalaureate or Post-master's certificate",
 'Number of students receiving a certificate of 1 but less than 4-years',
 'Number of students receiving a certificate of less than 1-year',
 'Unnamed: 24']

In [48]:
# Ordered (old, new) substitutions that shorten the verbose degree and
# location wording in the column labels. They are applied in sequence.
label_substitutions = [
    ("Doctor's degree", "PhD"),
    ("Master's degree", "MA"),
    ("Bachelor's degree", "BA"),
    ("Associate's degree", "AA"),
    (" location of institution", ""),
    ("Number of students", "Students"),
]


def _shorten_label(label):
    """Return `label` after applying every substitution in order."""
    for old, new in label_substitutions:
        label = label.replace(old, new)
    return label


cols_clean = [_shorten_label(label) for label in cols_names]

In [49]:
# Replace the frame's column labels with the shortened ones. The assignment is
# positional, so `cols_clean` must list one label per existing column, in order.
data.columns = cols_clean

In [50]:
# Confirm that the shortened column labels are now in place.
data.head()

Unnamed: 0,UnitID,Institution Name,FIPS state code,Sector of institution,Degree-granting status,Institution size category,Institution (entity) name,Institution name alias,Street address or post office box,City,...,Undergraduate enrollment,Graduate enrollment,Students receiving a PhD,Students receiving a MA,Students receiving a BA,Students receiving an AA,Students receiving a Postbaccalaureate or Post-master's certificate,Students receiving a certificate of 1 but less than 4-years,Students receiving a certificate of less than 1-year,Unnamed: 24
0,177834,A T Still University of Health Sciences,29,2,1,2,A T Still University of Health Sciences,,800 W Jefferson,Kirksville,...,0.0,3723.0,763,283,0,0,120,0,0,
1,222178,Abilene Christian University,48,2,1,3,Abilene Christian University,,,Abilene,...,3666.0,1479.0,3,256,743,3,11,0,1,
2,138558,Abraham Baldwin Agricultural College,13,1,1,2,Abraham Baldwin Agricultural College,ABAC,2802 Moore Hwy,Tifton,...,3394.0,0.0,0,0,206,389,0,0,0,
3,488031,Abraham Lincoln University,6,3,1,1,Abraham Lincoln University,,3530 Wilshire Boulevard Suite 1430,Los Angeles,...,30.0,167.0,17,6,2,0,0,0,0,
4,172866,Academy College,27,3,1,1,Academy College,,1600 W. 82nd Street Suite 100,Bloomington,...,65.0,0.0,0,0,6,16,0,4,27,


In [51]:
# Remaining verbose columns and the short names that replace them (old -> new).
cols_to_replace = dict([
    ('Street address or post office box', 'Address'),
    ('FIPS state code', 'FIPS state'),
    ('Fips County code', 'FIPS county'),
    ('Sector of institution', 'Sector'),
])

# rename returns a new frame; rebind `data` to it and preview the result.
data = data.rename(cols_to_replace, axis='columns')
data.head()

Unnamed: 0,UnitID,Institution Name,FIPS state,Sector,Degree-granting status,Institution size category,Institution (entity) name,Institution name alias,Address,City,...,Undergraduate enrollment,Graduate enrollment,Students receiving a PhD,Students receiving a MA,Students receiving a BA,Students receiving an AA,Students receiving a Postbaccalaureate or Post-master's certificate,Students receiving a certificate of 1 but less than 4-years,Students receiving a certificate of less than 1-year,Unnamed: 24
0,177834,A T Still University of Health Sciences,29,2,1,2,A T Still University of Health Sciences,,800 W Jefferson,Kirksville,...,0.0,3723.0,763,283,0,0,120,0,0,
1,222178,Abilene Christian University,48,2,1,3,Abilene Christian University,,,Abilene,...,3666.0,1479.0,3,256,743,3,11,0,1,
2,138558,Abraham Baldwin Agricultural College,13,1,1,2,Abraham Baldwin Agricultural College,ABAC,2802 Moore Hwy,Tifton,...,3394.0,0.0,0,0,206,389,0,0,0,
3,488031,Abraham Lincoln University,6,3,1,1,Abraham Lincoln University,,3530 Wilshire Boulevard Suite 1430,Los Angeles,...,30.0,167.0,17,6,2,0,0,0,0,
4,172866,Academy College,27,3,1,1,Academy College,,1600 W. 82nd Street Suite 100,Bloomington,...,65.0,0.0,0,0,6,16,0,4,27,


In [52]:
# Transpose a two-row sample so the wide table reads vertically, one column per line.
data.head(2).T

Unnamed: 0,0,1
UnitID,177834,222178
Institution Name,A T Still University of Health Sciences,Abilene Christian University
FIPS state,29,48
Sector,2,2
Degree-granting status,1,1
Institution size category,2,3
Institution (entity) name,A T Still University of Health Sciences,Abilene Christian University
Institution name alias,,
Address,800 W Jefferson,
City,Kirksville,Abilene


In [53]:
# Columns removed from the working table: the degree-granting flag, the name
# alias, the certificate counts, and the trailing 'Unnamed: 24' column.
columns_to_drop = [
    'Degree-granting status',
    'Institution name alias',
    "Students receiving a Postbaccalaureate or Post-master's certificate",
    'Students receiving a certificate of 1 but less than 4-years',
    'Students receiving a certificate of less than 1-year',
    'Unnamed: 24',
]
data = data.drop(columns_to_drop, axis='columns')

In [54]:
# Re-check the transposed two-row sample now that the unwanted columns are gone.
data.head(2).T

Unnamed: 0,0,1
UnitID,177834,222178
Institution Name,A T Still University of Health Sciences,Abilene Christian University
FIPS state,29,48
Sector,2,2
Institution size category,2,3
Institution (entity) name,A T Still University of Health Sciences,Abilene Christian University
Address,800 W Jefferson,
City,Kirksville,Abilene
State abbreviation,MO,TX
FIPS county,29001,48441


In [55]:
# Load the value-labels file that accompanies the institutions export.
# The path is built from EXTERNAL_DATA, like the main data load, so the folder
# location is defined in exactly one place.
labels = pd.read_csv(EXTERNAL_DATA / "ValueLabels_10-3-2019---695.csv")

In [61]:
# Show the value labels whose variable name mentions "Sector",
# to see what each numeric sector code stands for.
is_sector_label = labels['VariableName'].str.contains("Sector")
labels[is_sector_label]

Unnamed: 0,VariableName,Value,ValueLabel
102,Sector of institution (HD2017),1,"Public, 4-year or above"
103,Sector of institution (HD2017),2,"Private not-for-profit, 4-year or above"
104,Sector of institution (HD2017),3,"Private for-profit, 4-year or above"


In [62]:
# Institution size codes (1-5) and their enrollment-band labels.
# NOTE(review): presumably taken from the value-labels file; confirm that no
# other codes occur in the data, since unmapped codes become NaN under `.map`.
size_cat = dict(
    enumerate(
        [
            'Under 1,000',
            '1,000 - 4,999',
            '5,000 - 9,999',
            '10,000 - 19,999',
            '20,000 and above',
        ],
        start=1,
    )
)

# Sector codes and short labels: Public, PnP (private not-for-profit),
# PfP (private for-profit).
sector_cat = dict(zip((1, 2, 3), ("Public", "PnP", "PfP")))

In [64]:
# Translate the numeric sector codes into their short labels.
# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time would silently blank the labelled column.
# The guard skips the mapping when labels are already present, which makes the
# cell safe to re-run; on a first run the result is unchanged.
if not data['Sector'].isin(list(sector_cat.values())).any():
    data['Sector'] = data['Sector'].map(sector_cat)

In [65]:
# Translate the numeric size codes into their enrollment-band labels.
# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time would silently blank the labelled column.
# The guard skips the mapping when labels are already present, which makes the
# cell safe to re-run; on a first run the result is unchanged.
if not data['Institution size category'].isin(list(size_cat.values())).any():
    data['Institution size category'] = data['Institution size category'].map(size_cat)

In [67]:
# Give the size column a short name now that it holds readable labels.
size_rename = {"Institution size category": "Size"}
data = data.rename(size_rename, axis="columns")

In [68]:
# Final look at the table before export: Sector and Size now show labels instead of codes.
data.head()

Unnamed: 0,UnitID,Institution Name,FIPS state,Sector,Size,Institution (entity) name,Address,City,State abbreviation,FIPS county,County name,Longitude,Latitude,Undergraduate enrollment,Graduate enrollment,Students receiving a PhD,Students receiving a MA,Students receiving a BA,Students receiving an AA
0,177834,A T Still University of Health Sciences,29,PnP,"1,000 - 4,999",A T Still University of Health Sciences,800 W Jefferson,Kirksville,MO,29001,Adair County,-92.589183,40.193648,0.0,3723.0,763,283,0,0
1,222178,Abilene Christian University,48,PnP,"5,000 - 9,999",Abilene Christian University,,Abilene,TX,48441,Taylor County,-99.709797,32.468943,3666.0,1479.0,3,256,743,3
2,138558,Abraham Baldwin Agricultural College,13,Public,"1,000 - 4,999",Abraham Baldwin Agricultural College,2802 Moore Hwy,Tifton,GA,13277,Tift County,-83.528281,31.481889,3394.0,0.0,0,0,206,389
3,488031,Abraham Lincoln University,6,PfP,"Under 1,000",Abraham Lincoln University,3530 Wilshire Boulevard Suite 1430,Los Angeles,CA,6037,Los Angeles County,-118.301362,34.061398,30.0,167.0,17,6,2,0
4,172866,Academy College,27,PfP,"Under 1,000",Academy College,1600 W. 82nd Street Suite 100,Bloomington,MN,27053,Hennepin County,-93.299809,44.855722,65.0,0.0,0,0,6,16


In [69]:
# Write the cleaned table to the processed-data folder as UTF-8 CSV without the index.
# The path is built from PROCESSED_DATA so the folder location is defined in one place,
# and the folder is created first so the export does not fail on a fresh checkout.
PROCESSED_DATA.mkdir(parents=True, exist_ok=True)
data.to_csv(PROCESSED_DATA / "colleges_in_US.csv", encoding = 'utf-8', index = False)