In [271]:
import json
import os
import random
import urllib.request
import warnings

import bs4 as bs
import gmplot
import numpy as np
import pandas as pd
from geopy.geocoders import GoogleV3
from shapely import wkt
from shapely.geometry import Point

In [272]:
# Source: https://www.doogal.co.uk/polylines.php
# Read the boundary vertices and flatten them into the body of a WKT ring:
# every CSV row becomes one space-separated vertex, vertices are comma-separated.
polylines_df = pd.read_csv('polylines.csv')
polylines_list = polylines_df.values.tolist()
serialized = json.dumps(polylines_list)
for old, new in ((",", ""), ("]]", ""), ("[", ""), ("]", ",")):
    # Order matters: the closing "]]" must be removed before "]" is turned into ","
    serialized = serialized.replace(old, new)
polylines_string = serialized

# Wrap the ring in MULTIPOLYGON syntax and let shapely parse it
wkt_text = 'MULTIPOLYGON (((' + polylines_string + ')))'
multipolygon_df = pd.DataFrame({'geom': [wkt_text]})
multipolygon_df['geom'] = multipolygon_df['geom'].map(wkt.loads)
multipolygon = multipolygon_df['geom'].iloc[0]

In [273]:
def generate_random(number, polygon, center=(25.7574, -80.3733), spread=(0.20, 0.15)):
    """Rejection-sample `number` coordinate pairs that fall inside `polygon`.

    Candidates are drawn from two independent normal distributions and kept
    only when `polygon.contains` accepts them.

    Parameters
    ----------
    number : int
        How many accepted pairs to return. Zero or a negative value yields
        an empty list.
    polygon : shapely geometry
        Any object exposing `contains(Point)`.
    center : tuple of float, optional
        Means of the first and second coordinate. The default is the
        Florida International University (latitude, longitude) used by the
        notebook.
    spread : tuple of float, optional
        Standard deviations of the first and second coordinate.

    Returns
    -------
    list of tuple
        Each entry is a pair of 1-element NumPy arrays, exactly as drawn.
        Downstream cells rely on that shape, so it is deliberately kept.

    Notes
    -----
    The loop only ends once enough candidates were accepted; a polygon far
    away from `center` can therefore make this run for a very long time.
    """
    points = []
    while len(points) < number:
        # Draw order (first coordinate, then second) is kept so that a seeded
        # NumPy generator reproduces the same samples as before.
        x = np.random.normal(center[0], spread[0], 1)
        y = np.random.normal(center[1], spread[1], 1)
        pt = x, y
        if polygon.contains(Point(pt)):
            points.append(pt)
    return points

# Draw 20000 accepted (Latitude, Longitude) pairs that lie inside the boundary polygon
coordinates = generate_random(number=20000, polygon=multipolygon)

In [274]:
# Flatten the sampled pairs into a plain (n, 2) float array before saving.
# generate_random returns every coordinate as a 1-element NumPy array; writing
# those objects straight to CSV stores their printed form ("[25.75741235]"),
# which is cut to NumPy's print precision and has to be string-parsed on reload.
coordinate_array = np.asarray(coordinates, dtype=float).reshape(-1, 2)
df = pd.DataFrame(coordinate_array)

# save coordinate data to csv
df.to_csv("coordinates.csv", sep=',', index=False, header=['Latitude', 'Longitude'])

# Read and Load Data
data = pd.read_csv('coordinates.csv', sep=',')

# The file now holds plain numbers; the cast only pins the dtype (e.g. for an empty file)
data = data.astype({'Latitude': float, 'Longitude': float})

In [275]:
# Google API Key: read from the environment so the secret is never stored in
# the notebook. NOTE(review): the key that used to be embedded in this cell
# should be treated as leaked and rotated.
googlekey = os.environ.get("GOOGLE_API_KEY")
if not googlekey:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this notebook.")

# Plot Central Location (FIU MMC CAMPUS)
# The key is handed to the constructor: assigning `gmap.apikey` afterwards is
# ignored by gmplot releases that keep the key in a private attribute.
gmap = gmplot.GoogleMapPlotter(25.7574, -80.3733, 15, apikey=googlekey)
gmap.marker(25.7574, -80.3733, color='red')

# Loop over and mark every sampled Latitude/Longitude pair on the map
for latitude, longitude in data.itertuples(index=False):
    gmap.marker(latitude, longitude, color='cornflowerblue')

# Save Generated Map to a Given Directory
gmap.draw('map.html')

In [None]:
geolocator = GoogleV3(api_key=googlekey)

# Converts coordinates to nearest address calling Google's Geocode API
addresses = []
for coordinate in coordinates:
    # Each sampled coordinate is a pair of 1-element arrays; hand plain floats to geopy
    latitude, longitude = np.asarray(coordinate, dtype=float).ravel()
    location = geolocator.reverse((float(latitude), float(longitude)), exactly_one=True)
    if location is None:
        # No address found for this point: skip it instead of failing on `.raw`
        continue
    # JSON round trip stores an independent, plain-Python copy of the raw response
    addresses.append(json.loads(json.dumps(location.raw)))

In [240]:
# Keep only the geocoder results whose first listed type is a street address or a premise
wanted_types = ['street_address', 'premise']
filter_types = [address for address in addresses if address['types'][0] in wanted_types]

In [241]:
# Flatten each geocoder result to the four fields carried forward:
# place id, formatted address and the latitude/longitude of the match
formatted_data = [
    {
        'place_id': entry['place_id'],
        'formatted_address': entry['formatted_address'],
        'latitude': entry['geometry']['location']['lat'],
        'longitude': entry['geometry']['location']['lng'],
    }
    for entry in filter_types
]

df = pd.DataFrame(formatted_data)

In [242]:
# This block eliminates duplicate addresses: exactly one row (the first one
# encountered) is kept per place_id, and the index is renumbered 0..n-1.
# An index-aligned concat with the list of unique ids would only truncate the
# frame to its first n rows and leave duplicates in place, so the rows are
# selected by value instead.
df = df.drop_duplicates(subset='place_id', keep='first').reset_index(drop=True)
df = df.iloc[:, :4]

In [243]:
# Web Scrape current values and demographics for FIU
enrollment_url = 'https://www.collegetuitioncompare.com/edu/133951/florida-international-university/enrollment/'
with urllib.request.urlopen(enrollment_url) as response:
    # The context manager closes the HTTP connection once the page is read
    source = response.read()
soup = bs.BeautifulSoup(source, 'lxml')

# First table on the page; the row positions below are tied to the page layout
table = soup.table
table_rows = table.find_all('tr')

total = int((table_rows[1].find_all('td')[0].text).replace(",", ""))
women = int((table_rows[3].find_all('td')[0].text).replace(",", ""))

frac_women = women / total

# This split handles the women (fixed seed keeps the draw reproducible)
first_split = df.sample(frac=frac_women, random_state=200)

# This split handles the remaining (men)
second_split = df.drop(first_split.index)

# Assign genders based on proportion
first_split = first_split.assign(gender='F')
second_split = second_split.assign(gender='M')

# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
df = pd.concat([first_split, second_split], sort=False)

In [244]:
# add index as column to dataframe (mimic of a student id)
df['sid'] = df.index

# hypothesized parameters
# Only frac_student is a share of the whole population; frac_faculty and
# frac_support are each applied to whatever is left after the previous split.
frac_student = 0.55
frac_faculty = 0.05
frac_support = 0.50

first_split_student = df.sample(frac=frac_student, random_state=200)

remaining = df.drop(first_split_student.index)
second_split_faculty = remaining.sample(frac=frac_faculty, random_state=200)

remaining = remaining.drop(second_split_faculty.index)
third_split_support = remaining.sample(frac=frac_support, random_state=200)

# Everyone not picked so far is administrative staff
fourth_split_admin = remaining.drop(third_split_support.index)

# Define split with values
first_split_student = first_split_student.assign(classification='Student')
second_split_faculty = second_split_faculty.assign(classification='Faculty')
third_split_support = third_split_support.assign(classification='Support')
fourth_split_admin = fourth_split_admin.assign(classification='Administrative')

# combine dataframe (pd.concat replaces DataFrame.append, removed in pandas 2.x)
df = pd.concat(
    [first_split_student, second_split_faculty, third_split_support, fourth_split_admin],
    sort=False,
)

In [245]:
# Validation of hypothesized proportions: share of rows per classification, rounded to 2 decimals
for label in ('Student', 'Faculty', 'Support', 'Administrative'):
    share = len(df.loc[df['classification'] == label]) / len(df)
    print('{} Proportion: {}'.format(label, round(share, 2)))

Student Proportion: 0.55
Faculty Proportion: 0.02
Support Proportion: 0.21
Administrative Proportion: 0.21


In [246]:
# Suppress FutureDeprecation Warning (coming from helper function blocks.py in lib of pandas)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Repeating ID's indicates household size
place_counts = df.groupby('place_id').size()
# Addresses shared by more than one row, largest household first
filtered = (
    place_counts[place_counts > 1]
    .sort_values(ascending=False)
    .rename('count')
    .reset_index()
)

# Household size per row: the number of rows sharing the address when it is
# repeated, 0 (placeholder) otherwise. The column is addressed by name rather
# than by position, so the result does not depend on the column layout.
residents = df['place_id'].map(place_counts).fillna(0).astype(int)
household = residents.where(residents > 1, 0)

# Remainder will be between [1, 4] (one random draw per row, in row order)
needs_random = household == 0
household.loc[needs_random] = [random.randint(1, 4) for _ in range(int(needs_random.sum()))]

df['household'] = household.values

In [247]:
# initiate/create age column
# students ages between [18, 25]
# all others (faculty, support, admin) ages between [35, 60]
age_ranges = {
    'Student': (18, 25),
    'Faculty': (35, 60),
    'Support': (35, 60),
    'Administrative': (35, 60),
}

# One random draw per row, in row order; a row with any other classification
# keeps the initial value 0. The column is written by name, so the result does
# not depend on 'age' sitting at a particular column position.
df['age'] = [
    random.randint(*age_ranges[role]) if role in age_ranges else 0
    for role in df['classification']
]

In [248]:
# Customized .csv containing subject of courses and course abbreviations
# (first column is read as the subject, second as the abbreviation)
coursedf = pd.read_csv('offeredcourses.csv').values.tolist()

# Generates 4 levels for each course abbreviation.
# Rows at odd positions get versions '5'..'8', rows at even positions '1'..'4';
# the course code is always the abbreviation followed by the level 1..4.
catalog = []
for row_number, row in enumerate(coursedf):
    version_offset = 4 if row_number % 2 else 0
    for level in range(1, 5):
        catalog.append({
            'subject': row[0],
            'version': str(level + version_offset),
            'course': row[1] + str(level),
        })

# catalog of courses
catalogdf = pd.DataFrame(catalog)

# dataframe of unique subjects, numbered from 1 in order of first appearance
subjectdf = pd.DataFrame(catalogdf['subject'].unique()).reset_index()
subjectdf = subjectdf.rename(columns={'index': 'ID', 0: 'subject'})
subjectdf['ID'] = subjectdf['ID'] + 1

In [249]:
# isolate faculty data from dataframe
facultydf = df.loc[df['classification'] == 'Faculty'].reset_index()

# assigns random subject id for each faculty
# Subject ids run from 1 to len(subjectdf); drawing from exactly that range
# means every subject can be picked and no faculty member is lost in the
# merge below because of an id that has no subject.
randomdf = pd.Series(
    np.random.randint(1, len(subjectdf) + 1, size=len(facultydf)),
    name='ID',
)

# retrieves the subject name by linking subject id
result = pd.concat([facultydf, randomdf], axis=1)
mergedf = pd.merge(result, subjectdf, on='ID')

In [250]:
# assign courses to faculty based on subject
# For every faculty member a catalog version is drawn at random and the
# course(s) of their subject carrying that version are looked up.
# The catalog numbers versions '1'..'8', so the draw has to include 8
# (randrange excludes its upper bound).
teach = []
for subject in mergedf['subject']:
    version = str(random.randrange(1, 9))
    matches = catalogdf.loc[
        (catalogdf['subject'] == subject) & (catalogdf['version'] == version),
        'course',
    ]
    teach.append(matches.values)

teachdf = pd.DataFrame(teach).rename(columns={0: 'course'})

# new faculty dataframe with subject and courses they instruct
facultydf = pd.concat([mergedf, teachdf], axis=1)

In [251]:
# Class table: one row per faculty member with their id, subject and course,
# plus a 1-based 'registerid' derived from the row position
teachcoursedf = (
    facultydf[['sid', 'subject', 'course']]
    .reset_index()
    .rename(columns={'index': 'registerid'})
)
teachcoursedf['registerid'] = teachcoursedf['registerid'] + 1

In [252]:
# creates a matrix of classes to assign to student as "registration"
# Row s holds four consecutive 1-based class ids starting at position s and
# wrapping around the class list. The modulo keeps every id inside
# 1..class_count even when fewer than four classes exist (ids then repeat
# within a row).
student_count = int((df['classification'] == 'Student').sum())
class_count = len(teachcoursedf)

if student_count and not class_count:
    raise ValueError("Cannot register students: teachcoursedf contains no classes.")

registrymatrix = [
    [(student + offset) % class_count + 1 for offset in range(4)]
    for student in range(student_count)
]

registrymatrixdf = pd.DataFrame(registrymatrix)

In [253]:
studentdf = df.loc[df['classification'] == 'Student'].reset_index()

registrymatrixdf = pd.DataFrame(registrymatrix)

# One frame per registration slot: student id next to the class id of that slot
frames = [
    pd.concat([studentdf[['sid']], registrymatrixdf[[slot]]], axis=1)
    .rename(columns={slot: 'courseid'})
    for slot in range(4)
]
frame1, frame2, frame3, frame4 = frames

# Puts the matrix in a tabular format to represent the classes students are registered for
finalframe = pd.concat(frames, ignore_index=True, sort=False)

In [270]:
# Extract Zip Codes from Full Address:
# take the text after the last 'FL ' and keep everything before the next comma
zip_values = [
    address.split('FL ')[-1].split(",")[0]
    for address in df.formatted_address
]

zipcodes = pd.DataFrame(zip_values)
zipcodes.columns = ['zipcode']

# reset index and preserve structure (positional index so the concat below lines up row by row)
copy = df.reset_index().drop(columns='index')
# Append to full dataframe
df = pd.concat([copy, zipcodes], axis=1)

# Finally, Save the Results to csv file!
df.to_csv('people.csv')