### Getting the Image Data

In [None]:
"""
This cell completes the necessary web scraping to obtain our randomized street
view dataset, gathering street images and their associated zip codes
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv

#Create list to store [image index, zip code] rows
zip_list = []

#Initialize driver
url = 'https://randomstreetview.com/us'
DRIVER = 'chromedriver'
#NOTE(review): recent Selenium 4 releases appear to no longer accept the driver
#path positionally -- confirm the installed version before relying on this call
driver = webdriver.Chrome(DRIVER)
num_images = 100

try:
    #generate num_images random locations
    for i in range(num_images):
        #get random street view webpage
        driver.get(url)
        #fixed pause before capturing -- presumably to let the page finish rendering
        time.sleep(3)
        #grab screenshot of page; the file index i matches the csv row index
        driver.save_screenshot(f'project/images/location{i}.png')
        #scrape address
        address = driver.find_element(By.ID, 'address').text
        #isolate zipcode: last 5 characters of the second-to-last comma-separated
        #field (assumes addresses end like '..., ST 12345, USA' -- TODO confirm)
        zipcode = address.split(',')[-2][-5:]
        zip_list.append([i, zipcode])
finally:
    #always release the browser, even when a page load or element lookup fails
    driver.quit()

#write zip codes to csv; newline='' lets the csv module control line endings
#(otherwise blank rows appear between records on Windows)
with open('project/zipcodes.csv', 'w', newline='') as outfile:
    csv.writer(outfile).writerows(zip_list)

ModuleNotFoundError: ignored

In [None]:
"""
This cell processes the scraped street view images, cropping them to remove
the website UI and then resizing them to 250x170x3 JPEG images
"""

from PIL import Image
from os import listdir

from os.path import join, splitext

folder = 'drive/MyDrive/Class/CS209b/CS209b_Final_Project/trial_data/croppedimages/' #'project/images'
#number of images converted so far
counter = 0

for image in listdir(folder):
    #listdir returns bare file names, so the folder must be prepended before
    #opening; the context manager closes the source file once it is converted
    with Image.open(join(folder, image)) as im:
        #crop image to remove site UI; box is (left, upper, right, lower) in pixels
        cropped = im.crop((500, 100, 2400, 1400))
        #resize to 250x170 for CNN
        resized = cropped.resize((250,170))
        #convert to RGB
        rgb = resized.convert('RGB')
        #keep the source file's own name: listdir order is arbitrary, so
        #renumbering here would break the link between an image's index and
        #the row with the same index in the zip code csv
        path = f'project/croppedimages/{splitext(image)[0]}.jpg'
        #save new image
        rgb.save(path)
    counter += 1

FileNotFoundError: ignored

### Processing Image Data

In [None]:
# Colab-specific setup: mount Google Drive at /content/drive so that files
# stored on Drive can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project folder on the mounted Drive. The path is relative, so it resolves
# against the current working directory.
# NOTE(review): presumably /content when run in Colab -- confirm.
daniela_path = 'drive/MyDrive/Class/CS209b/CS209b_Final_Project'
# Alias used by later cells when building file paths.
path = daniela_path

In [None]:
#Future data preprocessing happens here

In [None]:
#Define inception block for GoogleNet

def inception_block(input_tensor, filters):
    """Build one four-branch Inception module on top of ``input_tensor``.

    ``filters`` is read by position:
        filters[0]  - filters of the stand-alone 1x1 convolution
        filters[1]  - filters of the 1x1 convolution feeding the 3x3 branch
        filters[2]  - filters of the 3x3 convolution
        filters[3]  - filters of the 1x1 convolution feeding the 5x5 branch
        filters[4]  - filters of the 5x5 convolution
        filters[5]  - filters of the 1x1 convolution applied after max-pooling

    Returns the four branch outputs concatenated along the last axis.
    """
    def pointwise(width, tensor):
        # 1x1 ReLU convolution
        return Conv2D(width, (1,1), activation='relu')(tensor)

    def spatial(width, size, tensor):
        # size x size ReLU convolution, 'same' padding so branches stay aligned
        return Conv2D(width, (size,size), padding='same', activation='relu')(tensor)

    # 1x1 branch
    direct = pointwise(filters[0], input_tensor)

    # 1x1 -> 3x3 branch
    three_by_three = pointwise(filters[1], input_tensor)
    three_by_three = spatial(filters[2], 3, three_by_three)

    # 1x1 -> 5x5 branch
    five_by_five = pointwise(filters[3], input_tensor)
    five_by_five = spatial(filters[4], 5, five_by_five)

    # 3x3 max-pool (stride 1) -> 1x1 branch
    pooled = MaxPool2D((3,3), strides=(1,1), padding='same')(input_tensor)
    pooled = pointwise(filters[5], pooled)

    return concatenate([direct, three_by_three, five_by_five, pooled], axis=-1)

In [None]:
#Define auxiliary block with binary classification output

def auxiliary_block(input_tensor, aux_name):
    """Attach an auxiliary classification head to ``input_tensor``.

    The head is: 5x5 average pooling (stride 3) -> 1x1 convolution with 128
    filters -> flatten -> 1024-unit dense layer -> dropout (rate 0.7) ->
    2-unit output layer.

    Parameters
    ----------
    input_tensor
        Tensor the head is attached to.
    aux_name : str
        Name given to the output layer.

    Returns
    -------
    The output tensor of the 2-unit layer named ``aux_name``.
    """
    X = AveragePooling2D(pool_size = (5,5), strides = (3,3), padding='same')(input_tensor)
    X = Conv2D(filters = 128, kernel_size = (1,1), padding = 'same', activation = 'relu')(X)
    X = Flatten()(X)
    X = Dense(1024, activation = 'relu')(X)
    X = Dropout(0.7)(X)
    # 'binary_crossentropy' is a loss function, not an activation, so it cannot
    # be used as the layer's activation. Softmax over the two units keeps the
    # (batch, 2) output shape and yields class probabilities.
    # NOTE(review): pair with a categorical cross-entropy loss at compile
    # time -- confirm against the label encoding.
    output_tensor = Dense(2, activation = 'softmax', name = aux_name)(X)

    return output_tensor

In [None]:
#Define GoogleNet with binary classification output

def GoogleNet(input_shape):
    """Assemble a GoogLeNet-style network with one main and two auxiliary outputs.

    Parameters
    ----------
    input_shape
        Shape passed to ``Input`` for a single sample.

    Returns
    -------
    A ``Model`` whose outputs are, in order, ``main_output``, ``aux_output1``
    and ``aux_output2``; each is a 2-unit layer.
    """
    inp = Input(shape=input_shape)

    # Stem: strided 7x7 convolution, then 1x1 and 3x3 convolutions with pooling
    X = Conv2D(64, (7,7), strides=(2,2), padding='same', activation='relu')(inp)
    X = MaxPool2D((3,3), strides=(2,2), padding='same')(X)
    X = Conv2D(64, (1,1), activation='relu')(X)
    X = Conv2D(192, (3,3), padding='same', activation='relu')(X)
    X = MaxPool2D((3,3), strides=(2,2), padding='same')(X)

    # First Inception blocks (x2)
    X = inception_block(X, [64, 96, 128, 16, 32, 32])
    X = inception_block(X, [128, 128, 192, 32, 96, 64])
    X = MaxPool2D((3,3), strides=(2,2), padding='same')(X)

    # Second Inception blocks (x5)
    X = inception_block(X, [192, 96, 208, 16, 48, 64])
    X = inception_block(X, [160, 112, 224, 24, 64, 64])
    X1 = auxiliary_block(X, aux_name = 'aux_output1') # First Auxiliary block

    X = inception_block(X, [128, 128, 256, 24, 64, 64])
    X = inception_block(X, [112, 144, 288, 32, 64, 64])
    X = inception_block(X, [256, 160, 320, 32, 128, 128])
    X2 = auxiliary_block(X, aux_name = 'aux_output2') # Second Auxiliary block

    # Third Inception blocks (x2)
    X = MaxPool2D((3,3), strides=(2,2), padding='same')(X)
    X = inception_block(X, [256, 160, 320, 32, 128, 128])
    X = inception_block(X, [384, 192, 384, 48, 128, 128])
    X = GlobalAvgPool2D()(X)
    # 'binary_crossentropy' is a loss function, not an activation, so it cannot
    # be used here. Softmax over the two units keeps the (batch, 2) output
    # shape and yields class probabilities.
    X = Dense(2, activation='softmax', name='main_output')(X)
    model = Model(inputs=inp, outputs=[X, X1, X2])

    return model

In [None]:
import pandas as pd
# Load outcome.csv from the project folder
y = pd.read_csv(f"{path}/outcome.csv")

In [None]:
# Load trial.csv from the project's trial_data sub-folder
trial_data = pd.read_csv(f"{path}/trial_data/trial.csv")

In [None]:
# FIXME: this assignment was left without a right-hand side, which is a
# SyntaxError and stops the notebook from running top to bottom. It is
# disabled until the intended expression is filled in.
# y = ...

In [None]:
# Display the 'Congressional District' column of the trial data
trial_data['Congressional District']

Unnamed: 0,ID,ZIP,Latitude,Longitude,Accuracy Type,City,State,County,Zip,Country,...,Full FIPS (tract),Metro/Micro Statistical Area Name,Metro/Micro Statistical Area Code,Metro/Micro Statistical Area Type,Combined Statistical Area Name,Combined Statistical Area Code,Metropolitan Division Area Name,Metropolitan Division Area Code,Congressional District,Congress
0,0,11940,40.80923,-72.75964,place,East Moriches,NY,Suffolk County,11940,US,...,36103159602,"New York-Newark-Jersey City, NY-NJ-PA",35620.0,metropolitan,"New York-Newark, NY-NJ-CT-PA",408.0,"Nassau County-Suffolk County, NY",35004.0,NY2,118th
1,1,46072,40.311418,-86.063107,place,Tipton,IN,Tipton County,46072,US,...,18159020402,,,,,,,,IN5,118th
2,2,80108,39.445502,-104.852987,place,Castle Rock,CO,Douglas County,80108,US,...,8035014016,"Denver-Aurora-Lakewood, CO",19740.0,metropolitan,"Denver-Aurora, CO",216.0,,,CO4,118th
3,3,98004,47.615471,-122.207221,place,Bellevue,WA,King County,98004,US,...,53033024002,"Seattle-Tacoma-Bellevue, WA",42660.0,metropolitan,"Seattle-Tacoma, WA",500.0,"Seattle-Bellevue-Kent, WA",42644.0,WA1,118th
4,4,67054,37.6028,-99.29261,place,Greensburg,KS,Kiowa County,67054,US,...,20097969100,,,,,,,,KS4,118th


In [None]:
def add_image(img_count):
    """Load one image together with its label.

    Parameters
    ----------
    img_count : int
        Index of the image; selects ``location{img_count}.jpg`` inside
        ``folder`` and the row labelled ``img_count`` in ``y``.

    Returns
    -------
    tuple
        ``(image, label)`` where ``image`` is the opened PIL image and
        ``label`` is the ``DEM`` value of the matching row of ``y``.
    """
    import os

    # Open relative to the image folder: the bare file name only resolves when
    # the working directory happens to be that folder.
    im = Image.open(os.path.join(folder, f'location{img_count}.jpg'))
    # Read the pixel data now instead of lazily, so that Pillow does not keep
    # one open file handle per image while the dataset is being built.
    im.load()
    return (im, y.loc[img_count].DEM)
  
import re

# listdir() yields file names, but add_image() expects the numeric index
# embedded in 'location<N>.jpg', so pull the indices out of the names first
# (files that do not follow that pattern are ignored).
image_ids = sorted(
    int(match.group(1))
    for match in (re.fullmatch(r'location(\d+)\.jpg', name) for name in listdir(folder))
    if match
)
# Build a list rather than a bare map(): a map object is a one-shot iterator
# and would be empty after the first pass over it.
train_df = [add_image(img_id) for img_id in image_ids]