In [34]:
import pandas as pd
import numpy as np
import os
import re
import pickle
from bs4 import BeautifulSoup as soup
import requests as rq
import time
import h5py
import scipy as sp
import scipy.stats

# EDA

## Pulling in Data

The data collected is a mix of HDF5, csv and web scraped data about the Bay area.  The CSVs are for particle concentraions, HDF5s for CO2 ppm, and the web scraped data is for empty lot/apartments for sale in the San Fransisco.

In [2]:
#starting off by reading in the csv files
aqcsvs = []
aqcsvs += ['airqualitycsvs/'+each for each in os.listdir('airqualitycsvs/') if each.endswith('.csv')]


#getting all H5 files for the project
results = []
results += ['../../../../Downloads/'+each for each in os.listdir('../../../../Downloads/') if each.endswith('.h5')]

In [3]:
aqcsvs

['airqualitycsvs/annual_conc_by_monitor_2018.csv',
 'airqualitycsvs/annual_conc_by_monitor_2017.csv',
 'airqualitycsvs/annual_conc_by_monitor_2016.csv',
 'airqualitycsvs/annual_conc_by_monitor_2014.csv',
 'airqualitycsvs/annual_conc_by_monitor_2015.csv']

In [4]:
aq= pd.read_csv(aqcsvs[0])
for i in range(1,len(aqcsvs)):
    aq.append(pd.read_csv(aqcsvs[i]))

In [5]:
#Filtering out unneed data
aq = aq[aq['State Name']=='California']
aq = aq[aq['CBSA Name']=='San Francisco-Oakland-Hayward, CA']
aq.reset_index(drop=True,inplace=True)
aq.drop(['State Code', 'County Code','State Name','City Name','County Name', 'Datum'], axis=1)

Unnamed: 0,Site Num,Parameter Code,POC,Latitude,Longitude,Parameter Name,Sample Duration,Pollutant Standard,Metric Used,Method Name,...,98th Percentile,95th Percentile,90th Percentile,75th Percentile,50th Percentile,10th Percentile,Local Site Name,Address,CBSA Name,Date of Last Change
0,7,42601,1,37.687526,-121.784217,Nitric oxide (NO),1 HOUR,,Observed Values,INSTRUMENTAL - CHEMILUMINESCENCE,...,54.200,36.600,22.900,7.900,1.100,-0.100,Livermore,793 Rincon Ave.,"San Francisco-Oakland-Hayward, CA",2018-04-04
1,7,42602,1,37.687526,-121.784217,Nitrogen dioxide (NO2),1 HOUR,NO2 1-hour,Daily Maximum 1-hour average,INSTRUMENTAL - CHEMILUMINESCENCE,...,33.300,32.800,32.100,28.200,24.500,14.600,Livermore,793 Rincon Ave.,"San Francisco-Oakland-Hayward, CA",2018-04-04
2,7,42602,1,37.687526,-121.784217,Nitrogen dioxide (NO2),1 HOUR,NO2 Annual 1971,Observed values,INSTRUMENTAL - CHEMILUMINESCENCE,...,30.200,27.100,24.900,18.700,10.200,2.800,Livermore,793 Rincon Ave.,"San Francisco-Oakland-Hayward, CA",2018-04-04
3,7,42603,1,37.687526,-121.784217,Oxides of nitrogen (NOx),1 HOUR,,Observed Values,INSTRUMENTAL - CHEMILUMINESCENCE,...,79.800,58.900,45.700,28.700,11.600,3.000,Livermore,793 Rincon Ave.,"San Francisco-Oakland-Hayward, CA",2018-04-04
4,7,43207,3,37.687526,-121.784217,Freon 113,24 HOUR,,Observed Values,SS 6L Pressurized Canister - Cryogenic Precon ...,...,0.148,0.148,0.148,0.148,0.144,0.128,Livermore,793 Rincon Ave.,"San Francisco-Oakland-Hayward, CA",2018-05-10
5,7,43218,3,37.687526,-121.784217,"1,3-Butadiene",24 HOUR,,Observed Values,SS 6L Pressurized Canister - Cryogenic Precon ...,...,0.296,0.296,0.296,0.236,0.000,0.000,Livermore,793 Rincon Ave.,"San Francisco-Oakland-Hayward, CA",2018-05-10
6,7,43302,3,37.687526,-121.784217,Ethyl alcohol,24 HOUR,,Observed Values,SS 6L Pressurized Canister - Cryogenic Precon ...,...,35.758,35.758,35.758,12.552,5.426,1.710,Livermore,793 Rincon Ave.,"San Francisco-Oakland-Hayward, CA",2018-05-10
7,7,43551,3,37.687526,-121.784217,Acetone,24 HOUR,,Observed Values,SS 6L Pressurized Canister - Cryogenic Precon ...,...,23.724,23.724,23.724,16.761,14.907,5.706,Livermore,793 Rincon Ave.,"San Francisco-Oakland-Hayward, CA",2018-05-10
8,7,43552,3,37.687526,-121.784217,Methyl ethyl ketone,24 HOUR,,Observed Values,SS 6L Pressurized Canister - Cryogenic Precon ...,...,2.152,2.152,2.152,1.496,1.108,0.000,Livermore,793 Rincon Ave.,"San Francisco-Oakland-Hayward, CA",2018-05-10
9,7,43702,3,37.687526,-121.784217,Acetonitrile,24 HOUR,,Observed Values,SS 6L Pressurized Canister - Cryogenic Precon ...,...,0.216,0.216,0.216,0.000,0.000,0.000,Livermore,793 Rincon Ave.,"San Francisco-Oakland-Hayward, CA",2018-05-10


In [56]:
aq[(aq['City Name']=='Oakland')| (aq['City Name']=='Richmond')|(aq['City Name']=='San Pablo')|(aq['City Name']=='San Francisco')|(aq['City Name']==['San Rafael'])]


TypeError: unhashable type: 'list'

In [52]:
aq['City Name'].unique()

array(['Livermore', 'Oakland', nan, 'Concord', 'Richmond', 'Rodeo',
       'Crockett', 'Bethel Island', 'San Pablo', 'Martinez', 'San Rafael',
       'Lagunitas-Forest Knolls', 'San Francisco', 'Redwood City'],
      dtype=object)

In [22]:
def h5ToDF(file):
    f = h5py.File(file, 'r')
    date = [pd.to_datetime(str(x).replace('b\'','').replace('\'','')) for x in list(f['RetrievalHeader']['retrieval_time_string'])]
    latlong = [(x,y) for x,y in zip(f['RetrievalGeometry']['retrieval_latitude_geoid'],f['RetrievalGeometry']['retrieval_longitude_geoid'])]
    co2ppm = list(f['RetrievalResults']['xco2'])
    co2_uncert = [x*10**6 for x in list(f['RetrievalResults']['xco2_uncert'])]
    df = pd.DataFrame({'date':date,
        'latlong': latlong,
        'co2ppm': co2ppm,
        'co2_uncert': co2_uncert
    })
    return df
                                                                                    
                                                                                    
                                                                                    

In [23]:
df = f = h5py.File(results[0], 'r')
date = [pd.to_datetime(str(x).replace('b\'','').replace('\'','')) for x in list(f['RetrievalHeader']['retrieval_time_string'])]
latlong = [(x,y) for x,y in zip(f['RetrievalGeometry']['retrieval_latitude_geoid'],f['RetrievalGeometry']['retrieval_longitude_geoid'])]
co2ppm = list(f['RetrievalResults']['xco2'])
co2_uncert = [x*10**6 for x in list(f['RetrievalResults']['xco2_uncert'])]
df = pd.DataFrame({'date':date,
                'latlong': latlong,
                'co2ppm': co2ppm,
                'co2_uncert': co2_uncert})
for i in range(1,len(results)):
    df.append(h5ToDF(results[i]))

In [24]:
df['co2ppm'] = [x*10**6 for x in df['co2ppm']]

In [28]:
df['co2_uncert'] = [float(x) for x in df['co2_uncert']]

In [29]:
df.head()

Unnamed: 0,co2_uncert,co2ppm,date,latlong
0,0.886318,402.186939,2016-11-13 20:26:38.363,"(-83.56647, 38.90449)"
1,0.786782,398.468226,2016-11-13 20:26:38.390,"(-83.59027, 39.013676)"
2,0.780767,396.430114,2016-11-13 20:26:38.417,"(-83.61452, 39.12222)"
3,0.77743,398.074626,2016-11-13 20:26:38.562,"(-83.46712, 38.238407)"
4,0.7994,399.525161,2016-11-13 20:26:38.616,"(-83.51127, 38.46135)"


In [30]:
#Writing a pickle file to save data
import pickle
with open('my_data.pkl', 'wb') as picklefile:
    pickle.dump(df, picklefile)

In [None]:
#list(f['RetrievalGeometry']['retrieval_latitude_geoid'])

## Web Scraping

Pulling data from loopnet.com for areas where we can greenify citys (make them more environmentally friendly). 

In [97]:
#initializing sites that I will pull from
flexspace = 'http://www.loopnet.com/california/san-francisco_flex-space-for-sale/'
land = 'http://www.loopnet.com/california/san-francisco_land-for-sale/'
apts = 'http://www.loopnet.com/california/san-francisco_apartment-buildings-for-sale/'

In [98]:
response = rq.get(land)
page = response.text
reviewp = soup(page,"lxml")


In [99]:
reviewp.find_all(class_='listingTitle')

[<span class="listingTitle">0.05 Acre Site for Proposed Apartments</span>,
 <span class="listingTitle">1228 Folsom Street</span>,
 <span class="listingTitle">1876 Oakdale Ave</span>,
 <span class="listingTitle">1791 Mission Street</span>,
 <span class="listingTitle">240 7th St</span>,
 <span class="listingTitle">1924 Mission St</span>,
 <span class="listingTitle">2435-2445 16th St</span>,
 <span class="listingTitle">Topaz Way</span>,
 <span class="listingTitle">Portfolio of 4 Properties</span>,
 <span class="listingTitle">1234 Shoreline Hwy</span>,
 <span class="listingTitle">81 Ervine St</span>,
 <span class="listingTitle">OCTAVIA M+N</span>,
 <span class="listingTitle">250 Church St</span>,
 <span class="listingTitle">Mission and 22nd</span>]

## Exploring

Exploring the data, kicking out points that are wrong based on the error and other stats.

In [33]:
print('Max: '+ str(max(df['co2_uncert'])))
print('Min: ' + str(min(df['co2_uncert'])))
print('Average: '+ str(np.average(df['co2_uncert'])))
print('Median: '+ str(np.median(df['co2_uncert'])))

Max: 13.534934623748995
Min: 0.1912240037427182
Average: 0.4512936642967768
Median: 0.4234372852351953


In [46]:
np.percentile(df['co2_uncert'], 99.5)

1.041737527884833

In [51]:
#Removing Outliers because of large error associated with the calculation
df= (df[df['co2_uncert']<1.04])