In [1]:
import pandas as pd

In [2]:
#Falling Fruit Data from:
#https://www.fallingfruit.org/data?c=forager%2Cfreegan&locale=en
#used BZ2 to ZIP converter https://cloudconvert.com/bz2-to-zip

In [3]:
#import types data
types_df = pd.read_csv('Resources/types.csv')

In [4]:
#there are dumpsters listed on there and we don't want those
types_df = types_df[types_df['en_name'] != 'Dumpster (edible)']

In [5]:
##select fewer columns
types_df = types_df[['id', 'parent_id', 'scientific_name', 'en_name', 'en_wikipedia_url']]
types_df = types_df.rename(columns={'id': 'type_ids'})
types_df.dropna()

Unnamed: 0,type_ids,parent_id,scientific_name,en_name,en_wikipedia_url
0,1,285.0,Prunus,Plum,http://en.wikipedia.org/wiki/Plum
2,3,263.0,Citrus x sinensis,Orange,http://en.wikipedia.org/wiki/Citrus_sinensis
3,4,263.0,Citrus x limon,Lemon,http://en.wikipedia.org/wiki/Citrus_limon
4,5,263.0,Citrus x paradisi,Grapefruit,http://en.wikipedia.org/wiki/Citrus_paradisi
9,11,263.0,Citrus maxima,Pomelo,http://en.wikipedia.org/wiki/Citrus_maxima
12,14,114.0,Malus pumila,Apple,http://en.wikipedia.org/wiki/Malus_domestica
17,19,263.0,Citrus japonica,Kumquat,http://en.wikipedia.org/wiki/Citrus_japonica
18,20,445.0,Ficus carica,Common fig,http://en.wikipedia.org/wiki/Ficus_carica
19,23,263.0,Citrus reticulata,Mandarin,http://en.wikipedia.org/wiki/Mandarin_orange
21,25,1387.0,Citrus medica var. sarcodactyli,Buddha's hand,http://en.wikipedia.org/wiki/Buddha%27s_hand


In [6]:
#change type for merge
types_df['type_ids'] = types_df['type_ids'].astype('str')

In [7]:
#this file is huge, with data for the entire world
locations_df = pd.read_csv('Resources/locations.csv', low_memory=False)

In [8]:
#select necessary columns
locations_df = locations_df[['id', 'type_ids', 'lat', 'lng']]
#get latitude and longitude
locations_df = locations_df.rename(columns={'lng': 'lon'})
# Drop na values
locations_df.dropna()
locations_df.head()

Unnamed: 0,id,type_ids,lat,lon
0,22,3,37.409849,-122.137529
1,23,8,37.412087,-122.140182
2,24,4,37.412043,-122.1397
3,25,3,37.411562,-122.139288
4,26,4,37.411252,-122.138862


In [9]:
# Initial limit of Austin, TX dataset: NW 30.308660, -97.756688 and SE 30.261570, -97.736784
# Limits of final dataset should include all of Austin: NW 30.529060, -97.788274 and SE 30.030380, -97.668365

In [10]:
# Latitude filter
tst_locations_df = locations_df[locations_df['lat'].between(30.030380, 30.529060, inclusive=True)]

# Longitude filter
tst_locations_df = tst_locations_df[locations_df['lon'].between(-97.788274, -97.668365, inclusive=True)]

  """


In [11]:
tst_locations_df.count()

id          23192
type_ids    23191
lat         23192
lon         23192
dtype: int64

In [12]:
test_df = tst_locations_df.merge(types_df, how='inner', on='type_ids')
del test_df['id']
del test_df['type_ids']
del test_df['parent_id']
del test_df['en_wikipedia_url']
test_df['en_name']
test_df.set_index('scientific_name', inplace=True)

In [13]:
test_df = test_df[['en_name', 'lat', 'lon']]
test_df.head()

Unnamed: 0_level_0,en_name,lat,lon
scientific_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sapindus,Soapberry,30.228193,-97.757248
Allium,Onion,30.327974,-97.758644
Allium,Onion,30.249392,-97.713647
Allium,Onion,30.446912,-97.757219
Allium,Onion,30.288013,-97.763354


In [14]:
pecandf = test_df.loc[test_df['en_name'] == 'Pecan']
pecandf.head()

Unnamed: 0_level_0,en_name,lat,lon
scientific_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Carya illinoinensis,Pecan,30.224958,-97.751884
Carya illinoinensis,Pecan,30.242811,-97.749863
Carya illinoinensis,Pecan,30.257641,-97.75325
Carya illinoinensis,Pecan,30.264034,-97.696658
Carya illinoinensis,Pecan,30.258724,-97.705455


In [15]:
nonpecandf = test_df.loc[test_df['en_name'] != 'Pecan']
nonpecandf.head()

Unnamed: 0_level_0,en_name,lat,lon
scientific_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sapindus,Soapberry,30.228193,-97.757248
Allium,Onion,30.327974,-97.758644
Allium,Onion,30.249392,-97.713647
Allium,Onion,30.446912,-97.757219
Allium,Onion,30.288013,-97.763354


In [16]:
test_df.to_csv('fulldataset.csv')
pecandf.to_csv('pecandataset.csv')
nonpecandf.to_csv('nonpecandataset.csv')

In [77]:
import math

def haversine(coord1,coord2,coord3,coord4):
    
        lon1,lat1=coord1,coord2
        lon2,lat2=coord3,coord4

        R=6371000                               # radius of Earth in meters
        phi_1=math.radians(lat1)
        phi_2=math.radians(lat2)

        delta_phi=math.radians(lat2-lat1)
        delta_lambda=math.radians(lon2-lon1)

        a=math.sin(delta_phi/2.0)**2+\
           math.cos(phi_1)*math.cos(phi_2)*\
           math.sin(delta_lambda/2.0)**2
        c=2*math.atan2(math.sqrt(a),math.sqrt(1-a))

        meters=R*c                         # output distance in meters
        km=meters/1000.0              # output distance in kilometers
        miles=meters*0.000621371      # output distance in miles
        feet=miles*5280               # output distance in feet
        
        return feet

In [80]:
import numpy as np

In [82]:
lon_1 = -97.788274
lat_1 = 30.529060
lon_2 = -97.668365
lat_2 = 30.030380

vect_len = 10 ** 6

In [87]:
LON_1 = np.array([lon_1 for _ in range(vect_len)])
LAT_1 = np.array([lat_1 for _ in range(vect_len)])

LON_2 = np.array([lon_2 for _ in range(vect_len)])
LAT_2 = np.array([lat_2 for _ in range(vect_len)])

In [90]:
%%timeit
haversine_v = np.vectorize(haversine)

3.47 µs ± 174 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [89]:

haversine_v(LON_1, LAT_1, LON_2, LAT_2)

array([185805.44461884, 185805.44461884, 185805.44461884, ...,
       185805.44461884, 185805.44461884, 185805.44461884])

In [86]:
LONG_1.shape

(1000000,)

In [78]:
%%timeit

for i in range(0, 1_000_000):
    haversine(-97.788274,30.529060,-97.668365,30.030380)

3.88 s ± 338 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
result = []
for index, row in df.iterrows():
    result.append(row['date'])
    result.append(calc_funct(row['x'], row['y'], row['z']))
print result