In [1]:
import pandas as pd
import numpy as np

In [2]:
#Falling Fruit Data from:
#https://www.fallingfruit.org/data?c=forager%2Cfreegan&locale=en
#used BZ2 to ZIP converter https://cloudconvert.com/bz2-to-zip

In [3]:
#import types data
types_df = pd.read_csv('Resources/types.csv')

In [4]:
#there are dumpsters listed on there and we don't want those
types_df = types_df[types_df['en_name'] != 'Dumpster (edible)']

In [5]:
##select fewer columns
types_df = types_df[['id', 'parent_id', 'scientific_name', 'en_name', 'en_wikipedia_url']]
types_df = types_df.rename(columns={'id': 'type_ids'})
types_df.dropna()

Unnamed: 0,type_ids,parent_id,scientific_name,en_name,en_wikipedia_url
0,1,285.0,Prunus,Plum,http://en.wikipedia.org/wiki/Plum
2,3,263.0,Citrus x sinensis,Orange,http://en.wikipedia.org/wiki/Citrus_sinensis
3,4,263.0,Citrus x limon,Lemon,http://en.wikipedia.org/wiki/Citrus_limon
4,5,263.0,Citrus x paradisi,Grapefruit,http://en.wikipedia.org/wiki/Citrus_paradisi
9,11,263.0,Citrus maxima,Pomelo,http://en.wikipedia.org/wiki/Citrus_maxima
12,14,114.0,Malus pumila,Apple,http://en.wikipedia.org/wiki/Malus_domestica
17,19,263.0,Citrus japonica,Kumquat,http://en.wikipedia.org/wiki/Citrus_japonica
18,20,445.0,Ficus carica,Common fig,http://en.wikipedia.org/wiki/Ficus_carica
19,23,263.0,Citrus reticulata,Mandarin,http://en.wikipedia.org/wiki/Mandarin_orange
21,25,1387.0,Citrus medica var. sarcodactyli,Buddha's hand,http://en.wikipedia.org/wiki/Buddha%27s_hand


In [6]:
#change type for merge
types_df['type_ids'] = types_df['type_ids'].astype('str')

In [7]:
#this file is huge, with data for the entire world
locations_df = pd.read_csv('Resources/locations.csv', low_memory=False)

In [8]:
#select necessary columns
locations_df = locations_df[['id', 'type_ids', 'lat', 'lng']]
#get latitude and longitude
locations_df = locations_df.rename(columns={'lng': 'lon'})
# Drop na values
locations_df.dropna()
locations_df.head()

Unnamed: 0,id,type_ids,lat,lon
0,22,3,37.409849,-122.137529
1,23,8,37.412087,-122.140182
2,24,4,37.412043,-122.1397
3,25,3,37.411562,-122.139288
4,26,4,37.411252,-122.138862


In [9]:
# Initial limit of Austin, TX dataset: NW 30.308660, -97.756688 and SE 30.261570, -97.736784
# Limits of final dataset should include all of Austin: NW 30.529060, -97.788274 and SE 30.030380, -97.668365

In [10]:
# Latitude filter
tst_locations_df = locations_df[locations_df['lat'].between(30.030380, 30.529060, inclusive=True)]

# Longitude filter
tst_locations_df = tst_locations_df[locations_df['lon'].between(-97.788274, -97.668365, inclusive=True)]

  """


In [11]:
tst_locations_df.head()

Unnamed: 0,id,type_ids,lat,lon
2306,2728,"13, 212, 443, 12",30.228815,-97.755035
2307,2729,212,30.228193,-97.757248
2308,2730,"213, 92, 152",30.331831,-97.76088
2309,2731,152,30.327974,-97.758644
2310,2732,"214, 213",30.274124,-97.771278


In [12]:
test_df = tst_locations_df.merge(types_df, how='inner', on='type_ids')
test_df.head()

Unnamed: 0,id,type_ids,lat,lon,parent_id,scientific_name,en_name,en_wikipedia_url
0,2729,212,30.228193,-97.757248,,Sapindus,Soapberry,http://en.wikipedia.org/wiki/Sapindus
1,2731,152,30.327974,-97.758644,,Allium,Onion,http://en.wikipedia.org/wiki/Allium
2,594393,152,30.249392,-97.713647,,Allium,Onion,http://en.wikipedia.org/wiki/Allium
3,766028,152,30.446912,-97.757219,,Allium,Onion,http://en.wikipedia.org/wiki/Allium
4,1063498,152,30.288013,-97.763354,,Allium,Onion,http://en.wikipedia.org/wiki/Allium


In [13]:
del test_df['id']
del test_df['type_ids']
del test_df['parent_id']
del test_df['en_wikipedia_url']
test_df.head()

Unnamed: 0,lat,lon,scientific_name,en_name
0,30.228193,-97.757248,Sapindus,Soapberry
1,30.327974,-97.758644,Allium,Onion
2,30.249392,-97.713647,Allium,Onion
3,30.446912,-97.757219,Allium,Onion
4,30.288013,-97.763354,Allium,Onion


In [14]:
test_df['scientific_name'].replace('', np.nan, inplace=True)
test_df.dropna(subset=['scientific_name'], inplace=True)
test_df.dropna()
test_df.count()

lat                23165
lon                23165
scientific_name    23165
en_name            23165
dtype: int64

In [15]:
test_df = test_df.rename(columns={'scientific_name': 's_name'})
test_df.head()
#test_df['en_name']
#test_df.set_index('scientific_name', inplace=True)

Unnamed: 0,lat,lon,s_name,en_name
0,30.228193,-97.757248,Sapindus,Soapberry
1,30.327974,-97.758644,Allium,Onion
2,30.249392,-97.713647,Allium,Onion
3,30.446912,-97.757219,Allium,Onion
4,30.288013,-97.763354,Allium,Onion


In [16]:
test_df = test_df[['s_name','en_name', 'lon', 'lat']]
test_df.head()

Unnamed: 0,s_name,en_name,lon,lat
0,Sapindus,Soapberry,-97.757248,30.228193
1,Allium,Onion,-97.758644,30.327974
2,Allium,Onion,-97.713647,30.249392
3,Allium,Onion,-97.757219,30.446912
4,Allium,Onion,-97.763354,30.288013


In [17]:
#nonutdf = test_df.loc[(test_df['en_name'] != 'Pecan')] #& (test_df['en_name'] != 'Walnut') & (test_df['en_name'] != 'Hickory')]
#nonutdf.head()
nonutdf = test_df[~test_df['s_name'].str.startswith("Juglan", na=False) & ~test_df['s_name'].str.startswith("Carya", na=False)]
nonutdf.count()

s_name     20282
en_name    20282
lon        20282
lat        20282
dtype: int64

In [18]:
nutdf = test_df[test_df['s_name'].str.startswith("Juglan", na=False) | test_df['s_name'].str.startswith("Carya", na=False)]
#nutdf.head()
nutdf.count()
#nutdf.scientific_name.unique()

s_name     2883
en_name    2883
lon        2883
lat        2883
dtype: int64

In [19]:
test_dfi = test_df
test_dfi.head()

Unnamed: 0,s_name,en_name,lon,lat
0,Sapindus,Soapberry,-97.757248,30.228193
1,Allium,Onion,-97.758644,30.327974
2,Allium,Onion,-97.713647,30.249392
3,Allium,Onion,-97.757219,30.446912
4,Allium,Onion,-97.763354,30.288013


In [20]:
test_df.to_csv('fulldataset.csv')
nutdf.to_csv('nutdataset.csv')
nonutdf.to_csv('nonutdataset.csv')

In [21]:
import math

def haversine(coord1,coord2,coord3,coord4):
    
        lon1,lat1=coord1,coord2
        lon2,lat2=coord3,coord4

        R=6371000                               # radius of Earth in meters
        phi_1=math.radians(lat1)
        phi_2=math.radians(lat2)

        delta_phi=math.radians(lat2-lat1)
        delta_lambda=math.radians(lon2-lon1)

        a=math.sin(delta_phi/2.0)**2+\
        math.cos(phi_1)*math.cos(phi_2)*\
        math.sin(delta_lambda/2.0)**2
        c=2*math.atan2(math.sqrt(a),math.sqrt(1-a))

        meters=R*c                    # output distance in meters
        km=meters/1000.0              # output distance in kilometers
        miles=meters*0.000621371      # output distance in miles
        feet=miles*5280               # output distance in feet
        
        return feet

In [22]:
# cached_distances = {} # key = Rowij, value = haversince distance

In [23]:
# function to calc distance between two records

In [24]:
# len(test_df)

In [25]:
# for index, row in test_df.iterrows():
    # complement_df = test_df.iloc[[i for i in range(len(test_df)) if i!=index]]
    # for index2, row2 in complement_df.iterrows():
        # print(index, index2)

In [26]:
lon_1 = -97.788274
lat_1 = 30.529060
lon_2 = -97.668365
lat_2 = 30.030380

vect_len = 10 ** 6

In [27]:
LON_1 = np.array([lon_1 for _ in range(vect_len)])
LAT_1 = np.array([lat_1 for _ in range(vect_len)])

LON_2 = np.array([lon_2 for _ in range(vect_len)])
LAT_2 = np.array([lat_2 for _ in range(vect_len)])

In [28]:
%%timeit
haversine_v = np.vectorize(haversine)

4.56 µs ± 757 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [29]:
def cartesian_product_basic(left, right):
    return (
       nutdf.assign(key=1).merge(nonutdf.assign(key=1), on='key').drop('key', 1))

cjdf = cartesian_product_basic(nutdf, nonutdf)
cjdf.head()
#cartesian_product_basic(pecandf, nonpecandf)

Unnamed: 0,s_name_x,en_name_x,lon_x,lat_x,s_name_y,en_name_y,lon_y,lat_y
0,Carya illinoinensis,Pecan,-97.751884,30.224958,Sapindus,Soapberry,-97.757248,30.228193
1,Carya illinoinensis,Pecan,-97.751884,30.224958,Allium,Onion,-97.758644,30.327974
2,Carya illinoinensis,Pecan,-97.751884,30.224958,Allium,Onion,-97.713647,30.249392
3,Carya illinoinensis,Pecan,-97.751884,30.224958,Allium,Onion,-97.757219,30.446912
4,Carya illinoinensis,Pecan,-97.751884,30.224958,Allium,Onion,-97.763354,30.288013


In [30]:
cjdf[['lon_x','lat_x','lon_y','lat_y']].head()

Unnamed: 0,lon_x,lat_x,lon_y,lat_y
0,-97.751884,30.224958,-97.757248,30.228193
1,-97.751884,30.224958,-97.758644,30.327974
2,-97.751884,30.224958,-97.713647,30.249392
3,-97.751884,30.224958,-97.757219,30.446912
4,-97.751884,30.224958,-97.763354,30.288013


In [31]:
cjdf.count()

s_name_x     58473006
en_name_x    58473006
lon_x        58473006
lat_x        58473006
s_name_y     58473006
en_name_y    58473006
lon_y        58473006
lat_y        58473006
dtype: int64

In [None]:
for index, row in cjdf.iterrows():
    lon_1 = cjdf['lon_x']
    lat_1 = cjdf['lat_x']
    lon_2 = cjdf['lon_y']
    lat_2 = cjdf['lat_y']
    haversine_v = haversine(lon_1,lat_1,lon_2,lat_2)
    #cjdf.append(haversine_v)
    print(haversine_v)

In [None]:
result = []
for index, row in cjdf.iterrows():
    lon_1 = cjdf['lon_x']
    lat_1 = cjdf['lat_x']
    lon_2 = cjdf['lon_y']
    lat_2 = cjdf['lon_x']
    haversine_v = np.vectorize(haversine)
    result.append(haversine_v)
print(result)

In [89]:
haversine_v(LON_1, LAT_1, LON_2, LAT_2)

array([185805.44461884, 185805.44461884, 185805.44461884, ...,
       185805.44461884, 185805.44461884, 185805.44461884])

In [86]:
LONG_1.shape

(1000000,)

In [78]:
%%timeit

for i in range(0, 1_000_000):
    haversine(-97.788274,30.529060,-97.668365,30.030380)

3.88 s ± 338 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
