In [1]:
import pandas as pd
import numpy as np

In [2]:
#Falling Fruit Data from:
#https://www.fallingfruit.org/data?c=forager%2Cfreegan&locale=en
#used BZ2 to ZIP converter https://cloudconvert.com/bz2-to-zip

In [3]:
from google.cloud import storage

In [4]:
client = storage.Client(project='juglone-resistance')

In [5]:
buckets = client.list_buckets()

In [6]:
bucket_name = "juglone_resistance"

In [7]:
print("Buckets in {}:".format(client.project))
for item in buckets:
    print("\t" + item.name)

Buckets in juglone-resistance:
	dummydata_ufk
	juglone_resistance


In [8]:
bucket = client.get_bucket(bucket_name)

In [9]:
blobs = bucket.list_blobs()

print("Blobs in {}:".format(bucket.name))
for item in blobs:
    print("\t" + item.name)

Blobs in juglone_resistance:
	locations.csv.bz2
	machinelearning_data.csv
	types.csv.bz2


In [10]:
types = "types.csv.bz2"
locations = "locations.csv.bz2"

In [11]:
types_blob = bucket.get_blob(types)
locations_blob = bucket.get_blob(locations)

In [12]:
#import types data
types_df = pd.read_csv("gs://juglone_resistance/types.csv.bz2", compression='bz2')

In [13]:
#there are dumpsters listed on there and we don't want those
types_df = types_df[types_df['en_name'] != 'Dumpster (edible)']

In [14]:
##select fewer columns
types_df = types_df[['id', 'parent_id', 'scientific_name', 'en_name', 'en_wikipedia_url']]
types_df = types_df.rename(columns={'id': 'type_ids'})
types_df['scientific_name'] = types_df['scientific_name'].str.split(' ', expand=True)[0]
types_df.dropna()

Unnamed: 0,type_ids,parent_id,scientific_name,en_name,en_wikipedia_url
0,1,285.0,Prunus,Plum,http://en.wikipedia.org/wiki/Plum
2,3,263.0,Citrus,Orange,http://en.wikipedia.org/wiki/Citrus_sinensis
3,4,263.0,Citrus,Lemon,http://en.wikipedia.org/wiki/Citrus_limon
4,5,263.0,Citrus,Grapefruit,http://en.wikipedia.org/wiki/Citrus_paradisi
9,11,263.0,Citrus,Pomelo,http://en.wikipedia.org/wiki/Citrus_maxima
12,14,114.0,Malus,Apple,http://en.wikipedia.org/wiki/Malus_domestica
17,19,263.0,Citrus,Kumquat,http://en.wikipedia.org/wiki/Citrus_japonica
18,20,445.0,Ficus,Common fig,http://en.wikipedia.org/wiki/Ficus_carica
19,23,263.0,Citrus,Mandarin,http://en.wikipedia.org/wiki/Mandarin_orange
21,25,1387.0,Citrus,Buddha's hand,http://en.wikipedia.org/wiki/Buddha%27s_hand


In [15]:
#change type for merge
types_df['type_ids'] = types_df['type_ids'].astype('str')

In [16]:
#this file is huge, with data for the entire world
locations_df = pd.read_csv('gs://juglone_resistance/locations.csv.bz2', low_memory=False)

In [17]:
#select necessary columns
locations_df = locations_df[['id', 'type_ids', 'lat', 'lng']]
#get latitude and longitude
locations_df = locations_df.rename(columns={'lng': 'lon'})
# Drop na values
locations_df.dropna()
locations_df.head()

Unnamed: 0,id,type_ids,lat,lon
0,22,3,37.409849,-122.137529
1,23,8,37.412087,-122.140182
2,24,4,37.412043,-122.1397
3,25,3,37.411562,-122.139288
4,26,4,37.411252,-122.138862


In [18]:
# Initial limit of Austin, TX dataset: NW 30.308660, -97.756688 and SE 30.261570, -97.736784
# Limits of final dataset should include all of Austin: NW 30.529060, -97.788274 and SE 30.030380, -97.668365

In [19]:
# Latitude filter
tst_locations_df = locations_df[locations_df['lat'].between(30.030380, 30.529060, inclusive=True)]

# Longitude filter
tst_locations_df = tst_locations_df[locations_df['lon'].between(-97.788274, -97.668365, inclusive=True)]

  """


In [20]:
# move rows with csvs into a new df
tst_locations_df.dropna(inplace=True)
csv_df = tst_locations_df[tst_locations_df['type_ids'].str.contains(',')]
csv_df

Unnamed: 0,id,type_ids,lat,lon
2306,2728,"13, 212, 443, 12",30.228815,-97.755035
2308,2730,"213, 92, 152",30.331831,-97.76088
2310,2732,"214, 213",30.274124,-97.771278
2314,2736,"78, 93, 13, 10",30.277281,-97.771606
2319,2741,"443, 443",30.243017,-97.749779
521557,568336,"689, 50",30.268288,-97.72724
583059,665933,"96, 97, 173",30.401282,-97.705348
583504,666389,"93, 229, 149, 588, 16, 426",30.238282,-97.703155
584597,667553,"943, 938",30.27419,-97.750432
1431348,1756614,"18, 629",30.229653,-97.7847


In [21]:
#exploding csvs
split_df = pd.DataFrame(csv_df.type_ids.str.split(',').tolist(), index=csv_df.id).stack()
split_df = split_df.reset_index()[[0,'id']]
split_df.columns = ['type_ids', 'id']
split_df

Unnamed: 0,type_ids,id
0,13,2728
1,212,2728
2,443,2728
3,12,2728
4,213,2730
5,92,2730
6,152,2730
7,214,2732
8,213,2732
9,78,2736


In [22]:
merge_df = csv_df.merge(split_df, on='id')
append_df = merge_df[['id','type_ids_y', 'lat','lon']]
append_df = append_df.rename(columns={'type_ids_y' : 'type_ids'})

In [23]:
tst_locations_df.append(append_df)

Unnamed: 0,id,type_ids,lat,lon
2306,2728,"13, 212, 443, 12",30.228815,-97.755035
2307,2729,212,30.228193,-97.757248
2308,2730,"213, 92, 152",30.331831,-97.760880
2309,2731,152,30.327974,-97.758644
2310,2732,"214, 213",30.274124,-97.771278
2311,2733,188,30.274878,-97.770798
2312,2734,20,30.308270,-97.749771
2313,2735,213,30.268471,-97.751396
2314,2736,"78, 93, 13, 10",30.277281,-97.771606
2315,2737,10,30.225979,-97.755310


In [24]:
test_df = tst_locations_df.merge(types_df, how='inner', on='type_ids')
test_df.head()

Unnamed: 0,id,type_ids,lat,lon,parent_id,scientific_name,en_name,en_wikipedia_url
0,2729,212,30.228193,-97.757248,,Sapindus,Soapberry,http://en.wikipedia.org/wiki/Sapindus
1,2731,152,30.327974,-97.758644,,Allium,Onion,http://en.wikipedia.org/wiki/Allium
2,594393,152,30.249392,-97.713647,,Allium,Onion,http://en.wikipedia.org/wiki/Allium
3,766028,152,30.446912,-97.757219,,Allium,Onion,http://en.wikipedia.org/wiki/Allium
4,1063498,152,30.288013,-97.763354,,Allium,Onion,http://en.wikipedia.org/wiki/Allium


In [25]:
del test_df['id']
del test_df['type_ids']
del test_df['parent_id']
del test_df['en_wikipedia_url']
test_df.head()

Unnamed: 0,lat,lon,scientific_name,en_name
0,30.228193,-97.757248,Sapindus,Soapberry
1,30.327974,-97.758644,Allium,Onion
2,30.249392,-97.713647,Allium,Onion
3,30.446912,-97.757219,Allium,Onion
4,30.288013,-97.763354,Allium,Onion


In [26]:
test_df['scientific_name'].replace('', np.nan, inplace=True)
test_df.dropna(subset=['scientific_name'], inplace=True)
test_df.dropna()
test_df.count()

lat                23171
lon                23171
scientific_name    23171
en_name            23171
dtype: int64

In [27]:
test_df = test_df.rename(columns={'scientific_name': 's_name'})
test_df.head()
#test_df['en_name']
#test_df.set_index('scientific_name', inplace=True)

Unnamed: 0,lat,lon,s_name,en_name
0,30.228193,-97.757248,Sapindus,Soapberry
1,30.327974,-97.758644,Allium,Onion
2,30.249392,-97.713647,Allium,Onion
3,30.446912,-97.757219,Allium,Onion
4,30.288013,-97.763354,Allium,Onion


In [28]:
test_df = test_df[['s_name','en_name', 'lon', 'lat']]
test_df.head()

Unnamed: 0,s_name,en_name,lon,lat
0,Sapindus,Soapberry,-97.757248,30.228193
1,Allium,Onion,-97.758644,30.327974
2,Allium,Onion,-97.713647,30.249392
3,Allium,Onion,-97.757219,30.446912
4,Allium,Onion,-97.763354,30.288013


In [29]:
#nonutdf = test_df.loc[(test_df['en_name'] != 'Pecan')] #& (test_df['en_name'] != 'Walnut') & (test_df['en_name'] != 'Hickory')]
#nonutdf.head()
nonutdf = test_df[~test_df['s_name'].str.startswith("Juglan", na=False) & ~test_df['s_name'].str.startswith("Carya", na=False)]
nonutdf.count()

s_name     20288
en_name    20288
lon        20288
lat        20288
dtype: int64

In [30]:
nutdf = test_df[test_df['s_name'].str.startswith("Juglan", na=False) | test_df['s_name'].str.startswith("Carya", na=False)]
#nutdf.head()
nutdf.count()
#nutdf.scientific_name.unique()

s_name     2883
en_name    2883
lon        2883
lat        2883
dtype: int64

In [31]:
test_dfi = test_df
test_dfi.head()

Unnamed: 0,s_name,en_name,lon,lat
0,Sapindus,Soapberry,-97.757248,30.228193
1,Allium,Onion,-97.758644,30.327974
2,Allium,Onion,-97.713647,30.249392
3,Allium,Onion,-97.757219,30.446912
4,Allium,Onion,-97.763354,30.288013


In [32]:
# # test_df.to_csv('fulldataset.csv')
# nutdf.to_csv('nutdataset.csv')
# nonutdf.to_csv('nonutdataset.csv')

In [33]:
import math

def haversine(coord1,coord2,coord3,coord4):
    
        lon1,lat1=coord1,coord2
        lon2,lat2=coord3,coord4

        R=6371000                               # radius of Earth in meters
        phi_1=math.radians(lat1)
        phi_2=math.radians(lat2)

        delta_phi=math.radians(lat2-lat1)
        delta_lambda=math.radians(lon2-lon1)

        a=math.sin(delta_phi/2.0)**2+\
        math.cos(phi_1)*math.cos(phi_2)*\
        math.sin(delta_lambda/2.0)**2
        c=2*math.atan2(math.sqrt(a),math.sqrt(1-a))

        meters=R*c                    # output distance in meters
        km=meters/1000.0              # output distance in kilometers
        miles=meters*0.000621371      # output distance in miles
        feet=miles*5280               # output distance in feet
        
        return feet

In [34]:
lon_1 = -97.788274
lat_1 = 30.529060
lon_2 = -97.668365
lat_2 = 30.030380

vect_len = 10 ** 6

In [35]:
# LON_1 = np.array([lon_1 for _ in range(vect_len)])
# LAT_1 = np.array([lat_1 for _ in range(vect_len)])

# LON_2 = np.array([lon_2 for _ in range(vect_len)])
# LAT_2 = np.array([lat_2 for _ in range(vect_len)])

In [36]:
# %%timeit
haversine_v = np.vectorize(haversine)

In [37]:
# %%timeit
# haversine_v(LON_1, LAT_1, LON_2, LAT_2)

In [38]:
def cartesian_product_basic(left, right):
    return (
       nutdf.assign(key=1).merge(nonutdf.assign(key=1), on='key').drop('key', 1))

cjdf = cartesian_product_basic(nutdf, nonutdf)
cjdf.head()
#cartesian_product_basic(pecandf, nonpecandf)

Unnamed: 0,s_name_x,en_name_x,lon_x,lat_x,s_name_y,en_name_y,lon_y,lat_y
0,Carya,Pecan,-97.751884,30.224958,Sapindus,Soapberry,-97.757248,30.228193
1,Carya,Pecan,-97.751884,30.224958,Allium,Onion,-97.758644,30.327974
2,Carya,Pecan,-97.751884,30.224958,Allium,Onion,-97.713647,30.249392
3,Carya,Pecan,-97.751884,30.224958,Allium,Onion,-97.757219,30.446912
4,Carya,Pecan,-97.751884,30.224958,Allium,Onion,-97.763354,30.288013


In [39]:
cjdf[['lon_x','lat_x','lon_y','lat_y']].head()

Unnamed: 0,lon_x,lat_x,lon_y,lat_y
0,-97.751884,30.224958,-97.757248,30.228193
1,-97.751884,30.224958,-97.758644,30.327974
2,-97.751884,30.224958,-97.713647,30.249392
3,-97.751884,30.224958,-97.757219,30.446912
4,-97.751884,30.224958,-97.763354,30.288013


In [40]:
cjdf.count()

s_name_x     58490304
en_name_x    58490304
lon_x        58490304
lat_x        58490304
s_name_y     58490304
en_name_y    58490304
lon_y        58490304
lat_y        58490304
dtype: int64

In [41]:
LON_1 = np.array([row for row in cjdf['lon_x']])
LAT_1 = np.array([row for row in cjdf['lat_x']])

LON_2 = np.array([row for row in cjdf['lon_y']])
LAT_2 = np.array([row for row in cjdf['lat_y']])

In [42]:
cjdf['distance_ft'] = haversine_v(LON_1, LAT_1, LON_2, LAT_2)

In [43]:
cjdf['distance_ft'].describe()

count    5.849030e+07
mean     2.036391e+04
std      1.240621e+04
min      4.049848e-01
25%      1.127194e+04
50%      1.795345e+04
75%      2.719270e+04
max      1.374621e+05
Name: distance_ft, dtype: float64

In [44]:
cjdf['resistance'] = [ 1 if row <= 50.00 else 0 for row in cjdf['distance_ft'] ]

In [45]:
cjdf.head()

Unnamed: 0,s_name_x,en_name_x,lon_x,lat_x,s_name_y,en_name_y,lon_y,lat_y,distance_ft,resistance
0,Carya,Pecan,-97.751884,30.224958,Sapindus,Soapberry,-97.757248,30.228193,2061.944665,0
1,Carya,Pecan,-97.751884,30.224958,Allium,Onion,-97.758644,30.327974,37641.837495,0
2,Carya,Pecan,-97.751884,30.224958,Allium,Onion,-97.713647,30.249392,14989.840658,0
3,Carya,Pecan,-97.751884,30.224958,Allium,Onion,-97.757219,30.446912,80988.939396,0
4,Carya,Pecan,-97.751884,30.224958,Allium,Onion,-97.763354,30.288013,23285.587464,0


In [46]:
mldf = cjdf[['s_name_x', 's_name_y', 'distance_ft', 'resistance']]

In [47]:
mldf=mldf.rename(columns={'s_name_x': 'juglandacae', 's_name_y': 's_name'})

In [48]:
mldf.head()

Unnamed: 0,juglandacae,s_name,distance_ft,resistance
0,Carya,Sapindus,2061.944665,0
1,Carya,Allium,37641.837495,0
2,Carya,Allium,14989.840658,0
3,Carya,Allium,80988.939396,0
4,Carya,Allium,23285.587464,0


In [49]:
# mldf.to_csv('machinelearning_data.csv')

In [50]:
# tableau_df = cjdf[cjdf['distance_ft'] <= 50]

In [51]:
# tableau_df.to_csv('tableau_data.csv')

In [52]:
# # take a subset of dataset to use here
# walnut_df = mldf[mldf['juglandacae'] == 'Juglans']

In [53]:
# walnut_df.head()

In [54]:
# #how big is my dataset
# len(walnut_df['juglandacae'])

In [55]:
# # make X and Y for machine learning
# X = walnut_df.drop(['resistance', 'juglandacae'], axis=1)
# Y = walnut_df['resistance']

In [56]:
# from sklearn.model_selection import train_test_split

# # create a test-train split
# X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2, random_state=13, stratify=Y)

In [57]:
# setup pipeline for linear regression cv
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegressionCV

In [58]:
# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['distance_ft']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['s_name']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [59]:
# # Append classifier to preprocessing pipeline.
# # Now we have a full prediction pipeline.
# wlrcv = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', LogisticRegressionCV())])

In [60]:
# wlrcv.fit(X_train, y_train)

In [61]:
# wlrcv.score(X_train, y_train)

In [62]:
# wlrcv.score(X_test, y_test)

In [63]:
# wlrcv.fit(X,Y)

In [64]:
import pickle

In [65]:
# #assign filename and open pkl
# filename = 'walnut_lrcv.pkl'
# model_pkl = open(filename, 'wb')
# #export model on local
# pickle.dump(wlrcv, model_pkl)
# model_pkl.close()

In [66]:
# take a subset of dataset to use here
pecan_df = mldf

In [67]:
pecan_df.head()

Unnamed: 0,juglandacae,s_name,distance_ft,resistance
0,Carya,Sapindus,2061.944665,0
1,Carya,Allium,37641.837495,0
2,Carya,Allium,14989.840658,0
3,Carya,Allium,80988.939396,0
4,Carya,Allium,23285.587464,0


In [68]:
#how big is my dataset
len(pecan_df['juglandacae'])

58490304

In [69]:
predict_types = pecan_df['s_name'].unique()

In [70]:
# make X and Y for machine learning
X = pecan_df[['s_name', 'distance_ft']]
Y = pecan_df['resistance']

In [71]:
from sklearn.model_selection import train_test_split

# create a test-train split
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2, random_state=13, stratify=Y)

In [72]:
# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['distance_ft']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['s_name']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [73]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
from sklearn.linear_model import LogisticRegression
plrcv = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

In [74]:
plrcv.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [75]:
plrcv.score(X_train, y_train)

0.9999173153550258

In [76]:
plrcv.score(X_test, y_test)

0.999917336727856

In [77]:
plrcv.fit(X, Y)



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [78]:
#assign filename and open pkl
filename = 'pecan_lr.pkl'
model_pkl = open(filename, 'wb')
#export model on local
pickle.dump(plrcv, model_pkl)
model_pkl.close()

In [88]:
pecan_model = pickle.load(open('pecan_lr.pkl', 'rb'))

In [81]:
data = {'s_name': predict_types, 'distance_ft' : 45}

In [83]:
predict_df = pd.DataFrame(data)

In [84]:
predict_df['survival'] = plrcv.predict(predict_df[['s_name', 'distance_ft']])

In [96]:
predict_df.to_csv('predict_df.csv', index=False)