In [2]:
!pip install category_encoders==2.*



In [3]:
import numpy as np
import requests
import json
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
#import os
#from dotenv import load_dotenv
#load_dotenv()
#API_KEY = os.getenv("API_KEY")

API_KEY = 'uEH5NtA'

request_url = f"http://strainapi.evanbusse.com/{API_KEY}/strains/search/all"
response = requests.get(request_url)
data = json.loads(response.text)

print(request_url)
print(type(response)) #> <class 'requests.models.Response'>
print(response.status_code) #> 200
print(type(response.text)) #> <class 'str'>
print(type(data)) #> <class 'dict'>

http://strainapi.evanbusse.com/uEH5NtA/strains/search/all
<class 'requests.models.Response'>
200
<class 'str'>
<class 'dict'>


In [4]:
# Get all strains
strain_names = pd.DataFrame(list(data.keys()), columns =['name'])

# Get strain characteristics
strain_values = list(data.values())
strain_values = pd.DataFrame.from_dict(strain_values)

# Add strain name to the dataframe
strains_raw = pd.concat([strain_names, strain_values], axis=1)

# Take a look at the data
strains_raw.head()

Unnamed: 0,name,id,race,flavors,effects
0,Afpak,1,hybrid,"[Earthy, Chemical, Pine]","{'positive': ['Relaxed', 'Hungry', 'Happy', 'S..."
1,African,2,sativa,"[Spicy/Herbal, Pungent, Earthy]","{'positive': ['Euphoric', 'Happy', 'Creative',..."
2,Afternoon Delight,3,hybrid,"[Pepper, Flowery, Pine]","{'positive': ['Relaxed', 'Hungry', 'Euphoric',..."
3,Afwreck,4,hybrid,"[Pine, Earthy, Flowery]","{'positive': ['Relaxed', 'Happy', 'Creative', ..."
4,Agent Orange,5,hybrid,"[Citrus, Orange, Sweet]","{'positive': ['Relaxed', 'Euphoric', 'Happy', ..."


In [5]:
# Get a line for each flavor
strains = strains_raw.flavors.apply(pd.Series) \
    .merge(strains_raw, right_index = True, left_index = True) \
    .drop(["flavors"], axis = 1) \
    .melt(id_vars = ['name', 'id', 'race', 'effects'], value_name = "flavor") \
    .drop("variable", axis = 1) \
    .dropna()

# Get a line for each type of effect
strains = strains.effects.apply(pd.Series) \
    .merge(strains, right_index = True, left_index = True) \
    .drop(["effects"], axis = 1)

# Get a line for each positive effect
strains = strains.positive.apply(pd.Series) \
    .merge(strains, right_index = True, left_index = True) \
    .drop(["positive"], axis = 1) \
    .melt(id_vars = ['name', 'id', 'race', 'flavor', 'negative', 'medical'], value_name = "positive_effect") \
    .drop("variable", axis = 1) \
    .dropna()

# Get a line for each negative effect
strains = strains.negative.apply(pd.Series) \
    .merge(strains, right_index = True, left_index = True) \
    .drop(["negative"], axis = 1) \
    .melt(id_vars = ['name', 'id', 'race', 'flavor', 'positive_effect', 'medical'], value_name = "negative_effect") \
    .drop("variable", axis = 1) \
    .dropna()

# Get a line for each medical effect
strains = strains.medical.apply(pd.Series) \
    .merge(strains, right_index = True, left_index = True) \
    .drop(["medical"], axis = 1) \
    .melt(id_vars = ['name', 'id', 'race', 'flavor', 'positive_effect', 'negative_effect'], \
          value_name = "medical_effect") \
    .drop("variable", axis = 1) \
    .dropna() \
    .sort_values(['id', 'flavor', 'positive_effect', 'negative_effect', 'medical_effect'], \
                 ascending = (1,1,1,1,1))

strains.head()

Unnamed: 0,name,id,race,flavor,positive_effect,negative_effect,medical_effect
10912,Afpak,1,hybrid,Chemical,Happy,Dizzy,Depression
77176,Afpak,1,hybrid,Chemical,Happy,Dizzy,Insomnia
275968,Afpak,1,hybrid,Chemical,Happy,Dizzy,Lack of Appetite
143440,Afpak,1,hybrid,Chemical,Happy,Dizzy,Pain
209704,Afpak,1,hybrid,Chemical,Happy,Dizzy,Stress


In [6]:
# OneHot Encoding
# Drop the strain names, so they won't be included in Encoder
X = strains.drop('name', axis=1)
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)
X_dict = X.to_dict(orient='records') # turn each row as key-value pairs

# instantiate a Dictvectorizer object for X
X_dv = DictVectorizer(sparse=False)
# sparse = False makes the output is not a sparse matrix

# apply strains_dv on strains_dict
X_encoded = X_dv.fit_transform(X_dict)

# vocabulary
vocab = X_dv.vocabulary_

# Get dummies
X = pd.get_dummies(X, prefix_sep='_')
# X head
X.head()

Unnamed: 0,id,race_hybrid,race_indica,race_sativa,flavor_Ammonia,flavor_Apple,flavor_Apricot,flavor_Berry,flavor_Blue Cheese,flavor_Blueberry,...,medical_effect_Headaches,medical_effect_Inflammation,medical_effect_Insomnia,medical_effect_Lack of Appetite,medical_effect_Muscle Spasms,medical_effect_Nausea,medical_effect_Pain,medical_effect_Seizures,medical_effect_Spasticity,medical_effect_Stress
10912,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77176,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
275968,1,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
143440,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
209704,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Get only one strain per line
unique_ids = X['id'].unique()
cols = X.columns[1:]
df_sum = []
for id in unique_ids:
    condition = X['id']==id
    data = X[condition]
    for col in cols:
        data[col] = data[col].replace({0:np.nan})
    df = data.fillna(method='ffill')[-1:].fillna(0)
    df_sum.append(df.values.tolist())
df=pd.DataFrame(df_sum)
df = df[0].apply(pd.Series)
column_names = X.columns
df.columns=column_names

print(df.shape)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


(1557, 85)


Unnamed: 0,id,race_hybrid,race_indica,race_sativa,flavor_Ammonia,flavor_Apple,flavor_Apricot,flavor_Berry,flavor_Blue Cheese,flavor_Blueberry,...,medical_effect_Headaches,medical_effect_Inflammation,medical_effect_Insomnia,medical_effect_Lack of Appetite,medical_effect_Muscle Spasms,medical_effect_Nausea,medical_effect_Pain,medical_effect_Seizures,medical_effect_Spasticity,medical_effect_Stress
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
2,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


In [8]:
# Add strain name and rearrange columns
id_names = strains_raw[['id', 'name']]
df = df.merge(id_names, on='id')
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]
df.head(3)

Unnamed: 0,name,id,race_hybrid,race_indica,race_sativa,flavor_Ammonia,flavor_Apple,flavor_Apricot,flavor_Berry,flavor_Blue Cheese,...,medical_effect_Headaches,medical_effect_Inflammation,medical_effect_Insomnia,medical_effect_Lack of Appetite,medical_effect_Muscle Spasms,medical_effect_Nausea,medical_effect_Pain,medical_effect_Seizures,medical_effect_Spasticity,medical_effect_Stress
0,Afpak,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,African,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
2,Afternoon Delight,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [9]:
# Save to csv
df.to_csv('strains_db.csv', sep=',', index=False)

In [10]:
df.head()

Unnamed: 0,name,id,race_hybrid,race_indica,race_sativa,flavor_Ammonia,flavor_Apple,flavor_Apricot,flavor_Berry,flavor_Blue Cheese,...,medical_effect_Headaches,medical_effect_Inflammation,medical_effect_Insomnia,medical_effect_Lack of Appetite,medical_effect_Muscle Spasms,medical_effect_Nausea,medical_effect_Pain,medical_effect_Seizures,medical_effect_Spasticity,medical_effect_Stress
0,Afpak,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,African,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
2,Afternoon Delight,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,Afwreck,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,Agent Orange,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


In [11]:
import pandas as pd
import numpy
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import wandb
from wandb.keras import WandbCallback
from category_encoders import TargetEncoder
from category_encoders import OrdinalEncoder
from tensorflow import keras

In [16]:
target = df['name']
features = df.drop(columns = 'name')

In [17]:
X_train = features
y_train = target

In [18]:
X_train.shape, y_train.shape

((1557, 85), (1557,))