# Explore Cannabis Data

In [1]:
# Standard Library Imports
import pickle

# Third-Party Imports
import spacy
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Download spaCy model
# !python -m spacy download en_core_web_md

In [3]:
# Load spaCy model
nlp = spacy.load('en_core_web_md')

### Import Data

In [4]:
## Read in data from local csv/
# Location if not present: https://www.kaggle.com/kingburrito666/cannabis-strains
df = pd.read_csv('cannabis.csv')
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


### Wrangle Data

In [5]:
def list_effects(data=df.Effects):
    """Aggregate all unique effects associated with Effects column of DataFrame.
    
    Args:
        data (pandas.Series): DataFrame column to break down. Default set to 'Effects'
    Returns:
        effects_list (list): list of all effects under the Effects column
    """
    effects_list = []
    
    # Split each list of effects and append each, lowered effect to list
    for i in range(0, len(df)):
        effects = data[i].split(",")
        for effect in effects:
            effects_list.append(effect.lower())
    
    return set(effects_list)

In [6]:
# Get unique list of all effects
all_effects = list_effects()
all_effects

{'aroused',
 'creative',
 'dry',
 'energetic',
 'euphoric',
 'focused',
 'giggly',
 'happy',
 'hungry',
 'mouth',
 'none',
 'relaxed',
 'sleepy',
 'talkative',
 'tingly',
 'uplifted'}

In [7]:
# Remove some unnecessary effects
bad_effects = ['dry', 'mouth', 'aroused', 'none']
for effect in bad_effects:
    all_effects.remove(effect)

In [8]:
# Capitalize each effect for future presentation
effects_list = []

for effect in all_effects:
    effects_list.append(effect.capitalize())

In [9]:
# Remove rows with NaN description values
df = df[~df['Description'].isnull()].reset_index()

In [10]:
# Remove rows with 'none' description values
df = df[~(df.Description == 'None')].reset_index()

In [11]:
df = df.drop(columns=['level_0', 'index'])

In [12]:
df

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."
...,...,...,...,...,...,...
2309,Zeus-Og,hybrid,4.7,"Happy,Uplifted,Relaxed,Euphoric,Energetic","Earthy,Woody,Pine",Zeus OG is a hybrid cross between Pineapple OG...
2310,Zkittlez,indica,4.6,"Relaxed,Happy,Euphoric,Uplifted,Sleepy","Sweet,Berry,Grape",Zkittlez is an indica-dominant mix of Grape Ap...
2311,Zombie-Kush,indica,5.0,"Relaxed,Sleepy,Talkative,Euphoric,Happy","Earthy,Sweet,Spicy/Herbal",Zombie Kush by Ripper Seeds comes from two dif...
2312,Zombie-Og,indica,4.4,"Relaxed,Sleepy,Euphoric,Happy,Hungry","Sweet,Earthy,Pungent",If you’re looking to transform into a flesh-ea...


In [13]:
df.to_csv('new_cannabis.csv')

### Natural Language Processing

In [14]:
def preprocessor(doc):
    """Preprocess input text data using spaCy functionality.

    Args:
        doc (list): List of input data to be processed
    Returns:
        new_text (str): New processed document
    """
    doc = nlp(doc)
    new_text = " ".join([token.lemma_.lower() for token in doc if not
                         token.is_stop and not token.is_punct])

    return new_text

# Apply preprocess to data and save to new column
df['Processed'] = df['Description'].apply(preprocessor)

In [15]:
# Instantiate vectorizer and fit it with processed data
vect = TfidfVectorizer(ngram_range=(1, 2))
vect.fit(df['Processed'])

# Transform processed data for fitting of nn model
dtm = vect.transform(df['Processed'])

In [16]:
# Instatiate model and fit it with vectorized data
nn = NearestNeighbors(algorithm='kd_tree', n_neighbors=50, n_jobs=-1)
nn.fit(dtm)



NearestNeighbors(algorithm='kd_tree', n_jobs=-1, n_neighbors=50)

In [17]:
# Create fake description to test model performance
my_desc = "I want something that tastes fruity and is very potent. I need something that will help with pain relief and help me feel calm."

# Process and vectorize fake description
my_desc = preprocessor(my_desc)
my_desc_vect = vect.transform([my_desc])

In [18]:
# Find nearest neighbors of fake description
dist, ind = nn.kneighbors(my_desc_vect)

In [19]:
# List out neighbor distances
dist

array([[1.32774571, 1.34132261, 1.34247405, 1.34402609, 1.34465308,
        1.34582559, 1.34852451, 1.35131796, 1.35575076, 1.36400471,
        1.36480953, 1.36595683, 1.3688508 , 1.37029087, 1.37180431,
        1.37248719, 1.37293696, 1.37312578, 1.37362751, 1.37373564,
        1.37460432, 1.37627892, 1.37635625, 1.37731254, 1.37788522,
        1.3783078 , 1.37890378, 1.37892422, 1.38002041, 1.38046787,
        1.380787  , 1.38086769, 1.38164519, 1.38180503, 1.38183064,
        1.38190182, 1.38335915, 1.38362468, 1.38370642, 1.38398492,
        1.3840598 , 1.38407784, 1.38429949, 1.38439867, 1.38457267,
        1.38482377, 1.38535195, 1.38597194, 1.38623102, 1.38624107]])

In [20]:
# List out neighbor indices
ind

array([[1011,   11, 1657,  986,  476,  172,  276, 1160, 1638, 1784,  669,
         161, 1521,  708, 2034, 1364, 1053, 1601,   76,  314, 1531,  972,
         867, 2304,   79,  472,  892, 1325,  192,  165, 2282, 1961,    0,
        1620, 2073, 1585, 2299, 1028,   46, 1849, 1492, 1202, 1801,  428,
         712,  921,  336,  817,  125, 1308]], dtype=int64)

In [23]:
# Compare model results with fake description
df["Description"][11]

'501st OG, bred by Rare Dankness, is an indica-dominant hybrid that crosses Skywalker OG with Rare Dankness #1. The colorful flowers are tinged with a wide spectrum of green, blue, red, and purple hues that give off a deep piney kush aroma that mixes with the sweetness of grape flavors. The potent indica effects make 501st OG a great nighttime strain to help with pain relief and trouble sleeping.'

In [24]:
## Pickle objects
pickle.dump(effects_list, open('../med-cabinet/static/data/effects_list.pkl', 'wb'))
pickle.dump(vect, open('../vectorizer.pkl', 'wb'))
pickle.dump(nn, open('../model.pkl', 'wb'))