# Explore Cannabis Data

In [1]:
# ALL IMPORTS
import pandas as pd
import pickle
import spacy
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Download the medium English spaCy model (en_core_web_md); it is loaded further down with spacy.load
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.3.1/en_core_web_md-2.3.1.tar.gz (50.8 MB)
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_md')


### Import Data

In [15]:
# Load the strain table from the CSV file and preview its first rows
df = pd.read_csv(filepath_or_buffer='cannabis.csv')
df.head(n=5)

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


### Wrangle Data

In [3]:
def list_effects(data=df.Effects):
    """Aggregate every effect listed in a column of comma-separated effects.

    Args:
        data (pandas.Series): Column whose entries are comma-separated effect
            strings. Defaults to the 'Effects' column of ``df`` as it exists
            when this function is defined.
    Returns:
        effects_list (list): Lower-cased effects from every entry, in order of
            appearance, duplicates included.
    """
    effects_list = []

    # Walk the values of `data` itself instead of range(len(df)) + data[i]:
    # the result then no longer depends on the global frame's current length
    # or on `data` having a 0..n-1 integer index.
    for entry in data:
        # Missing values (NaN) are not strings and carry no effects
        if not isinstance(entry, str):
            continue
        for effect in entry.split(","):
            # Strip stray whitespace so " Happy" and "Happy" count as one effect
            effects_list.append(effect.strip().lower())

    return effects_list

In [4]:
# Collapse the aggregated effects into a set so each effect appears once
all_effects = set()
all_effects.update(list_effects())
all_effects

{'aroused',
 'creative',
 'dry',
 'energetic',
 'euphoric',
 'focused',
 'giggly',
 'happy',
 'hungry',
 'mouth',
 'none',
 'relaxed',
 'sleepy',
 'talkative',
 'tingly',
 'uplifted'}

In [5]:
### Pickle effects list
# Disabled: uncomment the line below to write `all_effects` to 'effects_list.pkl'.
# NOTE(review): as written the open() handle is never closed; prefer a `with open(...)` block if this is re-enabled.
#pickle.dump(all_effects, open('effects_list.pkl', 'wb'))

In [16]:
# Remove rows with NaN description values.
# drop=True discards the old row labels instead of inserting them as an extra
# 'index' column, so re-running this cell leaves the frame unchanged rather
# than piling up 'index'/'level_0' columns.
df = df[~df["Description"].isnull()].reset_index(drop=True)

In [7]:
# Peek at the description stored under row label 0
df["Description"][0]

'$100 OG is a 50/50 hybrid strain that packs a strong punch. The name supposedly refers to both its strength and high price when it first started showing up in Hollywood. As a plant, $100 OG tends to produce large dark green buds with few stems. Users report a strong body effect of an indica for pain relief with the more alert, cerebral feeling thanks to its sativa side.'

### Natural Language Processing

In [20]:
# Load the spaCy pipeline shared by the preprocessing and vectorising cells
nlp = spacy.load(name="en_core_web_md")

In [21]:
def preprocessor(doc):
    """Normalise a raw text into a space-separated string of lemmas.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Stop
    words and punctuation tokens are dropped, and the lemma of every
    remaining token is lower-cased.

    Args:
        doc (str): Raw text to normalise.
    Returns:
        str: The kept, lower-cased lemmas joined by single spaces.
    """
    kept_lemmas = []
    for tok in nlp(doc):
        # Skip tokens that carry no content for the vectoriser
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_.lower())

    return " ".join(kept_lemmas)

In [22]:
# Alternative: represent each description by its spaCy document vector
def get_word_vectors(docs):
    """Return the spaCy document vector of every text in ``docs``.

    Args:
        docs (iterable of str): Texts to embed, e.g. a list or a pandas Series
            of strings. Pass a collection of texts, not a single string:
            a lone string would be iterated character by character.
    Returns:
        list: One ``Doc.vector`` array per input text, in input order.
    """
    # nlp.pipe streams the texts through the pipeline in batches, which is
    # much faster than calling nlp() once per text.
    return [doc.vector for doc in nlp.pipe(docs)]

# Call the helper on the whole column. Series.apply would hand it one string
# at a time, making it run the pipeline on every single character (hence the
# run that had to be interrupted) and store per-character vectors.
df['word_vects'] = get_word_vectors(df.Description)

KeyboardInterrupt: 

In [20]:
# TF-IDF representation of the descriptions; each text is first normalised by `preprocessor`
vect = TfidfVectorizer(preprocessor=preprocessor)
# TODO: Pickle this vectorizer
# fit_transform learns the vocabulary/IDF weights and builds the document-term
# matrix in one pass, so the spaCy-backed preprocessor runs once per
# description instead of twice (fit, then transform).
dtm = vect.fit_transform(df['Description'])

In [23]:
# Free-text query, wrapped in a list so it is treated as a batch of one text
my_desc = [
    "I want something that tastes fruity and is very potent. I need something that will help with pain relief and help me feel calm.",
]
# Embed each query text as its spaCy document vector
my_desc_vect = list(map(lambda text: nlp(text).vector, my_desc))

In [26]:
### Model
nn = NearestNeighbors(algorithm='kd_tree', n_jobs=-1)
# fit() expects a 2-D (n_samples, n_features) array-like. The 'word_vects'
# column is a 1-D object Series whose elements are arrays, so hand it over as
# a list of equal-length vectors that converts cleanly to a 2-D array.
nn.fit(list(df["word_vects"]))



NearestNeighbors(algorithm='kd_tree', n_jobs=-1)

In [29]:
# Query the fitted model: distances to, and row positions of, the nearest neighbours of the query vector
dist, ind = nn.kneighbors(my_desc_vect, return_distance=True)

In [34]:
# Distances from the query description to each returned neighbour
# NOTE(review): the recorded output shows four distances of exactly 1.0 — check which data `nn` was actually fitted on
dist

array([[1.        , 1.        , 1.        , 1.        , 1.26573812]])

In [33]:
# Inspect one strain description by row label
# NOTE(review): 1722 is hard-coded; presumably it was read off `ind` from the kneighbors call — confirm, or index with `ind` directly
df["Description"][1722]

'A spin on the citrus-loaded Tangie sativa, Purple Tangie is a flavorful strain that invigorates the spirit with uplifting euphoria. In a show of its quality, Purple Tangie took 1st place in the “Best Medical Sativa Concentrate” category at the 2016 High Times Cannabis Cup in Los Angeles. You may find that Purple Tangie offers an energetic kick to your social skills, but this strain can certainly be enjoyed in isolation, especially when you’re occupied by creative projects.'

### NLP results are not very impressive; try an LSTM-based analysis instead.

In [17]:
def avg_desc_length(data=df.Description):
    """Compute the average length, in characters, of the texts in ``data``.

    Args:
        data (pandas.Series): Column of description strings. Defaults to the
            'Description' column of ``df`` as it exists when this function is
            defined.
    Returns:
        int: Total number of characters floor-divided by the number of
            entries; 0 when ``data`` is empty.
    Raises:
        TypeError: If an entry has no length (e.g. a NaN float).
    """
    # Measure the values of `data` itself instead of range(len(df)) + data[i],
    # so the result does not depend on the global frame's length or on `data`
    # having a 0..n-1 integer index.
    lengths = [len(text) for text in data]

    # Avoid dividing by zero on an empty column
    if not lengths:
        return 0

    return sum(lengths) // len(lengths)

In [19]:
# Average description length in characters (floor-divided), over the default Description column
avg_desc_length()

454

In [None]:
# Fixed sequence length, passed to the Embedding layer below as input_length
# NOTE(review): presumably chosen to sit just above the average description length of 454 — but that average counts characters, not tokens; confirm the unit
seq_length = 500


In [None]:
### Model (Baseline)
# NOTE(review): Sequential, Embedding and LSTM are not imported in the visible import cell — add the import before running this cell
baseline = Sequential()

# NOTE(review): len() with no argument raises TypeError; the first Embedding argument is presumably meant to be the vocabulary size — fill in once the tokenizer exists
baseline.add(Embedding(len(), 64, input_length=seq_length))
baseline.add(LSTM(128))