In [1]:
import pandas as pd
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline

In [2]:
# Load the raw strain data straight from the project's GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/PT-Med-Cabinet-7/Data-Science/master/data_wrangling/raw_csv/cannabis.csv'
medcab = pd.read_csv(DATA_URL)

# Report the frame's dimensions, then the number of missing values per column.
missing_per_column = medcab.isna().sum()
print(medcab.shape)
print(missing_per_column)

# Preview the first five rows.
medcab.head(5)

(2351, 6)
Strain          0
Type            0
Rating          0
Effects         0
Flavor         46
Description    33
dtype: int64


Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [3]:
# Replace every missing value with the literal string "None"
# (per the counts printed above, only Flavor and Description contain NaNs).
medcab = medcab.fillna('None')

In [4]:
# Get the distinct values of the Effects feature.
# Each row is a comma-separated string, so split it into one list per strain.
effects = medcab.Effects.str.split(',')

# Flatten into a set WITHOUT mutating the per-row lists: popping the items out
# would leave `effects` holding nothing but empty lists, and a loop index called
# `nn` would reuse the name given to the NearestNeighbors model further down.
total_effects = {effect for row_effects in effects for effect in row_effects}

print(total_effects)

{'Euphoric', 'Focused', 'Relaxed', 'Hungry', 'Creative', 'None', 'Sleepy', 'Tingly', 'Energetic', 'Giggly', 'Uplifted', 'Mouth', 'Talkative', 'Dry', 'Aroused', 'Happy'}


In [5]:
# Get the distinct values of the Flavor feature.
# Each row is a comma-separated string, so split it into one list per strain.
flavor = medcab.Flavor.str.split(',')

# Flatten into a set WITHOUT mutating the per-row lists: popping the items out
# would leave `flavor` holding nothing but empty lists, and a loop index called
# `nn` would reuse the name given to the NearestNeighbors model further down.
total_flavor = {taste for row_flavors in flavor for taste in row_flavors}

print(total_flavor)

{'Lavender', 'Orange', 'Berry', 'Nutty', 'Ammonia', 'Cheese', 'Pine', 'Pineapple', 'Apricot', 'Butter', 'Tobacco', 'Lime', 'Flowery', 'Pepper', 'Lemon', 'Blueberry', 'Violet', 'Sweet', 'Chemical', 'None', 'Strawberry', 'Fruit', 'Tar', 'Grape', 'Pungent', 'Grapefruit', 'Earthy', 'Diesel', 'Minty', 'Rose', 'Tree', 'Citrus', 'Coffee', 'Peach', 'Menthol', 'Blue', 'Pear', 'Sage', 'Apple', 'Tropical', 'Woody', 'Honey', 'Tea', 'Skunk', 'Mango', 'Mint', 'Vanilla', 'Plum', 'Spicy/Herbal', 'Chestnut'}


### NLP Model

In [6]:
# Instantiate vectorizer object: unigrams and bigrams, keeping only terms that
# occur in at least 5 descriptions and in at most half of all descriptions.
tfidf = TfidfVectorizer(ngram_range=(1, 2),
                        min_df=5,
                        max_df=0.5)

# Create a vocabulary and tf-idf score per document (kept under its own name so
# `dtm` only ever refers to the DataFrame).
dtm_sparse = tfidf.fit_transform(medcab["Description"])

# Get feature names to use as dataframe column headers.
# `get_feature_names` was removed from newer scikit-learn releases in favour of
# `get_feature_names_out`; use whichever one the installed version provides.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# toarray() gives a plain ndarray; todense() would give the legacy np.matrix type.
dtm = pd.DataFrame(dtm_sparse.toarray(), columns=feature_names)

# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(2351, 7911)


Unnamed: 0,10,10 week,10 weeks,100,11,11 week,11 weeks,12,13,14,14 weeks,15,15 20,16,16 and,17,18,18 and,19,1960s,1970s,1980s,1990s,1996,1st,1st hawaiian,1st place,1st prize,20,20 and,20 indica,20 sativa,20 thc,2000,2002,2003,2004,2005,2006,2007,...,yields of,yields with,you,you are,you can,you feeling,you get,you happy,you have,you in,you into,you ll,you may,you might,you need,you out,you re,you stay,you stuck,you the,you to,you will,you with,your,your appetite,your body,your day,your dosage,your face,your favorite,your head,your mind,your mood,your nose,yourself,zest,zesty,zesty lemon,zion,zombie
0,0.0,0.0,0.0,0.302617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077912,0.100318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.368261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.097944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Build the nearest-neighbour index used to look up similar descriptions.
nn = NearestNeighbors(
    algorithm='kd_tree',
    n_neighbors=5,
)
nn.fit(dtm)  # Fit on the document-term matrix produced by the TF-IDF vectorization.

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [8]:
# Test sample.
sample = ['I would like something that puts me to sleep and relax.']
sample_transformed = tfidf.transform(sample)

# Densify with toarray(): todense() returns np.matrix, which newer scikit-learn
# releases refuse as estimator input.
results = nn.kneighbors(sample_transformed.toarray())  # (distances, row positions)
results

(array([[1.31026281, 1.3112083 , 1.31636839, 1.32334878, 1.34047976]]),
 array([[2049, 1351, 1049, 1274, 1218]]))

In [9]:
# Checking the output result: description of the closest match for the sample.
# The row position is read from `results` rather than hard-coded, so this cell
# stays correct if the data or the fitted model changes.
medcab.Description.iloc[results[1][0][0]]

'When you’re looking to wind down and relax, Super Kush is there to lend a hand. Daughter of Northern Lights #5 and Hindu Kush, Super Kush is a very clear-headed indica. This strain is best for users who would like to ease stress without being stuck on the couch. An all-around easy strain for beginning cannabis users, Super Kush is both mild in effects and easy to grow. This strain grows best indoors and typically has an early flowering time of 7-8 weeks.'

### Test your input here!

In [10]:
# Interactive check: recommend the closest strain for free-text user input.

user_input = input()

user_input_transformed = tfidf.transform([user_input])
# Densify with toarray(): todense() returns np.matrix, which newer scikit-learn
# releases refuse as estimator input.
user_result = nn.kneighbors(user_input_transformed.toarray())

# kneighbors returns (distances, positions); take the nearest neighbour of the
# single query row. The positions are row offsets, hence .iloc.
top_match = user_result[1][0][0]
one_strain = medcab['Strain'].iloc[top_match]
one_description = medcab['Description'].iloc[top_match]

print('---------')
print('Based on your input we believe you will like the following strains: \n')
print(f'Strain: {one_strain}\n')
print(f'Description: {one_description}\n')

I would like something that puts me to sleep and relax.
---------
Based on your input we believe you will like the following strains: 

Strain: Super-Kush

Description: When you’re looking to wind down and relax, Super Kush is there to lend a hand. Daughter of Northern Lights #5 and Hindu Kush, Super Kush is a very clear-headed indica. This strain is best for users who would like to ease stress without being stuck on the couch. An all-around easy strain for beginning cannabis users, Super Kush is both mild in effects and easy to grow. This strain grows best indoors and typically has an early flowering time of 7-8 weeks.



### Pipeline

In [11]:
# Function to assemble the model in Flask.

def production_model(text, vectorizer=None, model=None):
    """Run a nearest-neighbour query for a free-text request.

    Parameters
    ----------
    text : str
        The user's request; it is vectorized as a single document.
    vectorizer : optional
        Object exposing ``transform(list_of_str)`` whose result has a
        ``toarray()`` method. Defaults to the notebook-level ``tfidf``.
    model : optional
        Object exposing ``kneighbors(array)``. Defaults to the notebook-level
        ``nn``.

    Returns
    -------
    Whatever ``model.kneighbors`` returns for the vectorized text; with the
    fitted NearestNeighbors used in this notebook that is a
    ``(distances, indices)`` tuple.
    """
    if vectorizer is None:
        vectorizer = tfidf
    if model is None:
        model = nn

    input_transformed = vectorizer.transform([text])
    # toarray() yields a plain ndarray; todense() yields np.matrix, which newer
    # scikit-learn releases refuse as estimator input.
    input_results = model.kneighbors(input_transformed.toarray())

    # Return THIS call's result. Returning the notebook-level `results` would
    # hand back the stale answer for the earlier test sample on every call.
    return input_results


In [12]:
# Look up the strain names of the neighbours found for the test sample.

neighbor_rows = results[1][0]
strains = medcab['Strain'].iloc[neighbor_rows].to_list()
strains

['Super-Kush',
 'Marionberry-Kush',
 'Holy-Ghost',
 'Lemon-Sativa',
 'La-Confidential']

In [13]:
# Json function to be used in flask.

def jsonify(strains, limit=1, data=None):
    """Serialise the rows of the recommended strains to JSON strings.

    Parameters
    ----------
    strains : list of str
        Strain names, best match first.
    limit : int or None, default 1
        How many leading entries of ``strains`` to serialise. The default
        keeps the original single-recommendation behaviour; ``None`` means
        all of them.
    data : pandas.DataFrame, optional
        Frame to look the strains up in. It must have a ``Strain`` column and
        six columns in total (strain, type, rating, effects, flavor,
        description, in that order). Defaults to the notebook-level ``medcab``.

    Returns
    -------
    list of str
        One JSON document per serialised strain, each mapping the lower-case
        column names (plus ``id``, the row's index label in ``data``) to
        ``{row_number: value}`` dictionaries. Empty when ``strains`` is empty.
    """
    if data is None:
        data = medcab

    recommendation_dictionaries = []

    # Slicing never raises, so an empty or short `strains` list simply yields
    # fewer documents instead of an IndexError.
    for strain_name in strains[:limit]:
        # reset_index() turns the original row label into the first column,
        # which is then exposed as "id".
        rec = data[data['Strain'] == strain_name].reset_index()
        rec.columns = ['id', 'strain', 'type', 'rating', 'effect', 'flavor', 'description']
        recommendation_dictionaries.append(json.dumps(rec.to_dict()))
    return recommendation_dictionaries

In [14]:
# Output: serialise the recommended strains and display the result.

test = jsonify(strains=strains)
test

['{"id": {"0": 2049}, "strain": {"0": "Super-Kush"}, "type": {"0": "indica"}, "rating": {"0": 4.2}, "effect": {"0": "Relaxed,Happy,Euphoric,Giggly,Aroused"}, "flavor": {"0": "Earthy,Minty,Flowery"}, "description": {"0": "When you\\u2019re looking to wind down and relax, Super Kush is there to lend a hand. Daughter of Northern Lights #5 and Hindu Kush, Super Kush is a very clear-headed indica. This strain is best for users who would like to ease stress without being stuck on the couch. An all-around easy strain for beginning cannabis users, Super Kush is both mild in effects and easy to grow. This strain grows best indoors and typically has an early flowering time of 7-8 weeks."}}']

### Using Pickle

In [None]:
# Pickle the files.

import pickle

In [None]:
# Dump the tfidf with Pickle
tfidf_pkl_filename = 'tfidf_vectorizer.pkl'

# Write inside a context manager so the file is flushed and closed even if
# pickling raises part-way through.
with open(tfidf_pkl_filename, 'wb') as tfidf_pkl:
    pickle.dump(tfidf, tfidf_pkl)

In [None]:
# Dump the NearestNeighbors with Pickle
nn_pkl_filename = 'nearestneighbors.pkl'

# Write inside a context manager so the file is flushed and closed even if
# pickling raises part-way through.
with open(nn_pkl_filename, 'wb') as nn_pkl:
    pickle.dump(nn, nn_pkl)