## Recommender System for Cannabis Strains
Project goals: build a bag-of-words based recommender system for recommending cannabis strains similar to the one inputted by the user. This will help with accessibility issues for medical marijuana patients who may not have the prescribed strain available to them locally.

In [9]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame

#important nltk packages
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/esha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/esha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Data Import and Cleanup

In [2]:
#read in csv as a dataframe
df = pd.read_csv("cannabiscuratedbinned.csv")

#clean up and check column names
df = df[['Strain','Type','Rating','Effect1','Effect2','Effect3','Effect4','Effect5','Flavor1','Flavor2','Flavor3','Flavor4','Description']]
df.head()
print(list(df.columns))

df[["Description",'Strain','Type','Effect1','Effect2','Effect3','Effect4','Effect5','Flavor1','Flavor2','Flavor3','Flavor4']] = df[["Description",'Strain','Type','Effect1','Effect2','Effect3','Effect4','Effect5','Flavor1','Flavor2','Flavor3','Flavor4']].astype(str)

#initializing new column
df['Key_words'] = ""

df.loc[:, (df != 0).any(axis=0)]


['Strain', 'Type', 'Rating', 'Effect1', 'Effect2', 'Effect3', 'Effect4', 'Effect5', 'Flavor1', 'Flavor2', 'Flavor3', 'Flavor4', 'Description']


Unnamed: 0,Strain,Type,Rating,Effect1,Effect2,Effect3,Effect4,Effect5,Flavor1,Flavor2,Flavor3,Flavor4,Description,Key_words
0,100-Og,hybrid,4.0,Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,citrus,,$100 OG is a 50/50 hybrid strain that packs a ...,
1,98-White-Widow,hybrid,4.7,Relaxed,Aroused,Creative,Happy,Energetic,floral,floral,chemical,,The ‘98 Aloha White Widow is an especially pot...,
2,1024,sativa,4.4,Uplifted,Happy,Relaxed,Energetic,Creative,spice,plant,plant,,1024 is a sativa-dominant hybrid bred in Spain...,
3,13-Dawgs,hybrid,4.2,Tingly,Creative,Hungry,Relaxed,Uplifted,fruity,citrus,citrus,,13 Dawgs is a hybrid of G13 and Chemdawg genet...,
4,24K-Gold,hybrid,4.6,Happy,Relaxed,Euphoric,Uplifted,Talkative,citrus,Earthy,citrus,,"Also known as Kosher Tangie, 24k Gold is a 60%...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2222,Zeus-Og,hybrid,4.7,Happy,Uplifted,Relaxed,Euphoric,Energetic,Earthy,plant,plant,,Zeus OG is a hybrid cross between Pineapple OG...,
2223,Zkittlez,indica,4.6,Relaxed,Happy,Euphoric,Uplifted,Sleepy,Sweet,Berry,fruity,,Zkittlez is an indica-dominant mix of Grape Ap...,
2224,Zombie-Kush,indica,5.0,Relaxed,Sleepy,Talkative,Euphoric,Happy,Earthy,Sweet,spice,,Zombie Kush by Ripper Seeds comes from two dif...,
2225,Zombie-Og,indica,4.4,Relaxed,Sleepy,Euphoric,Happy,Hungry,Sweet,Earthy,Pungent,,If you’re looking to transform into a flesh-ea...,


[nltk_data] Downloading package stopwords to /Users/esha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/esha/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Extract Keywords using Rake

In [10]:
for index, row in df.iterrows():
    desc = row['Description']
    
    r = Rake()
    
    #pulling keywords from strain description
    r.extract_keywords_from_text(desc)
    
    #getting dictionary with key words as keys and scores as values
    key_words_dict_scores = r.get_word_degrees()
    
    #assigning key words to a new column for corresponding strain
    row['Key_words'] =  list(key_words_dict_scores.keys())

In [11]:
#dropping description and ratings columns
df.drop(columns = ['Description'], inplace = True)
df.drop(columns = ['Rating'], inplace = True)
df.drop(columns = ['Flavor4'], inplace = True)

### Build bag of words model

In [12]:
df['bag_of_words'] = df['Type']+df['Effect1']+df['Effect2']+df['Effect3']+df['Effect4']+df['Effect5']+df['Flavor1']+df['Flavor2']+df['Flavor3']+df['Key_words']


In [13]:
#generating the count matrix
count = CountVectorizer(analyzer ='char_wb',ngram_range=(2,3))
count_matrix = count.fit_transform(df['bag_of_words'])
count_matrix_2 = count_matrix.toarray()

In [15]:
#obesrving how the analyzer works
analyze = count.build_analyzer()
analyze('This is my analyzer test')

[' t',
 'th',
 'hi',
 'is',
 's ',
 ' th',
 'thi',
 'his',
 'is ',
 ' i',
 'is',
 's ',
 ' is',
 'is ',
 ' m',
 'my',
 'y ',
 ' my',
 'my ',
 ' a',
 'an',
 'na',
 'al',
 'ly',
 'yz',
 'ze',
 'er',
 'r ',
 ' an',
 'ana',
 'nal',
 'aly',
 'lyz',
 'yze',
 'zer',
 'er ',
 ' t',
 'te',
 'es',
 'st',
 't ',
 ' te',
 'tes',
 'est',
 'st ']

In [16]:
#generate cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix,count_matrix)

#sanity check -- trace of the similarity matrix is number of strains
'Diagonal (sum): ', np.trace(cosine_sim)

('Diagonal (sum): ', 2227.000000000001)

In [17]:
#create a series for the series for strain titles so they're ordered numerically 
indices = pd.Series(df.index)


In [18]:
#define function that takes strain index as input and returns top 10 strain recommendations
def recommendations(title, cosine_sim = cosine_sim):
    
    # initializing the empty list of recommended movies
    recommended_strains = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_strains.append(list(df.index)[i])
        
    return recommended_strains

In [20]:
#sample call to recommender     
recs = recommendations(1424) #strain 1424 is OG Sour Diesel

#printing the top 10 strains
for i in recs: 
    print(df.loc[i]['Strain'])

Lemon-Diesel
Orange-Diesel
Sour-Lemon-Og
Grapefruit-Diesel
Kali-Dog
Orange-Crush
Tahoe-Hydro-Champagne
Pagoda
Love-Potion
Citrix
