# From the Description can we predict the Type

In [21]:
import re
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LinearRegression, SGDClassifier, LogisticRegression


In [78]:
df_review=pd.read_excel("../CannaConnect/Dataset/description_clean.xlsx")
df=df_review.Description.astype(str)
df.head()

0     og   hybrid strain pack strong punch name sup...
1     aloha white widow especially potent cut white...
2     sativa dominant hybrid bred spain medical see...
3     dawgs hybrid g chemdawg genetics bred canadia...
4    known kosher tangie k gold  indica dominant hy...
Name: Description, dtype: object

In [79]:
EDA=pd.read_excel('../CannaConnect/Dataset/EDA.xlsx')
EDA.head()

Unnamed: 0.1,Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,Effect_1,Effect_2,Effect_3,Effect_4,Effect_5,Flavor_1,Flavor_2,Flavor_3
0,0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus
1,1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,Relaxed,Aroused,Creative,Happy,Energetic,Flowery,Violet,Diesel
2,2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,Uplifted,Happy,Relaxed,Energetic,Creative,Spicy/Herbal,Sage,Woody
3,3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,Tingly,Creative,Hungry,Relaxed,Uplifted,Apricot,Citrus,Grapefruit
4,4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",Happy,Relaxed,Euphoric,Uplifted,Talkative,Citrus,Earthy,Orange


In [80]:
EDA=pd.DataFrame(EDA,columns=['Type'])
EDA.head()

Unnamed: 0,Type
0,hybrid
1,hybrid
2,sativa
3,hybrid
4,hybrid


In [83]:
des_type=pd.concat([df,EDA],axis=1)
des_type.shape

(2349, 2)

In [84]:
des_type=des_type.dropna()
des_type.shape

(2277, 2)

In [85]:
des_type.head()

Unnamed: 0,Description,Type
0,og hybrid strain pack strong punch name sup...,hybrid
1,aloha white widow especially potent cut white...,hybrid
2,sativa dominant hybrid bred spain medical see...,sativa
3,dawgs hybrid g chemdawg genetics bred canadia...,hybrid
4,known kosher tangie k gold indica dominant hy...,hybrid


In [86]:
# one hot encoding
lb_make = LabelEncoder()
des_type['Type'] = lb_make.fit_transform(des_type['Type'].astype(str))

In [87]:
X=des_type['Description']
y=des_type['Type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [88]:
nb=Pipeline([('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', MultinomialNB()),
            ])
model=nb.fit(X_train, y_train)
print('Accuracy of NB classifier on training set: {:.2f}'.format(nb.score(X_train, y_train)))
print('Accuracy of NB classifier on test set: {:.2f}'.format(nb.score(X_test, y_test)))
y_pred=nb.predict(X_test)
print('MAE of NB classifier on test set: {:.2f}'.format(mean_absolute_error(y_test, y_pred)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy of NB classifier on training set: 0.61
Accuracy of NB classifier on test set: 0.54
MAE of NB classifier on test set: 0.62
[[242   0   0]
 [130   5   3]
 [ 76   0   0]]
              precision    recall  f1-score   support

           0       0.54      1.00      0.70       242
           1       1.00      0.04      0.07       138
           2       0.00      0.00      0.00        76

   micro avg       0.54      0.54      0.54       456
   macro avg       0.51      0.35      0.26       456
weighted avg       0.59      0.54      0.39       456



In [89]:
sgd=Pipeline([('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', SGDClassifier()),
            ])
model=sgd.fit(X_train, y_train)
print('Accuracy of SGD classifier on training set: {:.2f}'.format(sgd.score(X_train, y_train)))
print('Accuracy of SGD classifier on test set: {:.2f}'.format(sgd.score(X_test, y_test)))
y_pred=sgd.predict(X_test)
print('MAE of SGD classifier on test set: {:.2f}'.format(mean_absolute_error(y_test, y_pred)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy of SGD classifier on training set: 1.00
Accuracy of SGD classifier on test set: 0.78
MAE of SGD classifier on test set: 0.29
[[216  18   8]
 [ 41  91   6]
 [ 25   2  49]]
              precision    recall  f1-score   support

           0       0.77      0.89      0.82       242
           1       0.82      0.66      0.73       138
           2       0.78      0.64      0.71        76

   micro avg       0.78      0.78      0.78       456
   macro avg       0.79      0.73      0.75       456
weighted avg       0.78      0.78      0.78       456





In [90]:
logreg=Pipeline([('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression()),
            ])
model=logreg.fit(X_train, y_train)
print('Accuracy of LR classifier on training set: {:.2f}'.format(logreg.score(X_train, y_train)))
print('Accuracy of LR classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
y_pred=logreg.predict(X_test)
print('MAE of LR classifier on test set: {:.2f}'.format(mean_absolute_error(y_test, y_pred)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy of LR classifier on training set: 0.88
Accuracy of LR classifier on test set: 0.77
MAE of LR classifier on test set: 0.33
[[230   9   3]
 [ 50  85   3]
 [ 41   0  35]]
              precision    recall  f1-score   support

           0       0.72      0.95      0.82       242
           1       0.90      0.62      0.73       138
           2       0.85      0.46      0.60        76

   micro avg       0.77      0.77      0.77       456
   macro avg       0.82      0.68      0.72       456
weighted avg       0.80      0.77      0.76       456



