# From the Description, can we predict the Rating?

In [1]:
import re
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LinearRegression, SGDClassifier, LogisticRegression


In [15]:
# Load the cleaned strain descriptions from the project dataset folder.
df_review = pd.read_excel("../CannaConnect/Dataset/description_clean.xlsx")

# Cast descriptions to text but keep missing entries as NaN. A bare
# astype(str) turns NaN into the literal string "nan", which a later dropna()
# cannot remove and which a text vectorizer would treat as a real word.
description_col = df_review["Description"]
df = description_col.astype(str).where(description_col.notna())
print(df.shape)
df.head()

(2349,)


0     og   hybrid strain pack strong punch name sup...
1     aloha white widow especially potent cut white...
2     sativa dominant hybrid bred spain medical see...
3     dawgs hybrid g chemdawg genetics bred canadia...
4    known kosher tangie k gold  indica dominant hy...
Name: Description, dtype: object

In [3]:
# Load the full strain table (type, rating, effects, flavors, raw description).
EDA = pd.read_excel('../CannaConnect/Dataset/EDA.xlsx')
EDA.head(n=5)

Unnamed: 0.1,Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,Effect_1,Effect_2,Effect_3,Effect_4,Effect_5,Flavor_1,Flavor_2,Flavor_3
0,0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus
1,1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,Relaxed,Aroused,Creative,Happy,Energetic,Flowery,Violet,Diesel
2,2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,Uplifted,Happy,Relaxed,Energetic,Creative,Spicy/Herbal,Sage,Woody
3,3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,Tingly,Creative,Hungry,Relaxed,Uplifted,Apricot,Citrus,Grapefruit
4,4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",Happy,Relaxed,Euphoric,Uplifted,Talkative,Citrus,Earthy,Orange


In [16]:
# Narrow the strain table to the target column only. Reindexing on columns
# returns a new frame that holds just 'Rating'; note this rebinds `EDA`.
EDA = EDA.reindex(columns=['Rating'])
EDA.head(n=5)

Unnamed: 0,Rating
0,4.0
1,4.7
2,4.4
3,4.2
4,4.6


In [17]:
# Put the description text and the ratings side by side; pandas pairs the
# rows by index label.
des_type = pd.concat((df, EDA), axis='columns')
des_type.shape

(2349, 2)

In [18]:
# Discard every row that has a missing value in any column.
des_type = des_type.dropna(axis='index', how='any')
des_type.shape

(2277, 2)

In [19]:
# Sanity-check the first rows of the combined, NaN-free frame.
des_type.head(n=5)

Unnamed: 0,Description,Rating
0,og hybrid strain pack strong punch name sup...,4.0
1,aloha white widow especially potent cut white...,4.7
2,sativa dominant hybrid bred spain medical see...,4.4
3,dawgs hybrid g chemdawg genetics bred canadia...,4.2
4,known kosher tangie k gold indica dominant hy...,4.6


In [38]:
# Features: the description text.
X = des_type.Description.astype(str)
# Target: the rating cast to an integer. The cast drops the fractional part
# (e.g. 4.7 -> 4), so each whole-number bucket becomes one class label.
y = des_type.Rating.astype(int)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Text pipeline: token counts -> TF-IDF weighting -> multinomial Naive Bayes.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
model = nb.fit(X_train, y_train)

# Accuracy on the data the model was fitted on versus the held-out rows.
nb_train_accuracy = nb.score(X_train, y_train)
nb_test_accuracy = nb.score(X_test, y_test)
print('Accuracy of NB classifier on training set: {:.2f}'.format(nb_train_accuracy))
print('Accuracy of NB classifier on test set: {:.2f}'.format(nb_test_accuracy))

# MAE treats the predicted and true class labels as plain numbers, so it
# measures how far off the predicted label is on average.
y_pred = nb.predict(X_test)
nb_test_mae = mean_absolute_error(y_test, y_pred)
print('MAE of NB classifier on test set: {:.2f}'.format(nb_test_mae))

# Per-class breakdown: rows of the matrix are true labels, columns predictions.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy of NB classifier on training set: 0.83
Accuracy of NB classifier on test set: 0.86
MAE of NB classifier on test set: 0.21
[[  0   0  11   0]
 [  0   0  17   0]
 [  0   0 392   0]
 [  0   0  36   0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           3       0.00      0.00      0.00        17
           4       0.86      1.00      0.92       392
           5       0.00      0.00      0.00        36

   micro avg       0.86      0.86      0.86       456
   macro avg       0.21      0.25      0.23       456
weighted avg       0.74      0.86      0.79       456



  'precision', 'predicted', average, warn_for)


In [40]:
# Text pipeline: token counts -> TF-IDF weighting -> linear model trained
# with stochastic gradient descent.
# SGDClassifier shuffles the training data between epochs, so without a fixed
# random_state the scores below change on every run. Pin the seed (same value
# used for the train/test split) so Restart & Run All reproduces the output.
sgd=Pipeline([('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', SGDClassifier(random_state=42)),
            ])
model=sgd.fit(X_train, y_train)

# Accuracy on the data the model was fitted on versus the held-out rows.
print('Accuracy of SGD classifier on training set: {:.2f}'.format(sgd.score(X_train, y_train)))
print('Accuracy of SGD classifier on test set: {:.2f}'.format(sgd.score(X_test, y_test)))

# MAE treats the predicted and true class labels as plain numbers, so it
# measures how far off the predicted label is on average.
y_pred=sgd.predict(X_test)
print('MAE of SGD classifier on test set: {:.2f}'.format(mean_absolute_error(y_test, y_pred)))

# Per-class breakdown: rows of the matrix are true labels, columns predictions.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy of SGD classifier on training set: 1.00
Accuracy of SGD classifier on test set: 0.84
MAE of SGD classifier on test set: 0.24
[[  4   0   4   3]
 [  0   0  16   1]
 [  2   3 374  13]
 [  1   0  30   5]]
              precision    recall  f1-score   support

           0       0.57      0.36      0.44        11
           3       0.00      0.00      0.00        17
           4       0.88      0.95      0.92       392
           5       0.23      0.14      0.17        36

   micro avg       0.84      0.84      0.84       456
   macro avg       0.42      0.36      0.38       456
weighted avg       0.79      0.84      0.81       456





In [41]:
# Text pipeline: token counts -> TF-IDF weighting -> logistic regression.
logreg = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])
model = logreg.fit(X_train, y_train)

# Accuracy on the data the model was fitted on versus the held-out rows.
logreg_train_accuracy = logreg.score(X_train, y_train)
logreg_test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of LR classifier on training set: {:.2f}'.format(logreg_train_accuracy))
print('Accuracy of LR classifier on test set: {:.2f}'.format(logreg_test_accuracy))

# MAE treats the predicted and true class labels as plain numbers, so it
# measures how far off the predicted label is on average.
y_pred = logreg.predict(X_test)
logreg_test_mae = mean_absolute_error(y_test, y_pred)
print('MAE of LR classifier on test set: {:.2f}'.format(logreg_test_mae))

# Per-class breakdown: rows of the matrix are true labels, columns predictions.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy of LR classifier on training set: 0.83
Accuracy of LR classifier on test set: 0.86
MAE of LR classifier on test set: 0.21
[[  0   0  11   0]
 [  0   0  17   0]
 [  0   0 392   0]
 [  0   0  36   0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           3       0.00      0.00      0.00        17
           4       0.86      1.00      0.92       392
           5       0.00      0.00      0.00        36

   micro avg       0.86      0.86      0.86       456
   macro avg       0.21      0.25      0.23       456
weighted avg       0.74      0.86      0.79       456



  'precision', 'predicted', average, warn_for)
