# Text Analysis through Natural Language Processing, Understanding and Classification

# 1 - Importing NLP packages

In [1]:
from wordcloud import WordCloud, STOPWORDS
from yellowbrick.text import FreqDistVisualizer
from sklearn.feature_extraction import stop_words
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from pactools.grid_search import GridSearchCVProgressBar

from gensim import models, corpora
from gensim.sklearn_api import HdpTransformer
from gensim.test.utils import common_dictionary, common_corpus
from gensim.scripts.glove2word2vec import glove2word2vec

from textblob import TextBlob, Word
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import *

import string
import re
import scipy
import pyLDAvis

import pandas as pd # to dataframes
import matplotlib.pyplot as plt #to define plot parameters
import seaborn as sns #to graphical plots
import numpy as np #to math
plt.style.use('ggplot') # to plot graphs with gggplot2 style

import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
  return f(*args, **kwds)


In [2]:
EDA=pd.read_excel('../CannaConnect/Dataset/EDA.xlsx')
EDA.head()

Unnamed: 0.1,Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,Effect_1,Effect_2,Effect_3,Effect_4,Effect_5,Flavor_1,Flavor_2,Flavor_3,Flavor_4
0,0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus,
1,1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,Relaxed,Aroused,Creative,Happy,Energetic,Flowery,Violet,Diesel,
2,2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,Uplifted,Happy,Relaxed,Energetic,Creative,Spicy/Herbal,Sage,Woody,
3,3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,Tingly,Creative,Hungry,Relaxed,Uplifted,Apricot,Citrus,Grapefruit,
4,4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",Happy,Relaxed,Euphoric,Uplifted,Talkative,Citrus,Earthy,Orange,


# 2 - Initial NLP task

In [5]:
#Preprocessing the text in the Description, lower case, taking out special characters and stop words, lemmatization of the words
stopwords = set(STOPWORDS)
df_des=EDA.Description.astype(str)
df_lower_case_des=df_des.apply(lambda x: " ".join(x.lower() for x in x.split())) #lower case
df_special_characters_out_des=df_lower_case_des.map(lambda x: re.sub(r'\W+', ' ',x)) #remove special character
df_stop_words_out_des=df_special_characters_out_des.apply(lambda x: " ".join(x for x in x.split() if x not in stopwords)) #remove stopwords
df_lemma_des=df_stop_words_out_des.apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))  #lemmatize
#print the head of the lemmatize description text
print(df_lemma_des.head())

0    100 og 50 50 hybrid strain pack strong punch n...
1    98 aloha white widow especially potent cut whi...
2    1024 sativa dominant hybrid bred spain medical...
3    13 dawgs hybrid g13 chemdawg genetics bred can...
4    known kosher tangie 24k gold 60 indica dominan...
Name: Description, dtype: object


In [6]:
# How many numerical words for each row
df_digits = df_lemma_des.apply(lambda x: len([x for x in x.split() if x.isdigit()]))
print(df_digits.head())
print(df_digits.describe())

0    4
1    2
2    2
3    4
4    3
Name: Description, dtype: int64
count    2349.000000
mean        1.029800
std         1.380985
min         0.000000
25%         0.000000
50%         0.000000
75%         2.000000
max         8.000000
Name: Description, dtype: float64


In [7]:
# Taking out the numbers of each description
df_numeric_out = df_lemma_des.str.replace('\\d+', '')
print(df_numeric_out.head())

0     og   hybrid strain pack strong punch name sup...
1     aloha white widow especially potent cut white...
2     sativa dominant hybrid bred spain medical see...
3     dawgs hybrid g chemdawg genetics bred canadia...
4    known kosher tangie k gold  indica dominant hy...
Name: Description, dtype: object


In [8]:
#Step 1: Identification of Common words
#Step 2: I could but i will not remove the common words at this point
common_words = pd.Series(' '.join(df_stop_words_out_des).split()).value_counts()[:10]
print(common_words)

strain      2627
effects     1469
s           1377
indica      1366
hybrid      1252
sativa      1060
og          1051
kush        1037
dominant     941
aroma        870
dtype: int64


In [9]:
#Step 1: Identification of Rare words
#Step 2: I could but i will not remove the rare words at this point
rare_words = pd.Series(' '.join(df_stop_words_out_des).split()).value_counts()[-10:]
print(rare_words)

outweigh      1
kuato         1
brilliance    1
gladly        1
alexis        1
draping       1
ckc           1
culls         1
botanica      1
godberry      1
dtype: int64


In [10]:
# Now the description text is ready for future analysis, saving the results into a dataframe
df_description_clean=df_numeric_out
df_description_clean.to_excel('../CannaConnect/Dataset/description_clean.xlsx')

In [11]:
# Visualization of the 50 most Frequent words
all_words = df_description_clean.str.split(expand=True).unstack().value_counts()
data = [go.Bar(
            x = all_words.index.values[2:50],
            y = all_words.values[2:50],
            marker= dict(colorscale='Jet',
                         color = all_words.values[2:100]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Top 50 (Uncleaned) Word frequencies in the training dataset'
)

fig = go.Figure(data=data, layout=layout)

plot(fig, filename='basic-bar.html')

'file://C:\\Users\\dastous\\Desktop\\MMAI\\CannaConnect\\basic-bar.html'