# Data Science Wordclouds

This notebook exercises each function in an end-to-end data science workflow: loading posts from a chosen set of subreddits and running NLP on them.

**Future**
- DataFrame (DF) helper functions should be moved out of this workflow notebook and imported from a class instead.
- Multiple example DS outcomes would be nice.

---

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from PIL import Image
import wordcloud
import time
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from glob import glob
from pprint import pprint

In [5]:
from util import databases
from util import dataloader
from util import grid_models
from util.reddit_functions import Reddit

ModuleNotFoundError: No module named 'databases'

In [None]:
!pwd

In [None]:
# Subreddits to load posts for (lower-case names).
subreddit_list = [
    'datascience',
    'machinelearning',
    'dataengineering',
    'python',
    'aws',
]

In [None]:
# Load the posts for the selected subreddits through the project's dataloader.
# NOTE(review): 'sqlite' presumably selects the SQLite-backed source; the result is used
# below as a pandas DataFrame with `subreddit` and `title` columns - confirm in util/dataloader.
df = dataloader.data_selector(subreddit_list, 'sqlite')

In [None]:
# Drop subreddits for which no data was retrieved.
# The set of subreddits present in `df` is built once, rather than calling
# `unique()` again for every element of the list.
available_subreddits = set(df.subreddit.unique())
subreddit_list = [sub for sub in subreddit_list if sub in available_subreddits]
subreddit_list

In [None]:
# Preview up to 10 random rows. Capping at len(df) avoids the ValueError that
# DataFrame.sample raises when more rows are requested than exist (replace=False).
df.sample(min(10, len(df)))

In [None]:
# Words filtered out of the clouds in addition to the standard English stop words.
useless_words = {
    'using', 'lambda', 's3', 'does', 'looking',
    'help', 'new', 'data', 'science', 'machine',
    'learning', 'use', 'need', 'engineer', 'engineering',
}

# Final stop-word set: sklearn's English list + the subreddit names + the extra words above.
custom_stop_words = ENGLISH_STOP_WORDS | frozenset(subreddit_list) | useless_words

In [None]:
# NOTE: pass `data=` to choose the DataFrame explicitly; when it is omitted the
# notebook-level `df` is used, so calls written before `data` existed still work.


def _render_cloud(text, height, width, max_words, stopwords, colormap, background_color):
    """Build one WordCloud and generate it from ``text``.

    ``colormap='random'`` is resolved inside this helper, so every call (and
    therefore every cloud) draws its own colormap.
    """
    if colormap == 'random':
        # Pick from matplotlib's colormap names, leaving out the reversed "_r" variants.
        candidates = [name for name in plt.cm.datad if not name.endswith("_r")]
        colormap = str(np.random.choice(candidates))
    cloud = wordcloud.WordCloud(max_words=max_words,
                                width=width,
                                height=height,
                                background_color=background_color,
                                colormap=colormap,
                                stopwords=stopwords)
    return cloud.generate(text)


def make_cloud(text_column, labels_column=None, height=300, width=800, max_words=100, split=None, stopwords=None, colormap='viridis', background_color='black', data=None):
    """
    Draw word cloud(s) from a text column of a DataFrame.

    Inputs:
    text_column: name of the column holding the text to visualise
    labels_column: name of the column holding the labels; required when split is truthy
    height: height in pixels of each wordcloud
    width: width in pixels of each wordcloud
    max_words: max words for each wordcloud
    split: if truthy, one wordcloud (in its own matplotlib figure) per distinct label
    stopwords: words to exclude; usually the same stopwords used by the transformer
        (CountVectorizer or Tfidf)
    colormap: any matplotlib colormap name.  Find them with plt.cm.datad
        'random': picks a random colormap for each cloud.
    background_color: background colour passed to WordCloud
    data: DataFrame to read from; defaults to the notebook-level ``df``

    Returns:
    A PIL image of the cloud when split is falsy; None when split is truthy
    (the clouds are drawn with matplotlib instead).

    Raises:
    ValueError: if split is truthy and labels_column is None.
    """
    # Fall back to the notebook-level frame so existing calls keep working.
    frame = df if data is None else data

    if not split:
        # sep=' ' keeps the last word of one row from fusing with the first word of the next.
        text = frame[text_column].str.cat(sep=' ')
        cloud = _render_cloud(text, height, width, max_words,
                              stopwords, colormap, background_color)
        return cloud.to_image()

    if labels_column is None:
        raise ValueError("labels_column must be provided when split is truthy")

    # Missing labels are skipped: they match no rows and would yield an empty cloud.
    for label in frame[labels_column].dropna().unique():
        rows = frame.loc[frame[labels_column] == label, text_column]
        cloud = _render_cloud(rows.str.cat(sep=' '), height, width, max_words,
                              stopwords, colormap, background_color)
        # One figure per cloud, sized to a single cloud (pixels / dpi).
        plt.figure(figsize=(width / 100, height / 100), dpi=100)
        plt.title(str(label).upper(), fontdict={'fontsize': 15})
        plt.axis("off")
        plt.imshow(cloud.to_image(), interpolation='bilinear')

In [None]:
# Single cloud built from the `title` column, with the custom stop words removed.
make_cloud(
    text_column='title',
    stopwords=custom_stop_words,
    colormap='rainbow',
)

In [None]:
# Split mode: one cloud per distinct `subreddit` value, drawn with colormap='random'.
make_cloud(
    text_column='title',
    labels_column='subreddit',
    split=True,
    stopwords=custom_stop_words,
    colormap='random',
    background_color='black',
)

In [None]:
# Build the cloud mask from the image: pixels darker than 200 (grayscale) become 255,
# everything else 0. Per the WordCloud docs, entries equal to 255 are masked out.
with Image.open('../images/reddit03.png') as img:
    gray = np.array(img.convert('L'))
mask = np.where(gray < 200, 255, 0)

wc = wordcloud.WordCloud(background_color='white',
                         max_words=500,
                         mask=mask,
                         colormap='Reds',
                         contour_color='orangered',
                         contour_width=1,
                         stopwords=custom_stop_words)
# `text_column` only exists as a make_cloud parameter, so the column is named explicitly.
# sep=' ' keeps words at title boundaries from being glued together.
wc.generate(df['title'].str.cat(sep=' '))
plt.figure(figsize=(12, 12))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')

plt.show()

In [None]:
# Build the cloud mask from the image: pixels darker than 200 (grayscale) become 255,
# everything else 0. Per the WordCloud docs, entries equal to 255 are masked out.
with Image.open('../images/reddit02.jpg') as img2:
    gray2 = np.array(img2.convert('L'))
mask2 = np.where(gray2 < 200, 255, 0)

wc2 = wordcloud.WordCloud(background_color='white',
                          max_words=1000,
                          mask=mask2,
                          colormap='Reds',
                          contour_color='orangered',
                          contour_width=1,
                          stopwords=custom_stop_words)
# `text_column` only exists as a make_cloud parameter, so the column is named explicitly.
# sep=' ' keeps words at title boundaries from being glued together.
wc2.generate(df['title'].str.cat(sep=' '))
plt.figure(figsize=(8, 12))
plt.imshow(wc2, interpolation='bilinear')
plt.axis('off')

plt.show()

In [None]:
# The image is used both as the cloud mask and as the source of the word colours.
with Image.open("../images/reddit06.jpg") as source_image:
    mask = np.array(source_image)

colorcloud = wordcloud.WordCloud(stopwords=custom_stop_words,
                                 background_color="white",
                                 mode="RGBA",
                                 max_words=1000,
                                 mask=mask)
# `text_column` only exists as a make_cloud parameter, so the column is named explicitly.
# sep=' ' keeps words at title boundaries from being glued together.
colorcloud.generate(df['title'].str.cat(sep=' '))

# Colour generator built from the same image array that serves as the mask.
image_colors = wordcloud.ImageColorGenerator(mask)

# Left: the source image; right: the recoloured cloud.
# (plt.subplots creates the figure, so no separate plt.figure call is needed.)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

ax1.imshow(mask)
ax1.axis('off')

ax2.imshow(colorcloud.recolor(color_func=image_colors),
           interpolation="bilinear")
ax2.axis('off')

plt.show()