# Data Science Workflow

This is a notebook for testing all of the functions to go through an entire data science workflow of running NLP on chosen subreddits.

**Future**
- DF functions should be removed from workflow and imported using a class.
- Multiple example DS outcomes would be nice.

---

### This is gridsearch for each model

https://stackabuse.com/grid-search-optimization-algorithm-in-python/

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
import time
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from glob import glob
from pprint import pprint
from xgboost import XGBClassifier

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
from helpers import databases
from helpers import dataloader
from helpers import grid_models
from helpers.reddit_functions import Reddit

In [3]:
# subreddit_list = ['css', 'html', 'javascript', 'php', 'perl', 'java', 'datascience', 'machinelearning', 'etl', 'python', 'dataengineering']

In [4]:
subreddit_list = ['datascience','machinelearning','dataengineering','python','aws']

In [5]:
df = dataloader.data_selector(subreddit_list, 'sqlite')

Connection to SQLite DB successful


In [6]:
# get rid of list items with no data retrieved
subreddit_list = [sub for sub in subreddit_list if sub in df.subreddit.unique()]
subreddit_list

['datascience', 'machinelearning', 'dataengineering', 'python', 'aws']

In [7]:
df = dataloader.subreddit_encoder(df)

Subreddits and codes added: {'aws': 0, 'datascience': 1, 'machinelearning': 2, 'python': 3, 'dataengineering': 4}


In [8]:
df.sample(10)

Unnamed: 0,title,subreddit,date,sub_code
7086,[D][R] Persistent Memory Cloud Compute-- Early...,machinelearning,2020-04-02,2
5338,"Recommended CI/CD for a site using CloudFront,...",aws,2020-04-02,0
3257,Codecademy is giving its pro subscription for ...,python,2020-03-29,3
8766,What do you use for data analysis?,dataengineering,2020-04-02,4
6507,Data Science Project Suggestions,datascience,2020-04-02,1
3190,A safe Python twitter bot to post tweet using ...,python,2020-03-29,3
8842,A Data Engineer's Naive Foray Into Data Science,dataengineering,2020-04-02,4
7796,Why can't I install anaconda ?,python,2020-04-02,3
2136,[R] Kaggle Competition on COVID19 Dataset by A...,machinelearning,2020-03-29,2
7036,[D] I need some help with deploying an LSTM mo...,machinelearning,2020-04-02,2


In [9]:
X = df['title']
y = df['sub_code']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [11]:
useless_words = set(['using', 'help', 'new', 'data', 'science', 'machine', 'learning', 'use', 'need'])

custom_stop_words = ENGLISH_STOP_WORDS.union(subreddit_list, useless_words)

In [12]:
redfuncs = Reddit()

In [13]:
tfidf = TfidfVectorizer(stop_words=custom_stop_words)

In [35]:
mlp = MLPClassifier()

mlp_pipe = Pipeline(
    [('tfidf', tfidf),
     ('mlp', mlp)])

mlp_pipe_params = {
    "tfidf__ngram_range": [(1, 2),(2, 2),(2,3)],
#     "tfidf__max_features": [1000, 3000, 5000],
    "tfidf__max_df": [.75, .8, .85, .9],
    "tfidf__use_idf": [True],
    "tfidf__norm": ["l1", "l2"],
#     "mlp__hidden_layer_sizes": [50, 100, 200]
}


mlp_model = GridSearchCV(mlp_pipe, param_grid=mlp_pipe_params, cv=3, verbose=2, n_jobs=-1)

mlp_model.fit(X_train, y_train)
print(f'Train Score: {mlp_model.score(X_train, y_train)}')
print(f'Test Score: {mlp_model.score(X_test, y_test)}')
mlp_model.best_estimator_

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed: 61.4min


KeyboardInterrupt: 

In [22]:
xgb = XGBClassifier()

xgb_pipe = Pipeline(
    [('tfidf', tfidf),
     ('xgb', xgb)])

xgb_pipe_params = {
    "tfidf__ngram_range": [(1, 2)],
    "tfidf__max_features": [2000, 3000, 4000],
    "tfidf__max_df": [.5, .6, .7, .75],
    "tfidf__use_idf": [True],
    "tfidf__norm": ["l2"],
    "xgb__hidden_layer_sizes": [10, 25, 50],
    "xgb__n_estimators": [50, 100, 200],
    "xgb__max_depth": [5, 10, 20]
}


xgb_model = GridSearchCV(xgb_pipe, param_grid=xgb_pipe_params, cv=2, verbose=2, n_jobs=-1)

xgb_model.fit(X_train, y_train)
print(f'Train Score: {xgb_model.score(X_train, y_train)}')
print(f'Test Score: {xgb_model.score(X_test, y_test)}')
xgb_model.best_estimator_

Fitting 2 folds for each of 864 candidates, totalling 1728 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 1728 out of 1728 | elapsed: 17.5min finished


Train Score: 0.8043022768543092
Test Score: 0.6941767909509845


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.5, max_features=3000,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterward...
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0,
                               hidden_layer_sizes=50, learning_rate=0.1,
                               max_delta_step=0, ma

In [34]:
onevrest = OneVsRestClassifier(RandomForestClassifier())

onevrest_pipe = Pipeline(
    [('tfidf', tfidf),
     ('onevrest', onevrest)])


onevrest_pipe_params = {
    "tfidf__ngram_range": [(1, 2)],
    "tfidf__max_features": [4500, 5000, 5500],
    "tfidf__max_df": [.75, .8, .85],
    "tfidf__use_idf": [True],
    "tfidf__norm": ["l2"],
    "onevrest__estimator__n_estimators": [200, 300] 
}


onevrest_model = GridSearchCV(onevrest_pipe, param_grid=onevrest_pipe_params, cv=3, verbose=2, n_jobs=-1)

onevrest_model.fit(X_train, y_train)
print(f'Train Score: {onevrest_model.score(X_train, y_train)}')
print(f'Test Score: {onevrest_model.score(X_test, y_test)}')
onevrest_model.best_estimator_

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed: 24.7min
[Parallel(n_jobs=-1)]: Done 648 out of 648 | elapsed: 26.8min finished


Train Score: 0.9870093588490012
Test Score: 0.8722245496439045


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.8, max_features=5000,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterward...
                                                                      ccp_alpha=0.0,
                                                                      class_weight=None,
                                                                      criterion='g

In [None]:
Train Score: 0.9870093588490012
Test Score: 0.8722245496439045

## Scoring

In [None]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

In [None]:
print(predictions != y_test)

In [None]:
# model is GridSearchCV
# best_estimator is Pipeline
# named_steps is the steps in the pipeline
# count_vec is the FITTED 

features_data = model.named_steps.cv.transform(X_train).toarray()
features_columns = model.named_steps.cv.get_feature_names()
features_df = pd.DataFrame(data=features_data, columns=features_columns)

In [None]:
# CHECK ### does this function work with tfidf, or just count vec?
def plot_most_common(df, features_df, subreddit_list=subreddit_list, num_features=20, standardize=False, include_combined=False):
    '''
    Plots the most common features for each subreddit in the DataFrame

    Parameters:

    df: original DataFrame

    features_df: should be output from transformer on df

        Example:
        features_df = pd.DataFrame(
                                data={transformer}.transform(X).toarray(),
                                columns={transformer}.get_feature_names())

    num_features: number of most common features to plot for each subreddit

    standardize: put all of the plots on the same scale

    combined: include a plot of the most common features of all of the subreddits combined

    Returns:

    plots

    '''

    fig, ax = plt.subplots(ncols=1,
                           nrows=len(subreddit_list) + int(1 if include_combined else 0),
                           figsize=(15, num_features/1.3*len(subreddit_list)))

    for subplot_idx, sub in enumerate(subreddit_list):
        sub_features = features_df.loc[df['subreddit'] == sub]
        sub_top_words = sub_features.sum().sort_values(ascending=False).head(num_features)[::-1]
        sub_top_words.plot(kind='barh', ax=ax[subplot_idx])
        ax[subplot_idx].set_title(f'{num_features} Most Common Words for {sub.upper()}', fontsize=16)
        
        if standardize:
            max_occurence = features_df.sum().max()*1.02
            ax[subplot_idx].set_xlim(0, max_occurence)

    if include_combined:
        most_common = features_df.sum().sort_values(ascending=False).head(num_features)[::-1]
        most_common.plot(kind='barh', ax=ax[subplot_idx+1])
        ax[subplot_idx+1].set_title(f'{num_features} Most Common Words for ({", ".join(subreddit_list).upper()})')
        
        if standardize:
            ax[subplot_idx+1].set_xlim(0, max_occurence)
    
    plt.tight_layout(h_pad=7)

In [None]:
plot_most_common(df, features_df, num_features=15)

## Feature Importance Coefficients

In [None]:
###HELP### I don't think this is working right
###HELP### coefficients don't make sense for the entire dataset, would need to do one for each thing

In [None]:
###NOTE### coefficients only for lr and etc etc etc...
###NOTE### coefficients only for two subreddits

### FIX ### look at the coef_ portion of the new single model instead of the gridsearch


# [-1][1] for last step (estimator)(instantiation)
# coef_[0]because I don't know why
coefs = model.best_estimator_.steps[-1][1].coef_[0]




plt.figure(figsize=(16, 12))
plt.title('Feature Importance (Coefficients)', fontsize=20)
plt.ylabel('Features', fontsize=18)
plt.xlabel('(Abs) Coefficients', fontsize=18)

coef_df = pd.DataFrame(data=[coefs], columns=features_columns).T
coef_df['abs_coef'] = coef_df[0].abs()
coef_df.sort_values('abs_coef', ascending=False)[0].head(15).plot(kind='barh');

# coef_kill = coef_df.sort_values('abs_coef', ascending=False)[0].head(500).index

# coef_kill


# my_stops.extend(coef_kill)

In [None]:
from itertools import combinations

In [None]:
def make_pairs(subreddit_list):
    '''
    Makes combination pairs of subreddits from subreddit_list
    '''
    if len(subreddit_list) > 2:
            return list(combinations(subreddit_list, 2))
    return subreddit_list

In [None]:
pairs = make_pairs(subreddit_list)
pairs

In [None]:
def plot_most_common_pairs(df, features_df, pairs, num_features=20):
    '''
    Plots the most common features for each subreddit in the DataFrame
    
    Parameters:
    
    df: original DataFrame
    
    features_df: should be output from transformer on df
        
        Example:
        features_df = pd.DataFrame(
                                data={transformer}.transform(X).toarray(),
                                columns={transformer}.get_feature_names())
    
    num_features: number of most common features to plot for each subreddit
    
    Returns:
    
    plots
    
    '''
    fig, ax = plt.subplots(ncols=2, 
                           nrows=len(pairs), 
                           figsize=(16,num_features/3*len(pairs)))

    for i, pair in enumerate(pairs):

        # features for each pair
        feats_0 = features_df.loc[(df['subreddit'] == pair[0])]
        feats_1 = features_df.loc[(df['subreddit'] == pair[1])]
        # combined
        common_feats = feats_0.append(feats_1)
        # this is the most common between the two
        most_common = common_feats.sum().sort_values(ascending=False).head(num_features)[::-1]
        # plot
        feats_0[most_common.index].sum().plot.barh(ax=ax[i, 0], color='navy')
        feats_1[most_common.index].sum().plot.barh(ax=ax[i, 1], color='orange')
        ax[i, 0].set_title(f'Top {num_features} - {pair} \nSub: {pair[0].upper()}', fontsize=16, wrap=True)
        ax[i, 1].set_title(f'Top {num_features} - {pair} \nSub: {pair[1].upper()}', fontsize=16, wrap=True)
        max_occurence = common_feats.sum().max()*1.02
        ax[i, 0].set_xlim(0,max_occurence)
        ax[i, 1].set_xlim(0,max_occurence)
    plt.tight_layout()

In [None]:
plot_most_common_pairs(df, features_df, pairs)

In [None]:
most_common = features_df.sum().sort_values(ascending=False).head(20)[::-1]
groups = features_df.groupby(df['subreddit']).sum()[most_common.index].T.head(20)

In [None]:
fig, ax = plt.subplots(nrows=2, figsize=(18,20))

groups.plot.bar(ax=ax[0], width=.8, fontsize=15)
ax[0].set_title('20 Most Common Words', fontsize=20)
ax[0].set_ylabel('# of Occurences', fontsize=15)
ax[0].legend(fontsize=15, fancybox=True, framealpha=1, shadow=True, borderpad=1)

groups.plot(kind='bar', ax=ax[1], width=.35, fontsize=15, stacked=True)
ax[1].set_title('20 Most Common Words', fontsize=20)
ax[1].set_ylabel('# of Occurences', fontsize=15)
ax[1].legend(fontsize=15, fancybox=True, framealpha=1, shadow=True, borderpad=1)


plt.tight_layout(h_pad=10);

## Confusion Matrix

In [None]:
cm = confusion_matrix(y_test, y_pred)

In [None]:
df_cm = pd.DataFrame(cm, columns=subreddit_list, index=subreddit_list)
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
df_cm

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sns.set(font_scale=2)
sns.heatmap(df_cm, annot=True, fmt="d", cbar=False, cmap='Greens', ax=ax, )
fontdict={'fontsize': 16}
ax.set_yticklabels(labels=subreddit_list, rotation='horizontal', fontdict=fontdict)
ax.set_xticklabels(labels=subreddit_list, rotation=20, fontdict=fontdict)
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')

In [None]:
mcm = multilabel_confusion_matrix(y_test, y_pred)
mtn = mcm[:, 0, 0]
mtp = mcm[:, 1, 1]
mfn = mcm[:, 1, 0]
mfp = mcm[:, 0, 1]
print(mcm)

In [None]:
fig, ax = plt.subplots(ncols=2, nrows=len(subreddit_list),
                       figsize=(12, 6*len(subreddit_list)))

for i, cm in enumerate(mcm):
    df_cm = pd.DataFrame(cm)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    sns.heatmap(df_cm, annot=True, fmt="d", cbar=False,
                cmap='Purples', ax=ax[i, 0])

    ax[i, 0].set_yticklabels(labels=[1, 0], rotation='horizontal')
    ax[i, 0].set_xticklabels(labels=[1, 0])
    ax[i, 0].xaxis.tick_top()
    ax[i, 0].xaxis.set_label_position('top')
    ax[i, 0].set_title(subreddit_list[i].upper())

    tn, fp, fn, tp = cm.ravel()
    specif = tn / (tn + fp)

    sens = tp / (tp + fn)
    box_text = f'''Subreddit: {subreddit_list[i].upper()}\n\nSpecificity: {round(specif,4)}\n\nSensitivity: {round(sens,4)}'''
    ax[i, 1].text(0.5, 0.5, box_text, horizontalalignment='center',
                  verticalalignment='center', fontsize=24)
    ax[i, 1].set_axis_off()


plt.tight_layout()

In [None]:
print(classification_report(y_test, y_pred, digits=3, target_names=subreddit_list))

### ROC AUC Score

<h1>TODO:</h1>

1. Make it have the roc for each sub, have to get into the original df where subname equals indexes?
2. Plot confusion matrix
3. Make a notebook to test the confusion matrixes one by one with each individual model.

In [None]:
prob_df = pd.DataFrame(y_prob, columns=subreddit_list)
prob_df.round(6)

In [None]:
auc = roc_auc_score(y_test, y_pred, multi_class='ovr')
auc

### ROC AUC Curve

In [None]:
# not multiclass
fpr, tpr, thresholds = roc_curve(y_test, probs)

In [None]:
plt.figure(figsize=(10, 7))
plt.plot(fpr, tpr, label=f'ROC curve (area = {auc})', color='r', marker='D')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)', size=16)
plt.ylabel('True Positive Rate (Sensitivity)', size=16)
plt.title('ROC Curve', size=20)
plt.legend(fontsize=14);

### multiple roc curve

In [None]:
from sklearn.metrics import plot_roc_curve

In [None]:
disp = plot_roc_curve(lr, X_text, y_test)
plot_roc_curve(dt, X_test, y_test, ax=disp.ax_);
plot_roc_curve(dt, X_test, y_test, ax=disp.ax_);

# Histograms!

In [None]:
x1 = np.random.normal(0, 0.8, 1000) x2 = np.random.normal(-2, 1, 1000) x3 = np.random.normal(3, 2, 1000)
kwargs = dict(histtype='stepfilled', alpha=0.3, normed=True, bins=40)
plt.hist(x1, **kwargs) plt.hist(x2, **kwargs) plt.hist(x3, **kwargs);

In [None]:
# Create figure.
plt.figure(figsize = (10,7))

# Create histogram of observations.
plt.hist(probs, bins=25, color='b')

# Label axes.
plt.title('Distribution of P(Outcome = 1)', fontsize=22)
plt.ylabel('Frequency', fontsize=18)
plt.xlabel('Predicted Probability that Outcome = 1', fontsize=18);

In [None]:
# Create figure.
plt.figure(figsize = (10,7))

# Create two histograms of observations.
hst0 = plt.hist(probs[y_test == 0],
         bins=25,
         color='b',
         alpha = 0.6,
         label='Technology',)

hst1 = plt.hist(probs[y_test == 1],
         bins=25,
         color='orange',
         alpha = 0.6,
         label='Science')

# Add vertical line at P(Outcome = 1) = 0.5.
plt.vlines(x=0.5,
           ymin = 0,
           ymax = max(hst1[0].max(), hst0[0].max()), # Max of the two highest respective hist values
           color='r',
           linestyle = '--')

# Label axes.
plt.title('Distribution of P(Science)', fontsize=22)
plt.ylabel('Frequency', fontsize=18)
plt.xlabel('Predicted Probability', fontsize=18)

# Create legend.
plt.legend(fontsize=20);

# Thanks to Matt Brems for the colorful graphs! :)