# Data Science Workflow

This is a notebook for testing all of the functions to go through an entire data science workflow of running NLP on chosen subreddits.

**Future**
- DF functions should be removed from workflow and imported using a class.
- Multiple example DS outcomes would be nice.

---

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from PIL import Image
import wordcloud
import time
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from glob import glob
from pprint import pprint

In [2]:
from helpers import databases
from helpers import dataloader
from helpers import grid_models
from helpers.reddit_functions import Reddit

In [3]:
# subreddit_list = ['css', 'html', 'javascript', 'php', 'perl', 'java', 'datascience', 'machinelearning', 'etl', 'python', 'dataengineering']

## Data Engineers Above Here

## Data Scientists Here ---

In [4]:
subreddit_list = ['datascience','machinelearning','dataengineering','python','aws']

In [5]:
df = dataloader.data_selector(subreddit_list, 'sqlite')

Connection to SQLite DB successful


In [6]:
# get rid of list items with no data retrieved
subreddit_list = [sub for sub in subreddit_list if sub in df.subreddit.unique()]
subreddit_list

['datascience', 'machinelearning', 'dataengineering', 'python', 'aws']

In [7]:
df = dataloader.subreddit_encoder(df)

Subreddits and codes added: {'aws': 0, 'datascience': 1, 'machinelearning': 2, 'python': 3, 'dataengineering': 4}


In [8]:
df.sample(10)

Unnamed: 0,title,subreddit,date,sub_code
217,I want to learn how to use AWS for Mobile deve...,aws,2020-03-29,0
2306,Scalable Multi-Task Imitation Learning with Au...,machinelearning,2020-03-29,2
382,Using SSM for patching (this is the most confu...,aws,2020-03-29,0
1925,"[R] Unboxing the ""Black Box"": Learning Interpr...",machinelearning,2020-03-29,2
3898,Career Advice to switch to Data engineering.,dataengineering,2020-03-29,4
3660,I made a bot for a sudoku game,python,2020-03-29,3
3124,How would you make it so each square gets bigg...,python,2020-03-29,3
136,URL redirects question /folders,aws,2020-03-29,0
168,git repository only accessible within workspaces?,aws,2020-03-29,0
3028,Python VDE for Chromebook,python,2020-03-29,3


## Make sure doing the train test split right with stratify

https://towardsdatascience.com/3-things-you-need-to-know-before-you-train-test-split-869dfabb7e50

![stratified k fold](https://embed.filekitcdn.com/e/wW49WBPsD4QcY2GCZ4gYa8/mU1tQ3TsTcyz1fkCkiuSSZ?w=800&fit=max)

In [9]:
X = df['title']
y = df['sub_code']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [11]:
useless_words = set(['using', 'help', 'new', 'data', 'science', 'machine', 'learning', 'use', 'need'])

custom_stop_words = ENGLISH_STOP_WORDS.union(subreddit_list, useless_words)

In [12]:
redfun = Reddit()

In [14]:
preprocessors = grid_models.preprocessors
estimators = grid_models.estimators

In [15]:
pprint(preprocessors)

{'count_vec': {'abbr': 'count_vec',
               'name': 'CountVectorizer',
               'pipe_params': {'count_vec__max_df': [0.3, 0.4, 0.5],
                               'count_vec__max_features': [5000],
                               'count_vec__min_df': [4, 5, 6],
                               'count_vec__ngram_range': [(1, 2)],
                               'count_vec__stop_words': ['english']},
               'processor': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)},
 'tfidf': {'abbr': 'tfidf',
           'name': 'TfidVectorizer',
           'pipe_params': {'tfidf__max_features': [5000],
                   

In [16]:
pprint(estimators)

{'knearest': {'abbr': 'knearest',
              'estimator': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform'),
              'name': 'K Nearest Neighbors',
              'pipe_params': {'knearest__metric': ['manhattan'],
                              'knearest__n_neighbors': [3, 5, 7]}},
 'logreg': {'abbr': 'logreg',
            'estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False),
            'name': 'Logistic Regression',
            'pipe_params': {'logreg__C': [0.01, 0.1, 1, 3],
                            'logreg__penalty': ['l1', 'l2']}},
 'multinomialnb': {'abbr'

### Compare Subset of Models

In [19]:
esty = {'logreg': estimators['logreg']}

compare_df = redfun.compare_models(X_train, X_test, y_train, y_test, estimators=esty, cv=3, verbose=0)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Compare All Models

In [None]:
compare_df = compare_models(cv=2, verbose=0)

In [20]:
compare_df.sort_values(by='Best Test Score', ascending=False)

Unnamed: 0,Preprocessor,Estimator,Best Params,Best Train Score,Best Test Score,Variance
1,tfidf,logreg,"{'logreg__C': 1, 'logreg__penalty': 'l2', 'tfi...",0.906912,0.741639,18.223725
0,count_vec,logreg,"{'count_vec__max_df': 0.3, 'count_vec__max_fea...",0.892698,0.70903,20.574461


In [21]:
best_model = compare_df.sort_values(by='Best Test Score', ascending=False).iloc[0, :].to_dict()
best_model

{'Preprocessor': 'tfidf',
 'Estimator': 'logreg',
 'Best Params': {'logreg__C': 1,
  'logreg__penalty': 'l2',
  'tfidf__max_features': 5000,
  'tfidf__ngram_range': (1, 1),
  'tfidf__stop_words': 'english',
  'tfidf__strip_accents': None},
 'Best Train Score': 0.9069119286510591,
 'Best Test Score': 0.7416387959866221,
 'Variance': 18.223724646588813}

## Make a new model with the best params from the search

In [22]:
best_pipe = Pipeline([
    (best_model['Preprocessor'], preprocessors[best_model['Preprocessor']]['processor']),
    (best_model['Estimator'], estimators[best_model['Estimator']]['estimator'])
])
best_pipe.set_params(**best_model['Best Params'])
# fit on entire dataset
best_pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=5000,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('logreg',
                 LogisticRegression(C=1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_s

In [23]:
best_pipe_score = best_pipe.score(X, y)
best_pipe_score

0.8986204013377926

### Model Improvement

In [25]:
# baseline
y.value_counts(normalize=True)

0    0.208403
2    0.207985
4    0.204222
3    0.191054
1    0.188336
Name: sub_code, dtype: float64

In [26]:
# how much improvement over baseline
best_pipe_score - y.value_counts(normalize=True)[0]

0.6902173913043478

In [29]:
# how much difference from the best worst model to the best best model
best_pipe_score - min(compare_df['Best Test Score'])

0.1895903010033445

In [30]:
# how much improvement from retraining on entire dataset
best_pipe_score - best_model['Best Test Score']

0.15698160535117056

In [None]:
pipe = Pipeline([('count_vec', CountVectorizer()),(('lr', LogisticRegression()))])

In [None]:
pipe_params = {
                'count_vec__max_features': [4000],
                'count_vec__max_df': [.3],
                'count_vec__ngram_range': [(1,2)],
                'count_vec__stop_words': [custom_stop_words],
                'count_vec__min_df': [3],
                'lr__penalty': ['l2'],
                'lr__C': [5]
                }

In [None]:
model = GridSearchCV(pipe, param_grid=pipe_params, cv=5, verbose=1, n_jobs=-1)

In [None]:
model.fit(X_train, y_train)

In [None]:
def scores_info(model):    
    print('Best Params:')
    best_params = model.best_params_
    best_params.pop('count_vec__stop_words', 'Aldready deleted')
    pprint(best_params)
    print()
    print(f'Best Modeling Score: {model.best_score_}')
    print()
    print(f'Train Score: {model.score(X_train, y_train)}')
    print()
    print(f'Test Score: {model.score(X_test, y_test)}')

In [None]:
scores_info(model)

In [None]:
### FIX ###
# Hardcoded count_vec
# Test for tfidf, may have to hardcode into ds_workflows

In [None]:
# model is GridSearchCV
# best_estimator is Pipeline
# named_steps is the steps in the pipeline
# count_vec is the FITTED 

features_data = model.best_estimator_.named_steps.count_vec.transform(X).toarray()
features_columns = model.best_estimator_.named_steps.count_vec.get_feature_names()
features_df = pd.DataFrame(data=features_data, columns=features_columns)

In [None]:
# CHECK ### does this function work with tfidf, or just count vec?
def plot_most_common(df, features_df, subreddit_list=subreddit_list, num_features=20, standardize=False, include_combined=False):
    '''
    Plots the most common features for each subreddit in the DataFrame

    Parameters:

    df: original DataFrame

    features_df: should be output from transformer on df

        Example:
        features_df = pd.DataFrame(
                                data={transformer}.transform(X).toarray(),
                                columns={transformer}.get_feature_names())

    num_features: number of most common features to plot for each subreddit

    standardize: put all of the plots on the same scale

    combined: include a plot of the most common features of all of the subreddits combined

    Returns:

    plots

    '''

    fig, ax = plt.subplots(ncols=1,
                           nrows=len(subreddit_list) + int(1 if include_combined else 0),
                           figsize=(15, num_features/1.3*len(subreddit_list)))

    for subplot_idx, sub in enumerate(subreddit_list):
        sub_features = features_df.loc[df['subreddit'] == sub]
        sub_top_words = sub_features.sum().sort_values(ascending=False).head(num_features)[::-1]
        sub_top_words.plot(kind='barh', ax=ax[subplot_idx])
        ax[subplot_idx].set_title(f'{num_features} Most Common Words for {sub.upper()}', fontsize=16)
        
        if standardize:
            max_occurence = features_df.sum().max()*1.02
            ax[subplot_idx].set_xlim(0, max_occurence)

    if include_combined:
        most_common = features_df.sum().sort_values(ascending=False).head(num_features)[::-1]
        most_common.plot(kind='barh', ax=ax[subplot_idx+1])
        ax[subplot_idx+1].set_title(f'{num_features} Most Common Words for ({", ".join(subreddit_list).upper()})')
        
        if standardize:
            ax[subplot_idx+1].set_xlim(0, max_occurence)
    
    plt.tight_layout(h_pad=7)

In [None]:
plot_most_common(df, features_df, num_features=15, include_combined=True, standardize=True)

## Feature Importance Coefficients

In [None]:
###HELP### I don't think this is working right
###HELP### coefficients don't make sense for the entire dataset, would need to do one for each thing

In [None]:
###NOTE### coefficients only for lr and etc etc etc...
###NOTE### coefficients only for two subreddits

### FIX ### look at the coef_ portion of the new single model instead of the gridsearch


# [-1][1] for last step (estimator)(instantiation)
# coef_[0]because I don't know why
coefs = model.best_estimator_.steps[-1][1].coef_[0]




plt.figure(figsize=(16, 12))
plt.title('Feature Importance (Coefficients)', fontsize=20)
plt.ylabel('Features', fontsize=18)
plt.xlabel('(Abs) Coefficients', fontsize=18)

coef_df = pd.DataFrame(data=[coefs], columns=features_columns).T
coef_df['abs_coef'] = coef_df[0].abs()
coef_df.sort_values('abs_coef', ascending=False)[0].head(15).plot(kind='barh');

# coef_kill = coef_df.sort_values('abs_coef', ascending=False)[0].head(500).index

# coef_kill


# my_stops.extend(coef_kill)

In [None]:
from itertools import combinations

In [None]:
def make_pairs(subreddit_list):
    '''
    Makes combination pairs of subreddits from subreddit_list
    '''
    if len(subreddit_list) > 2:
            return list(combinations(subreddit_list, 2))
    return subreddit_list

In [None]:
pairs = make_pairs(subreddit_list)
pairs

In [None]:
def plot_most_common_pairs(df, features_df, pairs, num_features=20):
    '''
    Plots the most common features for each subreddit in the DataFrame
    
    Parameters:
    
    df: original DataFrame
    
    features_df: should be output from transformer on df
        
        Example:
        features_df = pd.DataFrame(
                                data={transformer}.transform(X).toarray(),
                                columns={transformer}.get_feature_names())
    
    num_features: number of most common features to plot for each subreddit
    
    Returns:
    
    plots
    
    '''
    fig, ax = plt.subplots(ncols=2, 
                           nrows=len(pairs), 
                           figsize=(16,num_features/3*len(pairs)))

    for i, pair in enumerate(pairs):

        # features for each pair
        feats_0 = features_df.loc[(df['subreddit'] == pair[0])]
        feats_1 = features_df.loc[(df['subreddit'] == pair[1])]
        # combined
        common_feats = feats_0.append(feats_1)
        # this is the most common between the two
        most_common = common_feats.sum().sort_values(ascending=False).head(num_features)[::-1]
        # plot
        feats_0[most_common.index].sum().plot.barh(ax=ax[i, 0], color='navy')
        feats_1[most_common.index].sum().plot.barh(ax=ax[i, 1], color='orange')
        ax[i, 0].set_title(f'Top {num_features} - {pair} \nSub: {pair[0].upper()}', fontsize=16, wrap=True)
        ax[i, 1].set_title(f'Top {num_features} - {pair} \nSub: {pair[1].upper()}', fontsize=16, wrap=True)
        max_occurence = common_feats.sum().max()*1.02
        ax[i, 0].set_xlim(0,max_occurence)
        ax[i, 1].set_xlim(0,max_occurence)
    plt.tight_layout()

In [None]:
plot_most_common_pairs(df, features_df, pairs)

In [None]:
most_common = features_df.sum().sort_values(ascending=False).head(20)[::-1]
groups = features_df.groupby(df['subreddit']).sum()[most_common.index].T.head(20)

In [None]:
fig, ax = plt.subplots(nrows=2, figsize=(18,20))

groups.plot.bar(ax=ax[0], width=.8, fontsize=15)
ax[0].set_title('20 Most Common Words', fontsize=20)
ax[0].set_ylabel('# of Occurences', fontsize=15)
ax[0].legend(fontsize=15, fancybox=True, framealpha=1, shadow=True, borderpad=1)

groups.plot(kind='bar', ax=ax[1], width=.35, fontsize=15, stacked=True)
ax[1].set_title('20 Most Common Words', fontsize=20)
ax[1].set_ylabel('# of Occurences', fontsize=15)
ax[1].legend(fontsize=15, fancybox=True, framealpha=1, shadow=True, borderpad=1)


plt.tight_layout(h_pad=10);

In [None]:
# NOTE ### this does not use the X value inputted when using split...


def make_cloud(X, height=300, width=800, max_words=100, split=None, labels=None, stopwords=None, colormap='viridis', background_color='black'):
    '''
    Inputs:
    X: text input
    height: height of each wordcloud
    width: width of each wordcloud
    max_words: max words for each wordcloud
    split: if True, wordcloud for each subreddit
    labels: must provide list of labels if split=True, to generate a wordcloud for each label
    stopwords: usually these are the same stopwords used by the tranformer (CountVectorizer or Tfidf)
    colormap: any choice from matplotlib gallery.  Find them with plt.cm.datad
        'random': picks a random colormap for each cloud.
    '''

    colormaps = [m for m in plt.cm.datad if not m.endswith("_r")]
    wc = wordcloud.WordCloud(max_words=max_words,
                             width=width,
                             height=height,
                             background_color=background_color,
                             colormap=np.random.choice(
                                 colormaps) if colormap == 'random' else colormap,
                             stopwords=stopwords)
    if split:
        for label in labels:
            cloud = wc.generate(
                df[df['subreddit'] == label]['title'].str.cat())
            plt.figure(figsize=(width/100, height*len(labels)/100), dpi=100)
            plt.title(label.upper(), fontdict={'fontsize': 15})
            plt.axis("off")
            plt.imshow(cloud.to_image(), interpolation='bilinear')

    else:
        cloud = wc.generate(X.str.cat())
        return cloud.to_image()

In [None]:
make_cloud(X, stopwords=custom_stop_words, colormap='rainbow')

In [None]:
make_cloud(X, split=True, labels=subreddit_list, stopwords=custom_stop_words, colormap='random', background_color='black')

In [None]:
img = Image.open('../images/reddit03.png')
gray = np.array(img.convert('L'))
mask = np.where(gray < 200, 255, 0)

wc = wordcloud.WordCloud(background_color='white', 
                         max_words=500, 
                         mask=mask, 
                         colormap='Reds',
                         contour_color='orangered',
                         contour_width=1,
                         stopwords=custom_stop_words)
wc.generate(X.str.cat())
plt.figure(figsize=(12,12))
plt.imshow(wc, interpolation='bilinear')
plt.axis(False);

In [None]:
img2 = Image.open('../images/reddit02.jpg')
gray2 = np.array(img2.convert('L'))
mask2 = np.where(gray2 < 200, 255, 0)

wc2 = wordcloud.WordCloud(background_color='white', 
                         max_words=1000, 
                         mask=mask2, 
                         colormap='Reds',
                         contour_color='orangered',
                         contour_width=1,
                         stopwords=custom_stop_words)
wc2.generate(X.str.cat())
plt.figure(figsize=(8,12))
plt.imshow(wc2, interpolation='bilinear')
plt.axis(False)

plt.show()

In [None]:
mask = np.array(Image.open("../images/reddit06.jpg"))
colorcloud = wordcloud.WordCloud(stopwords=custom_stop_words,
                                 background_color="white",
                                 mode="RGBA",
                                 max_words=1000,
                                 mask=mask)
colorcloud.generate(X.str.cat())

image_colors = wordcloud.ImageColorGenerator(mask)
plt.figure(figsize=[7, 7])

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

ax1.imshow(mask)
ax1.axis(False)

ax2.imshow(colorcloud.recolor(color_func=image_colors),
           interpolation="bilinear")
ax2.axis(False)

plt.show();

## Confusion Matrix

In [None]:
y_pred = model.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)

In [None]:
df_cm = pd.DataFrame(cm, columns=subreddit_list, index=subreddit_list)
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
df_cm

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sns.set(font_scale=2)
sns.heatmap(df_cm, annot=True, fmt="d", cbar=False, cmap='Greens', ax=ax, )
fontdict={'fontsize': 16}
ax.set_yticklabels(labels=subreddit_list, rotation='horizontal', fontdict=fontdict)
ax.set_xticklabels(labels=subreddit_list, rotation=20, fontdict=fontdict)
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')

In [None]:
mcm = multilabel_confusion_matrix(y_test, y_pred)
mtn = mcm[:, 0, 0]
mtp = mcm[:, 1, 1]
mfn = mcm[:, 1, 0]
mfp = mcm[:, 0, 1]
print(mcm)

In [None]:
fig, ax = plt.subplots(ncols=2, nrows=len(subreddit_list),
                       figsize=(12, 6*len(subreddit_list)))

for i, cm in enumerate(mcm):
    df_cm = pd.DataFrame(cm)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    sns.heatmap(df_cm, annot=True, fmt="d", cbar=False,
                cmap='Purples', ax=ax[i, 0])

    ax[i, 0].set_yticklabels(labels=[1, 0], rotation='horizontal')
    ax[i, 0].set_xticklabels(labels=[1, 0])
    ax[i, 0].xaxis.tick_top()
    ax[i, 0].xaxis.set_label_position('top')
    ax[i, 0].set_title(subreddit_list[i].upper())

    tn, fp, fn, tp = cm.ravel()
    specif = tn / (tn + fp)

    sens = tp / (tp + fn)
    box_text = f'''Subreddit: {subreddit_list[i].upper()}\n\nSpecificity: {round(specif,4)}\n\nSensitivity: {round(sens,4)}'''
    ax[i, 1].text(0.5, 0.5, box_text, horizontalalignment='center',
                  verticalalignment='center', fontsize=24)
    ax[i, 1].set_axis_off()


plt.tight_layout()

In [None]:
print(classification_report(y_test, y_pred, digits=3, target_names=subreddit_list))

### ROC AUC Score

<h1>TODO:</h1>

1. Make it have the roc for each sub, have to get into the original df where subname equals indexes?
2. Plot confusion matrix
3. Make a notebook to test the confusion matrixes one by one with each individual model.

In [None]:
y_test.index

In [None]:
auc = roc_auc_score(y_test, y_pred)
auc

### ROC AUC Curve

In [None]:
probs = model.predict_proba(X_test)[:,1]

In [None]:
# not multiclass
fpr, tpr, thresholds = roc_curve(y_test, probs)

In [None]:
plt.figure(figsize=(10, 7))
plt.plot(fpr, tpr, label=f'ROC curve (area = {auc})', color='r', marker='D')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)', size=16)
plt.ylabel('True Positive Rate (Sensitivity)', size=16)
plt.title('ROC Curve', size=20)
plt.legend(fontsize=14);

# Histograms!

In [None]:
x1 = np.random.normal(0, 0.8, 1000) x2 = np.random.normal(-2, 1, 1000) x3 = np.random.normal(3, 2, 1000)
kwargs = dict(histtype='stepfilled', alpha=0.3, normed=True, bins=40)
plt.hist(x1, **kwargs) plt.hist(x2, **kwargs) plt.hist(x3, **kwargs);

In [None]:
# Create figure.
plt.figure(figsize = (10,7))

# Create histogram of observations.
plt.hist(probs, bins=25, color='b')

# Label axes.
plt.title('Distribution of P(Outcome = 1)', fontsize=22)
plt.ylabel('Frequency', fontsize=18)
plt.xlabel('Predicted Probability that Outcome = 1', fontsize=18);

In [None]:
# Create figure.
plt.figure(figsize = (10,7))

# Create two histograms of observations.
hst0 = plt.hist(probs[y_test == 0],
         bins=25,
         color='b',
         alpha = 0.6,
         label='Technology',)

hst1 = plt.hist(probs[y_test == 1],
         bins=25,
         color='orange',
         alpha = 0.6,
         label='Science')

# Add vertical line at P(Outcome = 1) = 0.5.
plt.vlines(x=0.5,
           ymin = 0,
           ymax = max(hst1[0].max(), hst0[0].max()), # Max of the two highest respective hist values
           color='r',
           linestyle = '--')

# Label axes.
plt.title('Distribution of P(Science)', fontsize=22)
plt.ylabel('Frequency', fontsize=18)
plt.xlabel('Predicted Probability', fontsize=18)

# Create legend.
plt.legend(fontsize=20);

# Thanks to Matt Brems for the colorful graphs! :)