### Imports

In [4]:
import pandas as pd
import numpy as np
import requests
import json
import time
import datetime
from time import sleep
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import wordcloud
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
import warnings
warnings.filterwarnings("ignore") # shhhhhhhh

---

# Make Model Function

# Dictionary of Models and Preprocessors

In [107]:
# Preprocessors
#
# Each entry describes one text vectorizer:
#   name        - label shown in reports
#   abbr        - pipeline step name; also the prefix of every key in pipe_params
#   processor   - the (unfitted) vectorizer instance
#   pipe_params - hyper-parameter grid for that pipeline step

count_vec = dict(
    name='CountVectorizer',
    abbr='count_vec',
    processor=CountVectorizer(),
    pipe_params={
        'count_vec__max_features': [5000],
        'count_vec__max_df': [.3, .4, .5],
        'count_vec__ngram_range': [(1, 2)],
        'count_vec__stop_words': ['english'],
        'count_vec__min_df': [4, 5, 6],
    },
)

tfidf = dict(
    name='TfidVectorizer',
    abbr='tfidf',
    processor=TfidfVectorizer(),
    pipe_params={
        'tfidf__strip_accents': [None],
        'tfidf__stop_words': ['english'],
        'tfidf__ngram_range': [(1, 1)],
        'tfidf__max_features': [5000],
    },
)

    
    
# Estimators
#
# Each entry describes one classifier:
#   name        - label shown in reports
#   abbr        - pipeline step name; also the prefix of every key in pipe_params
#   estimator   - the (unfitted) classifier instance
#   pipe_params - hyper-parameter grid for that pipeline step

lr = {
    'name': 'Logistic Regression',
    'abbr': 'lr',
    # The grid below searches both L1 and L2 penalties. The solver is pinned to
    # 'liblinear' because it supports both; scikit-learn's current default
    # solver ('lbfgs') rejects penalty='l1', which would make half of the
    # candidate fits fail.
    'estimator': LogisticRegression(solver='liblinear'),
    'pipe_params' : {
                'lr__penalty': ['l1','l2'],
                'lr__C': [.01,.1,1,3]}
}


# Random forest: grid over forest size and leaf/split minimums.
rf = dict(
    name='Random Forest',
    abbr='rf',
    estimator=RandomForestClassifier(),
    pipe_params={
        'rf__n_estimators': [100, 200, 300],
        'rf__max_depth': [200],
        'rf__min_samples_leaf': [1, 2, 3],
        'rf__min_samples_split': [.0005, .001, .01],
    },
)

# K nearest neighbours: grid over the neighbour count, manhattan distance only.
knn = dict(
    name='K Nearest Neighbors',
    abbr='knn',
    estimator=KNeighborsClassifier(),
    pipe_params={
        'knn__n_neighbors': [3, 5, 7],
        'knn__metric': ['manhattan'],
    },
)

# Multinomial naive Bayes: grid over the smoothing parameter alpha.
mnb = dict(
    name='Multinomial Bayes Classifier',
    abbr='mnb',
    estimator=MultinomialNB(),
    pipe_params={
        'mnb__fit_prior': [False],
        'mnb__alpha': [0, .1, 1],
    },
)

# Support vector classifier: C x kernel x degree gives 5 * 3 * 5 = 75 candidates.
# probability=True is what makes predict_proba available on the fitted model.
svc = dict(
    name='Support Vector Classifier',
    abbr='svc',
    estimator=SVC(),
    pipe_params={
        'svc__C': [1, 2, 3, 4, 5],
        'svc__kernel': ['linear', 'poly', 'rbf'],
        'svc__gamma': ['scale'],
        'svc__degree': [1, 2, 3, 4, 5],
        'svc__probability': [True],
    },
)



## Lists of Models and Preprocessors

In [None]:
# Every vectorizer config and every classifier config defined above, in the
# order they should be tried.
preprocessors = [
    count_vec,
    tfidf,
]
models = [
    lr,
    rf,
    knn,
    mnb,
    svc,
]

In [109]:
class Model:
    """Grid-search a vectorizer + classifier pipeline and report on the result.

    The train/test split is done by the caller as part of the normal workflow;
    this class only consumes the four resulting pieces. The reporting methods
    treat the target as binary and coded 0/1: ``subname1`` is the display name
    of class 1 and ``subname2`` the display name of class 0.

    Attributes set by ``make_model``:
        prep, est    -- the preprocessor / estimator config dicts last used
        grid         -- the fitted ``GridSearchCV``
        predictions  -- ``grid.predict(X_test)``
        probs        -- predicted probability of class 1 for each row of X_test

    ``display`` is the IPython/Jupyter builtin, so the methods that call it are
    meant to be run inside a notebook.
    """

    def __init__(self, X_train=None, X_test=None, y_train=None, y_test=None,
                 subname1='positive', subname2='negative', X=None, full_df=None):
        """Store the data splits and display names.

        Parameters
        ----------
        X_train, X_test, y_train, y_test :
            The already-split data. They default to ``None`` (instead of being
            bound to notebook globals when the class is defined) so the class
            cell runs on a fresh kernel; ``make_model`` checks they are set.
        subname1, subname2 : str
            Display names for class 1 and class 0 respectively.
        X :
            Full text column, only needed by ``make_cloud``.
        full_df :
            Full dataframe with a ``'subreddit'`` column coded 0/1, only needed
            by ``make_cloud(split='y')``.
        """
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        self.subname1 = subname1
        self.subname2 = subname2
        self.X = X
        self.full_df = full_df

        # Populated by make_model / compare_models.
        self.prep = None
        self.est = None
        self.grid = None
        self.predictions = None
        self.probs = None
        self.model_comp = None

    def _require_fit(self):
        """Raise a clear error when a reporting method is called before fitting."""
        if self.grid is None:
            raise RuntimeError('No fitted model yet: call make_model(prep, est) first.')

    def make_model(self, prep, est, cv=5):
        """Build, grid-search and fit one preprocessor + estimator pipeline.

        Parameters
        ----------
        prep : dict
            Preprocessor config with keys ``'abbr'``, ``'processor'`` and
            ``'pipe_params'``.
        est : dict
            Estimator config with keys ``'abbr'``, ``'estimator'`` and
            ``'pipe_params'``.
        cv : int, default 5
            Number of cross-validation folds used by the grid search.

        Raises
        ------
        ValueError
            If any of the four data splits has not been provided.
        """
        splits = (self.X_train, self.X_test, self.y_train, self.y_test)
        if any(part is None for part in splits):
            raise ValueError('X_train, X_test, y_train and y_test must all be set '
                             'before fitting.')

        self.prep = prep
        self.est = est

        # Step names must equal the 'abbr' values because every key in
        # 'pipe_params' is written as '<abbr>__<parameter>'.
        pipe = Pipeline([
            (prep['abbr'], prep['processor']),
            (est['abbr'], est['estimator']),
        ])
        param_grid = {**prep['pipe_params'], **est['pipe_params']}

        self.grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv, n_jobs=-1)
        self.grid.fit(self.X_train, self.y_train)

        self.predictions = self.grid.predict(self.X_test)
        # Column 1 of predict_proba is the probability of the second class,
        # i.e. class 1 when the target is coded 0/1.
        self.probs = self.grid.predict_proba(self.X_test)[:, 1]

    def scoring(self):
        """Print the best grid-search parameters and the train / test scores."""
        self._require_fit()

        print('Best Parameters')
        print(self.grid.best_params_)
        print()

        print('Best Training Score:')
        print(self.grid.score(self.X_train, self.y_train))
        print()

        print('Best Testing Score:')
        print(self.grid.score(self.X_test, self.y_test))

    def confused(self):
        """Display the test-set confusion matrix, then specificity and sensitivity."""
        self._require_fit()

        print('Confusion Matrix')
        # labels=[0, 1] keeps the matrix 2x2 even if one class is absent from
        # y_test, so the four-way unpacking below cannot fail.
        cm = confusion_matrix(self.y_test, self.predictions, labels=[0, 1])

        confusion_df = pd.DataFrame(
            cm,
            columns=[f'Pred False: {self.subname2.capitalize()}',
                     f'Pred True: {self.subname1.capitalize()}'],
            index=[f'Actual False: {self.subname2.capitalize()}',
                   f'Actual True: {self.subname1.capitalize()}'])

        display(confusion_df)

        print()
        print()
        tn, fp, fn, tp = cm.ravel()

        print('### Specificity')
        spec = tn / (tn + fp)
        print(spec)
        print()

        print('### Sensitivity/Recall')
        sens = tp / (tp + fn)
        print(sens)

    def hist_dist_p(self):
        """Plot a histogram of the predicted probabilities of class 1."""
        self._require_fit()

        plt.figure(figsize=(10, 7))
        plt.hist(self.probs, bins=25, color='b')

        plt.title(f'Distribution of P({self.subname1.capitalize()}: 1)', fontsize=22)
        plt.ylabel('Frequency', fontsize=18)
        plt.xlabel(f'Predicted Probability that Outcome = 1 ({self.subname1.capitalize()})',
                   fontsize=18);

    def outcome_hists(self):
        """Plot predicted probabilities of class 1, split by the true class.

        * Thanks to Matt Brems for the outline code *
        Takes no arguments; uses the probabilities and y_test stored on the
        instance by ``make_model``.
        """
        self._require_fit()

        probs = np.asarray(self.probs)
        actual = np.asarray(self.y_test)

        plt.figure(figsize=(10, 7))

        # One histogram per true class, overlaid.
        hst0 = plt.hist(probs[actual == 0],
                        bins=25,
                        color='b',
                        alpha=0.6,
                        label=f'{self.subname2.capitalize()}: 0')

        hst1 = plt.hist(probs[actual == 1],
                        bins=25,
                        color='orange',
                        alpha=0.6,
                        label=f'{self.subname1.capitalize()}: 1')

        # Vertical line at the 0.5 decision threshold, as tall as the tallest bar.
        plt.vlines(x=0.5,
                   ymin=0,
                   ymax=max(hst1[0].max(), hst0[0].max()),
                   color='r',
                   linestyle='--')

        plt.title(f'Distribution of P({self.subname1.capitalize()})', fontsize=22)
        plt.ylabel('Frequency', fontsize=18)
        plt.xlabel('Predicted Probability that Outcome = 1', fontsize=18)

        plt.legend(fontsize=20);

    def make_roc(self):
        """Plot the ROC curve for the fitted model, with the AUC in the legend."""
        self._require_fit()

        this_auc = roc_auc_score(self.y_test, self.probs)
        this_fpr, this_tpr, _ = roc_curve(self.y_test, self.probs)

        plt.figure(figsize=(12, 9))
        # The legend text is attached to the curve itself. Passing a string as
        # the first argument of plt.legend would be read as a sequence of
        # labels, one per character.
        plt.plot(this_fpr, this_tpr, color='r',
                 label=f'ROC curve (area = {this_auc:.3f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.xlabel('False Positive Rate (1 - Specificity)', size=16)
        plt.ylabel('True Positive Rate (Sensitivity)', size=16)
        plt.title(f'ROC Curve: {self.est["name"]}', size=20)
        plt.legend(loc='lower right', fontsize=14);

    def make_cloud(self, split=None):
        """Build word clouds from the text column ``self.X``.

        With ``split='y'`` one cloud per class is displayed, selecting rows by
        ``self.full_df['subreddit']`` (1, then 0) and nothing is returned.
        Otherwise a single cloud image for all of ``self.X`` is returned.

        Raises
        ------
        ValueError
            If ``X`` (or ``full_df`` when splitting) was not provided.
        """
        if self.X is None:
            raise ValueError('make_cloud needs the text column: pass X= to Model().')
        if split == 'y' and self.full_df is None:
            raise ValueError("make_cloud(split='y') needs full_df= passed to Model().")

        wc = wordcloud.WordCloud(max_words=50,
                                 width=700,
                                 height=400,
                                 background_color='white',
                                 )
        if split == 'y':
            print(f'Cloud for {self.subname1.capitalize()}')
            cloud1 = wc.generate(self.X.loc[self.full_df['subreddit'] == 1].str.cat())
            display(cloud1.to_image())
            print()
            print()
            print(f'Cloud for {self.subname2.capitalize()}')
            cloud0 = wc.generate(self.X.loc[self.full_df['subreddit'] == 0].str.cat())
            display(cloud0.to_image())
        else:
            cloud = wc.generate(self.X.str.cat())
            return cloud.to_image()

    def compare_models(self, models, preprocessors):
        """Fit every estimator with every preprocessor and tabulate the scores.

        The table is stored on ``self.model_comp`` and displayed. 'Variance' is
        the test score minus the train score. After the call, ``self.grid`` and
        the other fit attributes refer to the last combination fitted.
        """
        self.models = models
        self.preprocessors = preprocessors

        columns = ['Preprocessor',
                   'Estimator',
                   'Best Params',
                   'Best Train Score',
                   'Best Test Score',
                   'Variance']

        #### NOTE: Need to make an if statement to not use some preprocessors with some models.
        rows = []
        for model in models:
            for process in preprocessors:
                self.make_model(process, model)

                # Score once per split and reuse the values for the gap.
                train_score = self.grid.score(self.X_train, self.y_train)
                test_score = self.grid.score(self.X_test, self.y_test)

                rows.append([process['name'],
                             model['name'],
                             self.grid.best_params_,
                             train_score,
                             test_score,
                             test_score - train_score])

        # Build the frame in one go instead of growing it row by row.
        self.model_comp = pd.DataFrame(rows, columns=columns)

        display(self.model_comp)

    def most_words(self):
        """Plot the ten terms with the largest total weight across X_test.

        Uses the vectorizer fitted inside the best pipeline. The test text is
        only transformed, never re-fitted: re-fitting would learn a new
        vocabulary whose columns no longer line up with the feature names.
        """
        self._require_fit()

        vectorizer = self.grid.best_estimator_.named_steps[self.prep['abbr']]

        # get_feature_names was removed from newer scikit-learn releases.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feat_names = vectorizer.get_feature_names_out()
        else:
            feat_names = vectorizer.get_feature_names()

        term_matrix = vectorizer.transform(self.X_test)

        # Column sums straight from the sparse matrix; no dense copy needed.
        totals = pd.Series(np.asarray(term_matrix.sum(axis=0)).ravel(),
                           index=feat_names)

        plt.figure(figsize=(16, 12))
        totals.sort_values(ascending=False).head(10).plot(kind='barh');

    def do_everything(self, prep, est):
        '''
        Makes model and gives scores and distribution graphs
        (preprocessor, estimator)
        '''
        self.make_model(prep, est)

        self.scoring()

        self.confused()

        self.make_roc()

        self.hist_dist_p()

        self.outcome_hists()
                  
                  

## BELOW IS A FUNCTION TO DROP THE TOP 'X' FEATURES FROM THE DATASET                  
                  
#         top_feats = feature_df.sum().sort_values(ascending=False)[:10,].index

        # top_feats

        # my_stops.extend(top_feats)

                  
                  
                  
## THIS IS A FUNCTION THAT PLOTS THE ABSOLUTE COEFFICIENTS
                  ## VARIABLES SHOULD BE DEFINED IN SCORE, CONFUSION, OR OTHER DIST PLOT FUNCTIONS

                  
#         plt.figure(figsize=(16, 12))
#         coef_df = pd.DataFrame(catcher.coef_, columns=feat_names).T
#         coef_df['abs_coef'] = coef_df[0].abs()
#         coef_df.sort_values('abs_coef', ascending=False)[0].head(15).plot(kind='barh');

        # coef_kill = coef_df.sort_values('abs_coef', ascending=False)[0].head(500).index

        # coef_kill


        # my_stops.extend(coef_kill)

In [110]:
# Build a Model using only the constructor's default arguments.
test = Model()

In [111]:
# NOTE(review): `get_subreddits` is not defined on the `Model` class shown in
# this notebook's class cell; presumably it belongs to another revision of the
# class. Confirm before re-running this cell on a fresh kernel.
test.get_subreddits(1000, .5)

Getting posts from Depression.
We are on round 1, total of 0 posts.
We are on round 2, total of 25 posts.
We are on round 3, total of 50 posts.
We are on round 4, total of 75 posts.
We are on round 5, total of 100 posts.
We are on round 6, total of 125 posts.
We are on round 7, total of 150 posts.
We are on round 8, total of 175 posts.
We are on round 9, total of 200 posts.
We are on round 10, total of 225 posts.
We are on round 11, total of 250 posts.
We are on round 12, total of 275 posts.
We are on round 13, total of 300 posts.
We are on round 14, total of 325 posts.
We are on round 15, total of 350 posts.
We are on round 16, total of 375 posts.
We are on round 17, total of 400 posts.
We are on round 18, total of 425 posts.
We are on round 19, total of 450 posts.
We are on round 20, total of 475 posts.
We are on round 21, total of 500 posts.
We are on round 22, total of 525 posts.
We are on round 23, total of 550 posts.
We are on round 24, total of 575 posts.
We are on round 25, tot

Unnamed: 0,title,subreddit
0,NHS health care..told I don’t need further help..,depression
1,I just want to end it,depression
2,Sort of a random question but how bad is it if...,depression
3,I feel nothing,depression
4,im fucking stupid,depression



Sending Depression to depression.csv.


Getting posts from Anxiety.
We are on round 1, total of 0 posts.
We are on round 2, total of 25 posts.
We are on round 3, total of 50 posts.
We are on round 4, total of 75 posts.
We are on round 5, total of 100 posts.
We are on round 6, total of 125 posts.
We are on round 7, total of 150 posts.
We are on round 8, total of 175 posts.
We are on round 9, total of 200 posts.
We are on round 10, total of 225 posts.
We are on round 11, total of 250 posts.
We are on round 12, total of 275 posts.
We are on round 13, total of 300 posts.
We are on round 14, total of 325 posts.
We are on round 15, total of 350 posts.
We are on round 16, total of 375 posts.
We are on round 17, total of 400 posts.
We are on round 18, total of 425 posts.
We are on round 19, total of 450 posts.
We are on round 20, total of 475 posts.
We are on round 21, total of 500 posts.
We are on round 22, total of 525 posts.
We are on round 23, total of 550 posts.
We are on round 24, total

Unnamed: 0,title,subreddit
0,can anxiety create the symptoms of what seems ...,anxiety
1,Anxiety makes me want to move back home,anxiety
2,Phone anxiety,anxiety
3,i have therapy today for the first time in six...,anxiety
4,"The Fear of Failure is Holding Me Back, and I ...",anxiety




Sending Anxiety to anxiety.csv.


Sending combined dataframe, full_df, to depressionanxiety.csv.


Null Values:


title        0
subreddit    0
dtype: int64



## Baseline
Depression = 1, Anxiety = 0


0    0.500501
1    0.499499
Name: subreddit, dtype: float64



Doing train test split...
Done.


All operations completed. 
 Ready for modeling.


In [104]:
# NOTE(review): `all_models` is not defined on the `Model` class shown in this
# notebook's class cell. The class's `compare_models(models, preprocessors)`
# builds a table with the same columns as the output below, so that is
# presumably the intended call; confirm. This cell's execution count (104) is
# also lower than the cells above it, so it was run out of order.
test.all_models()

Modeling with estimator: CountVectorizer
Modeling with model: Logistic Regression
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Time Elapsed Fitting: 0:00:01.072916


Modeling with estimator: TfidVectorizer
Modeling with model: Logistic Regression
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Time Elapsed Fitting: 0:00:00.558386


Modeling with estimator: CountVectorizer
Modeling with model: Random Forest
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Time Elapsed Fitting: 0:00:00.750078


Modeling with estimator: TfidVectorizer
Modeling with model: Random Forest
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Time Elapsed Fitting: 0:00:00.837163


Modeling with estimator: CountVectorizer
Modeling with model: K Nearest Neighbors
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Time Elapsed Fitting: 0:00:00.545957


Modeling with estimator: TfidVectorizer
Modeling with model: K Nearest Neighbors
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Time Elapsed Fitting: 0:00:00.036035


Modeling with estimator: CountVectorizer
Modeling with model: Multinomial Bayes Classifier
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Time Elapsed Fitting: 0:00:00.031394


Modeling with estimator: TfidVectorizer
Modeling with model: Multinomial Bayes Classifier
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Time Elapsed Fitting: 0:00:00.024037


Modeling with estimator: CountVectorizer
Modeling with model: Support Vector Classifier
Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Time Elapsed Fitting: 0:00:01.188063


Modeling with estimator: TfidVectorizer
Modeling with model: Support Vector Classifier
Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.5s


Time Elapsed Fitting: 0:00:00.980558




[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:    1.0s finished


Unnamed: 0,Preprocessor,Estimator,Best Params,Best Train Score,Best Test Score,Variance
0,CountVectorizer,Logistic Regression,"{'count_vec__max_df': 0.3, 'count_vec__max_fea...",0.706667,0.64,-0.066667
1,TfidVectorizer,Logistic Regression,"{'lr__C': 1, 'lr__penalty': 'l1', 'tfidf__max_...",0.566667,0.44,-0.126667
2,CountVectorizer,Random Forest,"{'count_vec__max_df': 0.3, 'count_vec__max_fea...",0.76,0.44,-0.32
3,TfidVectorizer,Random Forest,"{'rf__max_depth': 200, 'rf__min_samples_leaf':...",0.993333,0.76,-0.233333
4,CountVectorizer,K Nearest Neighbors,"{'count_vec__max_df': 0.3, 'count_vec__max_fea...",0.706667,0.4,-0.306667
5,TfidVectorizer,K Nearest Neighbors,"{'knn__metric': 'manhattan', 'knn__n_neighbors...",0.646667,0.58,-0.066667
6,CountVectorizer,Multinomial Bayes Classifier,"{'count_vec__max_df': 0.3, 'count_vec__max_fea...",0.7,0.64,-0.06
7,TfidVectorizer,Multinomial Bayes Classifier,"{'mnb__alpha': 0, 'mnb__fit_prior': False, 'tf...",0.993333,0.84,-0.153333
8,CountVectorizer,Support Vector Classifier,"{'count_vec__max_df': 0.3, 'count_vec__max_fea...",0.72,0.68,-0.04
9,TfidVectorizer,Support Vector Classifier,"{'svc__C': 1, 'svc__degree': 1, 'svc__gamma': ...",0.986667,0.82,-0.166667


## Instantiate

## Get Subreddits

## Make Model

## Score Model

## Confusion Matrix

## ROC AUC

## Probability Distributions

## Distributions Across Classes (Outcomes)

## WORD CLOUDS!!