In [1]:
import pandas as pd
import regex as re
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, log_loss
from sklearn.metrics import accuracy_score 
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB 

import time

from IPython.display import display, HTML
from sklearn.naive_bayes import GaussianNB
from sklearn.base import TransformerMixin
from nltk import WordNetLemmatizer
from nltk.stem import PorterStemmer

sns.set(style='darkgrid')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore",category=DeprecationWarning)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier


In [2]:
## List of all the parameters that will try on the data - from this week
all_params = {1:{ "Pipeline" : [ ('vectorizer', CountVectorizer()),
                    ('classifier', DecisionTreeClassifier(random_state=42))],
     "hyper_params": { 'vectorizer__max_features': [1000, 2000, 5000],
                       'vectorizer__ngram_range': [(1,3)],
                       'vectorizer__stop_words': ['english'],
                       'vectorizer__min_df':[1],
                       'vectorizer__max_df':[1.0],
                       'classifier__max_depth': [10, 20, 30],
                       'classifier__min_samples_split': [2, 5, 10],
                       'classifier__min_samples_leaf':[3,5]
                    },
 },
 2:{ "Pipeline" : [ ('vectorizer', TfidfVectorizer()),
                    ('classifier', DecisionTreeClassifier(random_state=42))],
     "hyper_params": { 'vectorizer__max_features': [1000, 2000, 5000],
                       'vectorizer__ngram_range': [(1,3)],
                       'vectorizer__stop_words': ['english'],
                       'vectorizer__min_df':[1],
                       'vectorizer__max_df':[1.0],
                       'classifier__max_depth': [10, 20, 30],
                       'classifier__min_samples_split': [2, 5, 10],
                       'classifier__min_samples_leaf':[3,5]
                    },
 },
 3:{ "Pipeline" : [ ('vectorizer', CountVectorizer()),
                    ('classifier', BaggingClassifier(random_state=42))],
     "hyper_params": { 'vectorizer__max_features': [1000, 2000, 5000],
                       'vectorizer__ngram_range': [(1,3)],
                       'vectorizer__stop_words': ['english'],
                       'vectorizer__min_df':[1],
                       'vectorizer__max_df':[1.0],
                       'classifier__n_estimators': [10, 50, 100]
                    },
 },
 4:{ "Pipeline" : [ ('vectorizer', TfidfVectorizer()),
                    ('classifier', BaggingClassifier(random_state=42))],
     "hyper_params": { 'vectorizer__max_features': [1000, 2000, 5000],
                       'vectorizer__ngram_range': [(1,3)],
                       'vectorizer__stop_words': ['english'],
                       'vectorizer__min_df':[1],
                       'vectorizer__max_df':[1.0],
                       'classifier__n_estimators': [10, 50, 100]
                    },
 },
 5:{ "Pipeline" : [ ('vectorizer', CountVectorizer()),
                    ('classifier', RandomForestClassifier(random_state=42))],
     "hyper_params": { 'vectorizer__max_features': [1000, 2000, 5000],
                       'vectorizer__ngram_range': [(1,3)],
                       'vectorizer__stop_words': ['english'],
                       'vectorizer__min_df':[1],
                       'vectorizer__max_df':[1.0]
                    },
 },
 6:{ "Pipeline" : [ ('vectorizer', TfidfVectorizer()),
                    ('classifier', RandomForestClassifier(random_state=42))],
     "hyper_params": { 'vectorizer__max_features': [1000, 2000, 5000],
                       'vectorizer__ngram_range': [(1,3)],
                       'vectorizer__stop_words': ['english'],
                       'vectorizer__min_df':[1],
                       'vectorizer__max_df':[1.0]
                    },
 },
7:{ "Pipeline" : [ ('vectorizer', CountVectorizer()),
                    ('classifier', ExtraTreesClassifier(random_state=42))],
     "hyper_params": { 'vectorizer__max_features': [1000, 2000, 5000],
                       'vectorizer__ngram_range': [(1,3)],
                       'vectorizer__stop_words': ['english'],
                       'vectorizer__min_df':[1],
                       'vectorizer__max_df':[1.0],
                       'classifier__bootstrap':[True, False]
                    },
 },
 8:{ "Pipeline" : [ ('vectorizer', TfidfVectorizer()),
                    ('classifier', ExtraTreesClassifier(random_state=42))],
     "hyper_params": { 'vectorizer__max_features': [1000, 2000, 5000],
                       'vectorizer__ngram_range': [(1,3)],
                       'vectorizer__stop_words': ['english'],
                       'vectorizer__min_df':[1],
                       'vectorizer__max_df':[1.0],
                       'classifier__bootstrap':[True, False]
                     }
 }
}

# all_params

In [35]:
def get_bold(x):
    open_tag_bold = "\033[1m"
    close_tag_bold = "\033[0m"
    return open_tag_bold + str(x) + close_tag_bold

# display scores 
def display_scores(scores, no_of_posts):
    
    print("\nMetrics for {} posts".format(get_bold(no_of_posts)))
    df = pd.DataFrame(scores, index=list(range(1,4)))
    df = df.sort_values(by="Accuracy", ascending=True)
#     display(df)
    display(HTML(df.to_html()))
#     sns.set(style="darkgrid")
    df["Feature Set"] = df.index
    nrows = 1 # Makes sure you have enough rows
    fig, ax = plt.subplots(nrows=nrows, ncols=8, figsize=(18,3)) # You'll want to specify your figsize
    ax = ax.ravel() # Ravel turns a matrix into a vector, which is easier to iterate
    col_list = ["Accuracy", "Mis Calculations","ROC AUC","Sensitivity","Specificity","Precision", "Test Scores","Train Scores"]
    col_colr = ["blue", "slategray", "green", "sienna", "teal", "firebrick", "lightseagreen", "mediumorchid"]
    for i, column in enumerate(col_list): # Gives us an index value to get into all our lists
        sns.scatterplot(df[column], df["Feature Set"],marker="o", size=df[column], legend=False, facecolor=col_colr[i], edgecolor=col_colr[i], ax=ax[i])
        ax[i].set_ylabel("");

#highlight minimum value form the data frame
def highlight_min(s):    
    is_min = s == s.min()
    return ['background-color: lightgreen' if v else '' for v in is_min]

#highlight max value from the data frame
def highlight_max(s):    
    is_max = s == s.max()
    return ['background-color: lightgreen' if v else '' for v in is_max]


#combine full row into a single row
def get_full_row(data, features):
    data["full_row"] = ""
    for col in features:
        data["full_row"] = data["full_row"] + "," + data[col].astype(str)
        

#return best estimator results
def get_score_data(results, X_train, X_test, y_train,y_test):
    output = []
    sub_scores = {}
    
    model = results.best_estimator_
    
    #confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, model.predict(X_test)).ravel()
    
    ##calulating accuracy
#     accuracy = (tp + tn) / (tp + fp + tn + fn)
    accuracy = round(accuracy_score(y_test, model.predict(X_test)),2)
    
    #calculating Misclassification Rate
    mis_calcuations = 1 - accuracy
    
    #calculating sensitivity
    sensitivity = tp / (tp + fn)

    #calculating specificity
    specificity = tn / (tn + fp)

    #calculating precision
    precision = tp / (tp + fp)
    
    #to predict roc_auc_score
    pred_proba = [i[1] for i in model.predict_proba(X_test)]

    pred_df = pd.DataFrame({'true_values': y_test,
                        'pred_probs':pred_proba})
    
    #For returning results from the best estimator
    #1. best score
    output.append(round(results.best_score_,2))
    
    #2.best params
    output.append(results.best_params_)
    
    #No of 0's and 1's in test
    sub_scores.update({"No of evolution posts": y_test[y_test == 0].count()})
    sub_scores.update({"No of Creation posts": y_test[y_test == 1].count()})
    
    #No of 0's and 1's predicted
    df = pd.DataFrame({"Preds": model.predict(X_test)})
    try:
        sub_scores.update({"No of predicted evolution posts": df.groupby("Preds")["Preds"].value_counts()[0].values[0]})
    except:
        sub_scores.update({"No of predicted evolution posts":0})
    
    try:
        sub_scores.update({"No of predicted creations posts": df.groupby("Preds")["Preds"].value_counts()[1].values[0]})
    except:
        sub_scores.update({"No of predicted creations posts":0})
    
    #baseline
    sub_scores.update({"Baseline accuracy%": round(y_test.value_counts(normalize=True)[0],2)})
    
    #3.Train Score
    sub_scores.update({"Train Scores": round(model.score(X_train,y_train),2)})
    
    #4.Test Score
    sub_scores.update({"Test Scores": round(model.score(X_test,y_test),2)})
    
    #5.Accuracy
    sub_scores.update({"Accuracy": accuracy})
    
    #6.Mis Calculations
    sub_scores.update({"Mis Calculations": round(mis_calcuations,2)})

    #7.Sensitivity
    sub_scores.update({"Sensitivity": round(sensitivity,2)})
    
    #8.Specificity
    sub_scores.update({"Specificity": round(specificity,2)})

    #9.Precision
    sub_scores.update({"Precision": round(precision,2)})

    #9.ROC AUC
    sub_scores.update({"ROC AUC": round(round(roc_auc_score(pred_df['true_values'], pred_df['pred_probs']),2))})
    
    10.
    if model.score(X_train,y_train) > model.score(X_test,y_test):
        sub_scores.update({"Fit Type":"Overfit"})
    else:
        sub_scores.update({"Fit Type":"Underfit"})
        
    output.append(sub_scores)
    
    return output
    
    
    

#simple model,fit,evaluate, and return a list of best estimators and 
def model_fit_score(X, y, best_scores, feature_set_no):
     
        #Step 1 : split the data into test/train
        X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle=True, test_size=0.33, stratify=y, random_state = 42)

        for param in all_params:

            pipe = Pipeline(all_params[param]["Pipeline"])
            hyper_params = all_params[param]["hyper_params"]
             
            # Perform Grid Search
            gridcv = GridSearchCV(pipe, 
                                  param_grid=hyper_params,
                                  cv = 5,
                                  scoring = "accuracy")
            #results
            results = gridcv.fit(X_train, y_train)
            best_scores[param] = get_score_data(results, X_train, X_test, y_train,y_test)
            #wait for 5 seconds until moving on
            time.sleep(5)
            print("Round {} complete for feature set {}".format(param, feature_set_no))
        
        return best_scores
            
    
#combine two subreddits
def combine(subreddit1, subreddit2):
    all_data_df = pd.concat((subreddit1, subreddit2), axis=0)
    sub1 = subreddit1["subreddit"].unique()[0]
    sub2 = subreddit2["subreddit"].unique()[0]
    
    all_data_df["y"] = all_data_df["subreddit"].map({sub1: 0, sub2: 1})
    
    return all_data_df

         
def lemmatize(s):
    list_words = s.split(",")
    lemmatizer = WordNetLemmatizer()
    return ','.join([lemmatizer.lemmatize(word) for word in list_words])

def stemmer(s):
    list_words = s.split(",")
    stemmer = PorterStemmer()
    return ','.join([stemmer.stem(word) for word in list_words])

feature_sets={"1": ["title"],
              "2": ["title", "selftext"],
              "3": ["title", "selftext", "comment"],
             }


#EDA for each run
def get_best_scores_params(run):
    evolution_sub_clean_df = pd.read_csv("./datasets/evolution_sub_clean_" + run+ ".csv")
    creation_sub_clean_df = pd.read_csv("./datasets/creation_sub_clean_" + run + ".csv")
    all_data_df = combine(evolution_sub_clean_df, creation_sub_clean_df)    

    
    best_scores = {x:{y:[] for y in all_params} for x in feature_sets}

    for features in feature_sets:
        get_full_row(all_data_df, feature_sets[features])
        model_fit_score(all_data_df["full_row"], all_data_df["y"], best_scores[features], features)

            
    return best_scores

In [36]:
best_scores = get_best_scores_params("1")

Round 1 complete for feature set 1
Round 2 complete for feature set 1
Round 3 complete for feature set 1
Round 4 complete for feature set 1
Round 5 complete for feature set 1




Round 6 complete for feature set 1
Round 7 complete for feature set 1
Round 8 complete for feature set 1
Round 1 complete for feature set 2
Round 2 complete for feature set 2
Round 3 complete for feature set 2
Round 4 complete for feature set 2
Round 5 complete for feature set 2
Round 6 complete for feature set 2
Round 7 complete for feature set 2
Round 8 complete for feature set 2
Round 1 complete for feature set 3
Round 2 complete for feature set 3
Round 3 complete for feature set 3
Round 4 complete for feature set 3
Round 5 complete for feature set 3
Round 6 complete for feature set 3
Round 7 complete for feature set 3
Round 8 complete for feature set 3


In [37]:
metrics_dict = {x:{y:None for y in all_params} for x in feature_sets}
miss_calc_dict = {x:{y:None for y in all_params} for x in feature_sets}

In [38]:
for features in feature_sets:
    for params in all_params:
        metrics_dict[features][params] = best_scores[features][params][0]
for features in feature_sets:
    for params in all_params:
        miss_calc_dict[features][params] = best_scores[features][params][2]['Mis Calculations']

In [39]:
pd.DataFrame(metrics_dict).style.apply(highlight_max)

Unnamed: 0,1,2,3
1,0.65,0.72,0.74
2,0.64,0.68,0.67
3,0.63,0.72,0.73
4,0.64,0.73,0.73
5,0.6,0.68,0.72
6,0.6,0.72,0.69
7,0.66,0.69,0.75
8,0.63,0.74,0.72


In [40]:
##best score from the above board, looking at the Scores in details
best_scores["3"][7]

[0.75,
 {'classifier__bootstrap': False,
  'vectorizer__max_df': 1.0,
  'vectorizer__max_features': 2000,
  'vectorizer__min_df': 1,
  'vectorizer__ngram_range': (1, 3),
  'vectorizer__stop_words': 'english'},
 {'No of evolution posts': 33,
  'No of Creation posts': 33,
  'No of predicted evolution posts': 41,
  'No of predicted creations posts': 25,
  'Baseline accuracy%': 0.5,
  'Train Scores': 0.99,
  'Test Scores': 0.76,
  'Accuracy': 0.76,
  'Mis Calculations': 0.24,
  'Sensitivity': 0.64,
  'Specificity': 0.88,
  'Precision': 0.84,
  'ROC AUC': 1.0,
  'Fit Type': 'Overfit'}]

In [41]:
#Highlighting the minimum miss calculations or the minimum difference between test and train scores
pd.DataFrame(miss_calc_dict).style.apply(highlight_min)

Unnamed: 0,1,2,3
1,0.48,0.26,0.32
2,0.5,0.27,0.24
3,0.45,0.26,0.24
4,0.42,0.24,0.27
5,0.48,0.21,0.24
6,0.5,0.3,0.32
7,0.39,0.21,0.24
8,0.39,0.23,0.27


# Conclusion : From the above we can see that, Logistic Regression with tokenizer worked the best