## Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier

from bs4 import BeautifulSoup 

import nltk
import re
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

%matplotlib inline

## Import data

In [3]:
# import dataset from AskMen and AskWomen subreddit
df = pd.read_csv('./Datasets/askmen_top.csv')
df2 = pd.read_csv('./Datasets/askwomen_top.csv')

In [4]:
# set dataframe display to show full text 
pd.set_option('display.max_colwidth', 10000)
pd.set_option('display.max_columns',10000)

In [5]:
# return only columns required and assign new variables
askwomen = df2[['title','subreddit','selftext']]
askmen = df[['title','subreddit','selftext']]

In [6]:
# get shape of askmen dataframe
askmen.shape

(1000, 3)

In [7]:
# get shape of askwomen dataframe
askwomen.shape

(1000, 3)

## Combine data

In [8]:
# combine into one dataframe
df = pd.concat([askwomen, askmen])

In [9]:
# view dataframe
df.head(3)

Unnamed: 0,title,subreddit,selftext
0,"Reminder: Trans women are women. If you see transphobic commentary on this subreddit, please report",AskWomen,"Recently, we've seen an uptick in transphobic commentary. We wanted to take this time to reiterate our commitment to trans women feeling welcome here. It's askwomen policy that trans women are women, full stop, no qualifiers. So if you see transphobic commentary, please report it. And we will continue to not allow bigotry in this subreddit."
1,"A new dating app is launched. Instead of a photo of a person, it shows you a photo of their bedroom, car, kitchen, shoes, how they have their tea/coffee, things like that... what photo would tell you the most about someone, and would you be most interested to see to choose a potential date?",AskWomen,
2,"When Kamala Harris said ‘I am speaking’ while she was being interrupted over and over, how did that resonate with you?",AskWomen,"Sorry guys - this post has gotten traction because it resonated with a lot of people but the mods have locked it indefinitely. \n\nI posted this question to understand what a moment felt for many women after I saw my own sister wince. It’s small question but the response has been powerful. I feel a lot of people can be heard and a lot of people like myself can learn. \n\nHopefully if they open this sooner rather than later, we can hear more experiences and comments geared towards the question in hand. I am not sure exactly why this question in particular has been locked for this long.\n\nEdit 2: it’s been a month and it’s looking like this post was locked because of its content as opposed to clearing out any comments as the mods have suggested. Wonder if they had an issue with the question or the Kamala Harris?"


In [10]:
# replace nans with blank 
df.fillna("",inplace=True)
# combine all text into one column
df['combined'] = df['title'] +" "+ df['selftext']
# remove columns that are no longer in use
df.drop(columns=['title', 'selftext'], inplace=True)

In [11]:
# check for null values
df.isnull().sum()

subreddit    0
combined     0
dtype: int64

In [12]:
# check data for ratio between the 2 subreddits
df['subreddit'].value_counts(normalize=True)

AskMen      0.5
AskWomen    0.5
Name: subreddit, dtype: float64

## Cleaning 

In [13]:
# Function to convert a raw review to a string of words
# The input is a single string (a raw post), and 
# the output is a single string (a preprocessed post)
  
def review_to_words(raw_review):
    
    # 1. Remove HTML.
    review_text = BeautifulSoup(raw_review).get_text()
    
    # 2. Remove non-letters and https
    letters_only = re.sub("[^a-zA-Z]|https", " ", review_text)
    
    # 4. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
    
    # 5. In Python, searching a set is much faster than searching a list, so convert the stopwords to a set.
    stops = set(stopwords.words('english') 
    others = ['women','men','ladies','guys','men','reddit'])
    
    # 6. Remove stopwords.
    meaningful_words = [w for w in words if w not in stops]
    
    # 7. Join the words back into one string separated by space, and return the result.
    return(" ".join(meaningful_words))

In [419]:
#sanity check: check stopwords to see what has been removed
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [14]:
# Initialize an empty list to hold the clean reviews.
clean_train_data = []

# For every review in our training set...
for train_data in df['combined']:
    
    # Convert review to words, then append to clean_train_reviews.
    clean_train_data.append(review_to_words(train_data))

In [15]:
clean_train_data[0]

'reminder trans see transphobic commentary subreddit please report recently seen uptick transphobic commentary wanted take time reiterate commitment trans feeling welcome askwomen policy trans full stop qualifiers see transphobic commentary please report continue allow bigotry subreddit'

## Stemming

In [16]:
# Instantiate object of class PorterStemmer.
#stem_word = nltk.stem.SnowballStemmer('english')

#for index, review in enumerate(clean_train_data):
    #tokenize the sentence and find the POS tag for each token
#    tokenization = nltk.word_tokenize(review)
#    stemreview = ""
#    for w in tokenization:
#            stemreview = stemreview + " " + stem_word.stem(w)
#    clean_train_data[index]=stemreview

In [17]:
#clean_train_data[0]

In [18]:
#stem_word.stem('fairly')

## Lemmatization

In [19]:
# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):    # if pos tag starts with J, word is an adjective 
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):  # if pos tag starts with V, word is a verb
        return wordnet.VERB
    elif nltk_tag.startswith('N'):  # if pos tag starts with N, word is a noun
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):  # if pos tag starts with R, word is an adverb
        return wordnet.ADV
    else:          
        return None

In [20]:
def lemmatize_data(df):
    # instantiate WordNetLetmmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    for index, post in enumerate(clean_train_data):
        # tokenize the sentence and find the POS tag for each token
        tokenization = nltk.pos_tag(nltk.word_tokenize(post))
        # replace tuple(token, pos_tag) pos tag with wordnet tag
        wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), tokenization)
        # create empty string
        lemmapost = ""
        for word, tag in wordnet_tagged:
            if tag is None:
                #if there is no available tag, append the token as is
                lemmapost = lemmapost + " " + word
            else:        
                #else use the tag to lemmatize the token
                lemmapost = lemmapost + " " + wordnet_lemmatizer.lemmatize(word,tag)
        df[index]=lemmapost

In [21]:
lemmatize_data(clean_train_data)

In [384]:
cv = CountVectorizer()

In [386]:
cv_fit = cv.fit_transform(clean_train_data)

In [407]:
most_frequent_words = pd.DataFrame(cv_fit.toarray().sum(axis=0).tolist(), cv.get_feature_names())

In [412]:
most_frequent_words.shape

(6340, 1)

In [410]:
most_frequent_words.sort_values(by=0, ascending=False).head(20)

Unnamed: 0,0
like,801
get,714
feel,680
make,526
go,503
know,469
want,444
time,439
thing,410
say,402


In [415]:
most_frequent_words[most_frequent_words[0]<4]

Unnamed: 0,0
aaaaand,1
aaaam,1
ab,1
abdomen,1
abnormal,2
...,...
zip,1
zodiac,1
zoneeeee,1
zoom,1


## Train Test Split 

In [22]:
X = clean_train_data
y = df['subreddit']

In [23]:
# Create train_test_split.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.25,
                                                    stratify = y,
                                                    shuffle=True,
                                                    random_state = 42)

In [24]:
len(X_train)

1500

## CVEC LR

In [423]:
lrcvecpipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(random_state=42, max_iter =1000))
])

In [424]:
lrcvecpipe.get_params()

{'memory': None,
 'steps': [('cvec', CountVectorizer()),
  ('lr', LogisticRegression(max_iter=1000, random_state=42))],
 'verbose': False,
 'cvec': CountVectorizer(),
 'lr': LogisticRegression(max_iter=1000, random_state=42),
 'cvec__analyzer': 'word',
 'cvec__binary': False,
 'cvec__decode_error': 'strict',
 'cvec__dtype': numpy.int64,
 'cvec__encoding': 'utf-8',
 'cvec__input': 'content',
 'cvec__lowercase': True,
 'cvec__max_df': 1.0,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': None,
 'cvec__strip_accents': None,
 'cvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvec__tokenizer': None,
 'cvec__vocabulary': None,
 'lr__C': 1.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 1000,
 'lr__multi_class': 'auto',
 'lr__n_jobs': None,
 'lr__penalty': 'l2',
 'lr__random_state': 42,
 'lr__solver': 'lbfgs',
 'lr__tol'

In [425]:
lrcvecpipe_params={'cvec__max_df': [0.3,0.5],
                 'cvec__max_features': [2000, 3000, 4000],
                 'cvec__min_df': [2, 3, 4],
                 'cvec__ngram_range': [(1, 1), (1, 2)],
                 'lr__C': [1.0, 2.0, 3.0],
                 'lr__penalty': ['l1', 'l2'],
                 'lr__solver': ['liblinear']}

In [426]:
lrcvecgs = GridSearchCV(
     lrcvecpipe, # what object are we optimizing?
     param_grid = lrcvecpipe_params,
     cv=5) # what parameters values are we searching) # 5-fold cross-validation.

In [427]:
lrcvecgs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('lr',
                                        LogisticRegression(max_iter=1000,
                                                           random_state=42))]),
             param_grid={'cvec__max_df': [0.3, 0.5],
                         'cvec__max_features': [2000, 3000, 4000],
                         'cvec__min_df': [2, 3, 4],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'lr__C': [1.0, 2.0, 3.0], 'lr__penalty': ['l1', 'l2'],
                         'lr__solver': ['liblinear']})

In [428]:
lrcvecgs.best_score_

0.7426666666666666

In [429]:
lrcvecgs.best_params_

{'cvec__max_df': 0.3,
 'cvec__max_features': 3000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'lr__C': 1.0,
 'lr__penalty': 'l2',
 'lr__solver': 'liblinear'}

In [430]:
lrcvecgs_bestmodel = cvecgs.best_estimator_

In [431]:
lrcvecgs_bestmodel.score(X_train, y_train)

0.9606666666666667

In [432]:
lrcvecgs_bestmodel.score(X_test, y_test)

0.724

In [433]:
#get coefficients of X variables from logistic regression 
lrcvecgs_bestmodel['lr'].coef_

array([[-0.44703243, -0.25006734,  0.18507402, ..., -0.23263004,
         0.07860233, -0.11288913]])

In [434]:
lrcveccoef = pd.DataFrame(lrcvecgs_bestmodel['lr'].coef_, columns = lrcvecgs_bestmodel['cvec'].get_feature_names()).T

In [458]:
# coefficients for ask women
lrcveccoef.sort_values(ascending=False, by=0).head(20)

Unnamed: 0,0
look like,1.010722
online,0.983756
tell difference,0.956278
mean,0.946386
career,0.943653
partner,0.891403
song,0.877732
hair,0.864421
positive,0.857754
majority,0.841509


In [39]:
# coefficients for ask men
lrcveccoef.sort_values(ascending=False, by=0).tail(20)

Unnamed: 0,0
turn,-0.910324
ask,-0.914292
type,-0.932124
male,-0.983104
female friend,-0.988178
understand,-1.013843
shit,-1.03497
seem,-1.076046
pee,-1.091647
dude,-1.103211


In [40]:
# create functions to find posts with keyword
def review_with_word(word, lst):
    all_reviews = []
    for post in lst:
        if len(re.findall(f"\s{word}\W", post))> 0:
            all_reviews.append(post)
    return pd.DataFrame(all_reviews)

In [41]:
type(clean_train_data)

list

In [42]:
review_with_word('draw', list(df['combined']))

Unnamed: 0,0
0,"Where do you draw the line between “following your dreams” and being realistic about career, relationships etc?"
1,"Guys named Drew, what did you draw? Thank you, Drews! 🚀"
2,What's your favourite type of fish? If you can draw it's that's a bonus. \n\nCrustaceans and other marine wildlife are also accepted
3,"Married men, how much sacrifice/compromise did you make with your SO to make the relationship work/survive? I'm at the point in my life where my friends are getting married/engaged, and struggling to find where the fine line is between being whipped and making compromises. My cousin talked for years about not wanting kids but now is going to have them because his fiance wants them. \n\nWhere in compromise/sacrifice do you guys draw the line on what is and isn't worth it? I know it's a case by case basis, but just curious how other people feel"
4,"Guys who are good at talking to women you find attractive, how do you do it? I’m far from perfect but one of the positive things I have going for me is I have the gift of the gab. I’m good at conversation and I like to think I’m quite witty. Except...\n\nI have no problem talking to girls I’m not attracted to, but I have some kind of mental block when it comes to a girl I am attracted to. My mind just goes blank and I can’t think of anything to say. What usually comes out is some kind of bland comment about the weather or something and it’s just painfully awkward. In a prolonged conversation it usually gets better but I really wish I was one of those guys who could meet an attractive woman and be really funny and interesting straight off. What annoys me is I know I can be funny and interesting, but for some reason I draw a blank when it comes to girls I’m into.\n\nAny advice on how to fix this?\n\nEdit:\n\nShould have mentioned - yes, I know that attractive girls are people too. I’m not trying to objectify them - I know that the trick is to “just talk to them as if they were anyone else”. My problem is in the execution of that. My mind just goes blank and I can’t think of what I would say to a normal person. It’s almost as if the normal, easy-going conversational bit of my brain just goes on holiday or something."
5,"MOD POST: On Mental Health Sup shitlords,\n\nWe as moderators understand that times have been tough on everyone. Everyone has something that they're going through that is difficult and you want to feel like you're not alone. The good news for you is that you're all useless morons to us mods so that's comforting.\n\nBut coming to reddit to ask about mental health resources and how to improve your mental state is just plain stupid. Like we get that you may think there is nowhere to turn, but there really is. Literally any place on the internet is better than Reddit. Reddit is a toxic shithole which will just draw you in further and make you more of an angry son of a bitch. There's even [articles and research done on how shitty web forums are](https://scholars.org/contribution/countering-online-toxicity-and-hate-speech) and studies have shown that [anonymity like Reddit users have just make it worse](https://journals.sagepub.com/doi/10.1177/2056305116664220). If 40% of internet users have experienced online harassment, why would you trust the internet to fix your mental health problems? \n\nWe get that you want to find a place where you feel like other people are going through the same stuff that you are, and there's honestly a lot of good in that. But there are several established factors working against you on reddit:\n\n1) Most people here are making shit up to get reactions out of you.\n\n2) If they're not making shit up, they're trying to make you as miserable as they are so they can feel better about themselves.\n\n3) Reddit is a toxic shithole in general.\n\nWhile there are definitely some people here who want to get better, the majority don't. They don't want you to feel good. They want you to feel miserable because they think that making other people miserable will make them feel better. \n\n**So from now on, Automod will automatically remove all posts about dealing with depression, suicide, and men's mental health. Automod will also link a series of sites that either provide direct links to mental health counselors, or will point you to sites with mental health counselors in your country.**\n\nInb4 ""mods don't care about mental health"": we do care about mental health deeply. Most of us either see therapists or have seen therapists in the past. Which is why we recognize the value in seeing someone who is ACTUALLY TRAINED to help you, instead of listening to some unqualified rando on the internet.\n\nLove,\n\nThe mods\n\n**tl;dr mental health posts now banned, automod will link mental health resources in the removal.**"
6,How should a man treat teenage girls when he's scared of them ? I am 23 and am scared of them when a cousin of mine was falsely accused of molesting a 13 year old.\n\nI don't want to be asshole to my cousins who come to me and I also want to draw a line .\n\nHow do you do that ?


## LR Tvec

In [43]:
lrtvecpipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('lr', LogisticRegression(random_state=42, max_iter =1000))
])

In [44]:
lrtvecpipe.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()),
  ('lr', LogisticRegression(max_iter=1000, random_state=42))],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'lr': LogisticRegression(max_iter=1000, random_state=42),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'lr__C': 1.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 1000,
 'lr__multi_class': 'auto'

In [54]:
lrtvecpipe_params={'tvec__max_df': [0.5, 0.7],
             'tvec__max_features': [2000, 3000, 4000],
             'tvec__min_df': [2, 3, 4],
             'tvec__ngram_range': [(1, 1), (1, 2)],
             'lr__C': [1.0, 2.0, 3.0],
             'lr__penalty': ['l1', 'l2'],
             'lr__solver': ['liblinear']}

In [55]:
lrtvecgs = GridSearchCV(
    lrtvecpipe, # what object are we optimizing?
    param_grid = lrtvecpipe_params,
    cv=5) # what parameters values are we searching) # 5-fold cross-validation.

In [56]:
lrtvecgs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('lr',
                                        LogisticRegression(max_iter=1000,
                                                           random_state=42))]),
             param_grid={'lr__C': [1.0, 2.0, 3.0], 'lr__penalty': ['l1', 'l2'],
                         'lr__solver': ['liblinear'],
                         'tvec__max_df': [0.5, 0.7],
                         'tvec__max_features': [2000, 3000, 4000],
                         'tvec__min_df': [2, 3, 4],
                         'tvec__ngram_range': [(1, 1), (1, 2)]})

In [62]:
lrtvecgs.best_params_

{'lr__C': 2.0,
 'lr__penalty': 'l2',
 'lr__solver': 'liblinear',
 'tvec__max_df': 0.5,
 'tvec__max_features': 3000,
 'tvec__min_df': 3,
 'tvec__ngram_range': (1, 2)}

In [63]:
lrtvecgs.best_score_

0.7453333333333333

In [308]:
lrtvecgs_bestmodel = tvecgs.best_estimator_

In [309]:
lrtvecgs_bestmodel.score(X_train, y_train)

0.94

In [310]:
lrtvecgs_bestmodel.score(X_test, y_test)

0.722

We notice that the function 'best_estimator_' returns the best score on the train dataset. However, as we continuously tune the parameter to give the optimum best parameters, the model will start to overfit to the training data and its accuracy on the test dataset would drop. 

With that in mind, the 'best_estimator_' as the name suggest is only an estimate of the best paramaters for the model but not necessarily the absolute best parameter for the model to predict unseen data. 

In [67]:
#get coefficients of X variables from logistic regression 
lrtvecgs_bestmodel['lr'].coef_

array([[-0.43450737, -0.66309424,  0.22332126, ..., -0.23461423,
         0.08088559, -0.20444509]])

In [68]:
lrtveccoef = pd.DataFrame(lrtvecgs_bestmodel['lr'].coef_, columns = lrtvecgs_bestmodel['tvec'].get_feature_names()).T

In [380]:
wrongly_classified_data

Unnamed: 0,descr,actual,predicted
2,continued date person everyone tell break,AskWomen,AskMen
3,favorite nonsexual activity,AskMen,AskWomen
4,stuck mile away boyfriends girlfriend pandemic meet together month two manage keep relationship go,AskWomen,AskMen
5,boundary non negotiable,AskMen,AskWomen
6,pratice self love see attractive without validation external source,AskMen,AskWomen
...,...,...,...
483,get back together someone cheat give second chance work,AskWomen,AskMen
485,play rpgs dm gm help safe feel welcome table especially look advice regard game player might know everyone anyone except gm table edit want thank everyone comment helpful,AskWomen,AskMen
491,stop go gym covid make gain want gain covid belly joke,AskMen,AskWomen
493,valentine day mega thread check thing gift food plan valentine day order avoid sea valentine galentine post one mega thread thread rule advice gift relax ask away also obviously ask relationship stuff monday look advice make sure descriptive succinct well information give good answer receive suggest sort new see well new stuff,AskWomen,AskMen


In [69]:
lrtveccoef.sort_values(ascending=False, by=0).head(20)

Unnamed: 0,0
etc,1.623913
partner,1.609759
hair,1.544934
career,1.489644
mean,1.294003
best,1.275973
ever,1.26824
tell difference,1.197554
look like,1.193451
online,1.181481


In [70]:
lrtveccoef.sort_values(ascending=False, by=0).tail(20)

Unnamed: 0,0
time,-1.704197
nsfw,-1.706088
turn,-1.719958
seem,-1.740754
date,-1.749106
mine,-1.784945
see,-1.828883
wife,-1.909598
go,-1.990935
talk,-2.053733


#### KNN Classifier Cvec

The KNN has scored the worst amongst a series of tests done on different models. With that in mind, I have sought to deep dive into the parameter tuning to see if we can get significant improvement to the KNN accuracy.

In [71]:
knncvecpipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

In deterimining the parameters for the KNN classifier, we will exclude the Euclidean distance because it is a oversimplified distance measuring technique that may not do well with multidimensional and sparse data. Further readings, can be looked at [here](https://stats.stackexchange.com/questions/29627/euclidean-distance-is-usually-not-good-for-sparse-data-and-more-general-case).

To test, I have used the same pipe parameters over different knn metrics: 
1. The Eucliladean metric
    best score : 0.55  
    train score : 0.71   
    test score : 0.53
    
2. The Manhattan metric
    best score : 0.54  
    train score : 0.698   
    test score : 0.472
    
3. The Minkowski metric
    best score : 0.55  
    train score : 0.997   
    test score : 0.53

From this test we can see that the best overall metric is the Minkowski metric. However, the KNN seems to score fairly bad in all different types of metric and hence, may not be the best model for our dataset.  

In [96]:
knncvecpipe_params={'knn__leaf_size': [10,30,50],
                    'knn__metric': ['manhattan'],
                    'knn__n_neighbors': [5, 21, 35],
                    'knn__weights': ['uniform', 'distance'],
                    'cvec__max_df': [0.3,0.5],
                    'cvec__max_features': [1000, 2000, 3000],
                    'cvec__min_df': [2, 3, 4],
                    'cvec__ngram_range': [(1, 1), (1, 2)]}

In [87]:
knncvecpipe_params2={'knn__leaf_size': [10,30,50],
                    'knn__metric': ['minkowski'],
                    'knn__n_neighbors': [5, 21, 35],
                    'knn__weights': ['uniform', 'distance'],
                    'cvec__max_df': [0.3,0.5],
                    'cvec__max_features': [1000, 2000, 3000],
                    'cvec__min_df': [2, 3, 4],
                    'cvec__ngram_range': [(1, 1), (1, 2)]}

In [97]:
knncvecgs = GridSearchCV(
    knncvecpipe, # what object are we optimizing?
    param_grid = knncvecpipe_params,
    cv=5) # what parameters values are we searching) # 5-fold cross-validation.

In [88]:
knncvecgs2 = GridSearchCV(
    knncvecpipe, # what object are we optimizing?
    param_grid = knncvecpipe_params2,
    cv=5) # what parameters values are we searching) # 5-fold cross-validation.

In [98]:
knncvecgs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('knn', KNeighborsClassifier())]),
             param_grid={'cvec__max_df': [0.3, 0.5],
                         'cvec__max_features': [1000, 2000, 3000],
                         'cvec__min_df': [2, 3, 4],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'knn__leaf_size': [10, 30, 50],
                         'knn__metric': ['manhattan'],
                         'knn__n_neighbors': [5, 21, 35],
                         'knn__weights': ['uniform', 'distance']})

In [89]:
knncvecgs2.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('knn', KNeighborsClassifier())]),
             param_grid={'cvec__max_df': [0.3, 0.5],
                         'cvec__max_features': [1000, 2000, 3000],
                         'cvec__min_df': [2, 3, 4],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'knn__leaf_size': [10, 30, 50],
                         'knn__metric': ['minkowski'],
                         'knn__n_neighbors': [5, 21, 35],
                         'knn__weights': ['uniform', 'distance']})

In [99]:
knncvecgs.best_score_

0.5433333333333333

In [91]:
knncvecgs2.best_score_

0.5553333333333332

In [100]:
knncvecgs.best_params_

{'cvec__max_df': 0.3,
 'cvec__max_features': 2000,
 'cvec__min_df': 4,
 'cvec__ngram_range': (1, 1),
 'knn__leaf_size': 10,
 'knn__metric': 'manhattan',
 'knn__n_neighbors': 5,
 'knn__weights': 'uniform'}

In [92]:
knncvecgs2.best_params_

{'cvec__max_df': 0.3,
 'cvec__max_features': 1000,
 'cvec__min_df': 3,
 'cvec__ngram_range': (1, 2),
 'knn__leaf_size': 10,
 'knn__metric': 'minkowski',
 'knn__n_neighbors': 21,
 'knn__weights': 'distance'}

In [101]:
knncvecgs_bestmodel = knncvecgs.best_estimator_

In [93]:
knncvecgs2.bestmodel = knncvecgs2.best_estimator_

In [102]:
knncvecgs_bestmodel.score(X_train,y_train)

0.698

In [94]:
knncvecgs2.bestmodel.score(X_train,y_train)

0.9973333333333333

In [103]:
knncvecgs_bestmodel.score(X_test,y_test)

0.472

In [95]:
knncvecgs2.bestmodel.score(X_test,y_test)

0.53

In [None]:
plt.scatter(
    df['petal length (cm)'],
    df['petal width (cm)'],
    color = df['species'].map({0: 'red', 1: 'green', 2:'blue'}))

#### KNN Classifier Tfid

As we have determine Minkowski to be the best metric above, we will use the SKlearn default metric of the KNNClassifier here. 

In [106]:
knntfidpipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
     ])

In [107]:
knntfidpipe.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()), ('knn', KNeighborsClassifier())],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'knn': KNeighborsClassifier(),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'minkowski',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 5,
 'knn__p': 2,
 'knn__weights': 'uniform'}

In [123]:
knntfidpipe_params={'knn__n_neighbors': [5, 21, 35],
             'knn__leaf_size': [10,30],
             'knn__weights': ['distance', 'uniform'],
             'tvec__max_df': [0.5, 0.7],
             'tvec__max_features': [2000],
             'tvec__min_df': [3, 4, 5],
             'tvec__ngram_range': [(1, 2)]}

In [124]:
knntfidgs = GridSearchCV(
    knntfidpipe, # what object are we optimizing?
    param_grid = knntfidpipe_params,
    cv=5) # what parameters values are we searching) # 5-fold cross-validation.

In [125]:
knntfidgs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('knn', KNeighborsClassifier())]),
             param_grid={'knn__leaf_size': [10, 30],
                         'knn__n_neighbors': [5, 21, 35],
                         'knn__weights': ['distance', 'uniform'],
                         'tvec__max_df': [0.5, 0.7],
                         'tvec__max_features': [2000],
                         'tvec__min_df': [3, 4, 5],
                         'tvec__ngram_range': [(1, 2)]})

In [126]:
knntfidgs.best_params_

{'knn__leaf_size': 10,
 'knn__n_neighbors': 21,
 'knn__weights': 'distance',
 'tvec__max_df': 0.5,
 'tvec__max_features': 2000,
 'tvec__min_df': 3,
 'tvec__ngram_range': (1, 2)}

In [303]:
knntfidgs.best_score_

0.712

In [127]:
knntfidgs_bestmodel = knntfidgs.best_estimator_

In [128]:
knntfidgs_bestmodel.score(X_train, y_train)

0.9986666666666667

In [129]:
knntfidgs_bestmodel.score(X_test, y_test)

0.65

We conclude from our tests that KNN is not a good model, even when using Tfid vectorizer.  

#### Naive Baynes Tfid

In [130]:
nbtfidpipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB())
     ])

In [132]:
nbtfidpipe.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()), ('nb', MultinomialNB())],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'nb': MultinomialNB(),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'nb__alpha': 1.0,
 'nb__class_prior': None,
 'nb__fit_prior': True}

In [202]:
nbtfidpipe_params={
             'tvec__max_df': [0.3,0.5],
             'tvec__max_features': [2000, 3000],
             'tvec__min_df': [3, 4, 5],
             'tvec__ngram_range': [(1, 1),(1, 2)],
             'nb__alpha': [1.0,2.0,3.0]}

In [273]:
nbtfidpipe_params2={
             'tvec__max_df': [0.15,0.5],
             'tvec__max_features': [1000, 2000, 3000],
             'tvec__min_df': [3, 4, 5],
             'tvec__ngram_range': [(1, 1),(1, 2)],
             'nb__alpha': [1.0,2.0,3.0]}

After tuning several models, we realize that the max_df doesn't improve the score at all after a certain threshold (in our case, this threshold is 0.3). With that in mind, we will keep max_df as 0.3 for all models and set 0.5 as an option in cases where this might not stand true. 

In [204]:
nbtfidgs = GridSearchCV(
    nbtfidpipe, # what object are we optimizing?
    param_grid = nbtfidpipe_params,
    cv=5) # what parameters values are we searching) # 5-fold cross-validation.

In [275]:
nbtfidgs2 = GridSearchCV(
    nbtfidpipe, # what object are we optimizing?
    param_grid = nbtfidpipe_params2,
    cv=5) # what parameters values are we searching) # 5-fold cross-validation

In [206]:
nbtfidgs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'nb__alpha': [1.0, 2.0, 3.0],
                         'tvec__max_df': [0.3, 0.5],
                         'tvec__max_features': [2000, 3000],
                         'tvec__min_df': [3, 4, 5],
                         'tvec__ngram_range': [(1, 1), (1, 2)]})

In [276]:
nbtfidgs2.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'nb__alpha': [1.0, 2.0, 3.0],
                         'tvec__max_df': [0.15, 0.5],
                         'tvec__max_features': [1000, 2000, 3000],
                         'tvec__min_df': [3, 4, 5],
                         'tvec__ngram_range': [(1, 1), (1, 2)]})

In [208]:
nbtfidgs.best_params_

{'nb__alpha': 2.0,
 'tvec__max_df': 0.3,
 'tvec__max_features': 2000,
 'tvec__min_df': 4,
 'tvec__ngram_range': (1, 1)}

In [277]:
nbtfidgs2.best_params_

{'nb__alpha': 1.0,
 'tvec__max_df': 0.15,
 'tvec__max_features': 1000,
 'tvec__min_df': 5,
 'tvec__ngram_range': (1, 2)}

In [215]:
nbtfidgs.best_score_

0.7166666666666667

In [278]:
nbtfidgs2.best_score_

0.7299999999999999

In [216]:
nbtfidgs_bestmodel = nbtfidgs.best_estimator_

In [217]:
nbtfidgs_bestmodel2 = nbtfidgs2.best_estimator_

In [218]:
nbtfidgs_bestmodel.score(X_train, y_train)

0.8626666666666667

In [279]:
nbtfidgs_bestmodel2.score(X_train, y_train)

0.8486666666666667

In [220]:
nbtfidgs_bestmodel.score(X_test, y_test)

0.706

In [280]:
nbtfidgs_bestmodel2.score(X_test, y_test)

0.686

we realize that whilst the function tends to prefer a lower max_feature score, as it gives the best_score_ it might not give the best parameter for the model to perform with unseen data. With that in mind, we should be carefull when reducing max_features to prevent overfitting. 

In [222]:
nbtfidgs_bestmodel_featurescoef = pd.DataFrame(nbtfidgs_bestmodel['nb'].feature_log_prob_,
                                     columns = nbtfidgs_bestmodel['tvec'].get_feature_names()).T

In [223]:
nbtfidgs_bestmodel_featurescoef['difference'] = nbtfidgs_bestmodel_featurescoef[0] - nbtfidgs_bestmodel_featurescoef[1]

In [459]:
 nbtfidgs_bestmodel_featurescoef['abs'] = abs(nbtfidgs_bestmodel_featurescoef[0]) - abs(nbtfidgs_bestmodel_featurescoef[1])

In [460]:
nbtfidgs_bestmodel_featurescoef.sort_values(by='difference').head(20)

Unnamed: 0,0,1,difference,abs
illness,-8.00073,-6.897895,-1.102836,1.102836
hair,-7.482818,-6.401711,-1.081106,1.081106
item,-8.061044,-7.069798,-0.991246,0.991246
period,-7.548769,-6.563307,-0.985462,0.985462
quit,-8.061044,-7.07942,-0.981623,0.981623
plot,-8.061044,-7.099599,-0.961445,0.961445
makeup,-8.061044,-7.139601,-0.921443,0.921443
gut,-8.061044,-7.144314,-0.91673,0.91673
partner,-6.715244,-5.80088,-0.914364,0.914364
combat,-8.013803,-7.109635,-0.904168,0.904168


In [461]:
nbtfidgs_bestmodel_featurescoef.sort_values(by='difference').tail(20)

Unnamed: 0,0,1,difference,abs
nothing,-6.805076,-7.59149,0.786414,-0.786414
play,-6.80717,-7.60187,0.794701,-0.794701
boy,-7.02105,-7.827088,0.806038,-0.806038
funny,-6.962598,-7.812975,0.850377,-0.850377
shit,-6.915852,-7.777658,0.861806,-0.861806
pretty,-6.810544,-7.708348,0.897805,-0.897805
talk,-6.007496,-6.931273,0.923777,-0.923777
two,-6.886772,-7.822063,0.935292,-0.935292
gf,-6.906866,-7.862169,0.955303,-0.955303
mine,-6.874215,-7.862169,0.987954,-0.987954


#### NB Classifier Cvec

In [225]:
nbcvecpipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [226]:
nbcvecpipe.get_params()

{'memory': None,
 'steps': [('cvec', CountVectorizer()), ('nb', MultinomialNB())],
 'verbose': False,
 'cvec': CountVectorizer(),
 'nb': MultinomialNB(),
 'cvec__analyzer': 'word',
 'cvec__binary': False,
 'cvec__decode_error': 'strict',
 'cvec__dtype': numpy.int64,
 'cvec__encoding': 'utf-8',
 'cvec__input': 'content',
 'cvec__lowercase': True,
 'cvec__max_df': 1.0,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': None,
 'cvec__strip_accents': None,
 'cvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvec__tokenizer': None,
 'cvec__vocabulary': None,
 'nb__alpha': 1.0,
 'nb__class_prior': None,
 'nb__fit_prior': True}

In [230]:
nbcvecpipe_params={ 'cvec__max_df': [0.3,0.5, 0.75],
                    'cvec__max_features': [1000, 2000, 3000],
                    'cvec__min_df': [5, 6, 7],
                    'cvec__ngram_range': [(1, 1), (1, 2)],
                    'nb__alpha': [1.0, 2.0, 3.0]}

In [231]:
nbcvecgs = GridSearchCV(
    nbcvecpipe, # what object are we optimizing?
    param_grid = nbcvecpipe_params,
    cv=5) # what parameters values are we searching) # 5-fold cross-validation.

In [232]:
nbcvecgs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'cvec__max_df': [0.3, 0.5, 0.75],
                         'cvec__max_features': [1000, 2000, 3000],
                         'cvec__min_df': [5, 6, 7],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'nb__alpha': [1.0, 2.0, 3.0]})

In [233]:
nbcvecgs.best_params_

{'cvec__max_df': 0.3,
 'cvec__max_features': 2000,
 'cvec__min_df': 6,
 'cvec__ngram_range': (1, 2),
 'nb__alpha': 2.0}

In [304]:
nbcvecgs.best_score_

0.7213333333333334

In [234]:
nbcvecgs_bestmodel = nbcvecgs.best_estimator_

In [235]:
nbcvecgs_bestmodel.score(X_train,y_train)

0.8246666666666667

In [236]:
nbcvecgs_bestmodel.score(X_test,y_test)

0.69

Notice that the model does not perform very well in all 3 datasets. 
As such we will look into what are the words that causes these misclassifications


In [297]:
nbcvecgs_bestmodel_featurescoef = pd.DataFrame(nbcvecgs_bestmodel['nb'].feature_log_prob_,
                                     columns = nbcvecgs_bestmodel['cvec'].get_feature_names()).T

In [298]:
nbcvecgs_bestmodel_featurescoef

Unnamed: 0,0,1
able,-7.023600,-6.892642
absolutely,-7.957909,-7.180324
abuse,-8.409894,-7.585789
abusive,-8.158580,-6.604960
accept,-7.220310,-7.074963
...,...,...
yesterday,-7.583216,-8.684401
yet,-7.023600,-6.892642
young,-6.800456,-6.381816
youth,-8.409894,-7.298107


In [299]:
nbcvecgs_bestmodel_featurescoef['difference'] = nbcvecgs_bestmodel_featurescoef[0] - nbcvecgs_bestmodel_featurescoef[1]

In [301]:
nbcvecgs_bestmodel_featurescoef.sort_values(by='difference').head(20)

Unnamed: 0,0,1,difference
mental illness,-9.662657,-6.738491,-2.924166
illness,-9.257192,-6.60496,-2.652233
romantic relationship,-9.662657,-7.298107,-2.364551
quit,-9.662657,-7.298107,-2.364551
member,-9.257192,-7.298107,-1.959085
hair,-8.053219,-6.119452,-1.933768
rise,-9.257192,-7.431638,-1.825554
spouse,-9.257192,-7.431638,-1.825554
trait,-9.257192,-7.431638,-1.825554
behaviour,-8.746367,-7.074963,-1.671403


In [302]:
nbcvecgs_bestmodel_featurescoef.sort_values(by='difference').tail(20)

Unnamed: 0,0,1,difference
even though,-7.311282,-8.684401,1.373119
husband,-7.311282,-8.684401,1.373119
smell,-7.264762,-8.684401,1.419639
kinda,-7.22031,-8.684401,1.464091
shit,-6.800456,-8.278936,1.47848
girl,-5.191018,-6.669498,1.47848
matter,-6.800456,-8.278936,1.47848
nothing,-6.505657,-7.991254,1.485597
pretty,-6.484603,-7.991254,1.506651
im,-7.136929,-8.684401,1.547472


**Interesting Facts in the first run**  
- All of the scores that we have right now are non-satisfactory, our target is to get an accuracy score of above 0.9 on the model.

- The best model with the best performance is logistics regression, and the worst is the KNN model.

- Overall, the TFIDF tend to perform better than the Count Vectorizer.

The Term-Frequency-Inverse-Document Frequency may perform better as it weighs down the common words occuring in almost all the documents and give more importance to the words that appear in a subset of documents. By penalising these common words, we can reduce misclassifications. 

We will try to reduce words that are causing misclassifications manualls to improve the scores. To do so we will be using linear regression coefficients as a benchmark 

In [350]:
lr_actualvspred = pd.DataFrame(columns = [])

In [352]:
lr_actualvspred

Unnamed: 0,descr
0,formally diagnose autism spectrum disorder sign symptom lead get assess diagnosed
1,moral lesson advice father male figure teach instrument journey life matter unorthodox may edit fundamental lesson learn dad unsaid rather say man word would speak something say two important thing show want achieve anything life must discipline resilient face adversity edit fundamental advice father give believe make mention never use people success metric measure success life progress life pace succumb pressure society environment
2,continued date person everyone tell break
3,favorite nonsexual activity
4,stuck mile away boyfriends girlfriend pandemic meet together month two manage keep relationship go
...,...
495,vanilla sex woman sometimes feel inadequate relation kinky porn culture
496,call hair bun language hair bun english spanish always call tomate mean tomato start call hair tomato english instead say bun cuz make laugh lol language call edit thanks everyone answer never expect many reply lol love learn language read reply much fun
497,realize grown apart old friend cope loss someone post askwomen since man go thought ask fellow
498,marry co habitating work come home housework


In [354]:
lr_actualvspred['descr'] = X_test

In [374]:
lr_actualvspred['actual'] = y_test.to_list()

In [375]:
lr_actualvspred['predicted'] = lrtvecgs_bestmodel.predict(X_test)

In [378]:
wrongly_classified_data = lr_actualvspred[lr_actualvspred['actual']!=lr_actualvspred['predicted']]

In [379]:
wrongly_classified_data

Unnamed: 0,descr,actual,predicted
2,continued date person everyone tell break,AskWomen,AskMen
3,favorite nonsexual activity,AskMen,AskWomen
4,stuck mile away boyfriends girlfriend pandemic meet together month two manage keep relationship go,AskWomen,AskMen
5,boundary non negotiable,AskMen,AskWomen
6,pratice self love see attractive without validation external source,AskMen,AskWomen
...,...,...,...
483,get back together someone cheat give second chance work,AskWomen,AskMen
485,play rpgs dm gm help safe feel welcome table especially look advice regard game player might know everyone anyone except gm table edit want thank everyone comment helpful,AskWomen,AskMen
491,stop go gym covid make gain want gain covid belly joke,AskMen,AskWomen
493,valentine day mega thread check thing gift food plan valentine day order avoid sea valentine galentine post one mega thread thread rule advice gift relax ask away also obviously ask relationship stuff monday look advice make sure descriptive succinct well information give good answer receive suggest sort new see well new stuff,AskWomen,AskMen


#### Decision Tree Cvec

In [237]:
dtcvecpipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('dt', DecisionTreeClassifier(random_state= 42))
])

In [238]:
dtcvecpipe.get_params()

{'memory': None,
 'steps': [('cvec', CountVectorizer()),
  ('dt', DecisionTreeClassifier(random_state=42))],
 'verbose': False,
 'cvec': CountVectorizer(),
 'dt': DecisionTreeClassifier(random_state=42),
 'cvec__analyzer': 'word',
 'cvec__binary': False,
 'cvec__decode_error': 'strict',
 'cvec__dtype': numpy.int64,
 'cvec__encoding': 'utf-8',
 'cvec__input': 'content',
 'cvec__lowercase': True,
 'cvec__max_df': 1.0,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': None,
 'cvec__strip_accents': None,
 'cvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvec__tokenizer': None,
 'cvec__vocabulary': None,
 'dt__ccp_alpha': 0.0,
 'dt__class_weight': None,
 'dt__criterion': 'gini',
 'dt__max_depth': None,
 'dt__max_features': None,
 'dt__max_leaf_nodes': None,
 'dt__min_impurity_decrease': 0.0,
 'dt__min_impurity_split': None,
 'dt__min_samples_leaf': 1,
 'dt__min_samples_split': 2,
 'dt__min_weight_fraction_leaf': 0.0

In [274]:
dtcvecpipe_params={ 'cvec__max_df': [0.3, 0.5],
                    'cvec__max_features': [1000, 2000, 3000],
                    'cvec__min_df': [4, 5, 6],
                    'cvec__ngram_range': [(1, 1), (1, 2)],
#                     'dt__ccp_alpha': [0, 0.5, 1.0],
                    'dt__max_depth': [None, 10, 20],
                    'dt__min_samples_split': [2, 3, 4]}

In [266]:
dtcvecgs = GridSearchCV(
    dtcvecpipe, # what object are we optimizing?
    param_grid = dtcvecpipe_params,
    cv=5) # what parameters values are we searching) # 5-fold cross-validation.

In [267]:
dtcvecgs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('dt',
                                        DecisionTreeClassifier(random_state=42))]),
             param_grid={'cvec__max_df': [0.3, 0.5],
                         'cvec__max_features': [1000, 2000, 3000],
                         'cvec__min_df': [4, 5, 6],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'dt__ccp_alpha': [0, 0.5, 1.0],
                         'dt__max_depth': [3, 5, 10],
                         'dt__min_samples_split': [2, 3, 4]})

In [268]:
dtcvecgs.best_params_

{'cvec__max_df': 0.3,
 'cvec__max_features': 1000,
 'cvec__min_df': 6,
 'cvec__ngram_range': (1, 2),
 'dt__ccp_alpha': 0,
 'dt__max_depth': 10,
 'dt__min_samples_split': 4}

In [269]:
dtcvecgs.best_score_

0.6826666666666668

In [270]:
dtcvecgs_bestmodel = dtcvecgs.best_estimator_

In [271]:
dtcvecgs_bestmodel.score(X_train,y_train)

0.772

In [272]:
dtcvecgs_bestmodel.score(X_test,y_test)

0.636

#### Decision Tree Tfid

In [225]:
dttvecpipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier())
])

In [226]:
dttvecpipe.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()), ('dt', DecisionTreeClassifier())],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'dt': DecisionTreeClassifier(),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'dt__ccp_alpha': 0.0,
 'dt__class_weight': None,
 'dt__criterion': 'gini',
 'dt__max_depth': None,
 'dt__max_features': None,
 'dt__max_leaf_nodes': None,
 'dt__min_impurity_decrease': 0.0,
 'dt__min_impurity_split': None,
 'dt__min_samples_lea

In [261]:
dttvecpipe_params={ 'tvec__max_df': [0.25, 0.3, 0.5],
                    'tvec__max_features': [2000, 3000, 4000],
                    'tvec__min_df': [4, 5, 6],
                    'tvec__ngram_range': [(1, 1), (1, 2)],
                    'dt__ccp_alpha': [0, 1],
                    'dt__max_depth': [5, 6, 7],
                    'dt__min_samples_split': [2, 3, 4]}

In [262]:
dttvecgs = GridSearchCV(
    dttvecpipe, # what object are we optimizing?
    param_grid = dttvecpipe_params,
    cv=5) # what parameters values are we searching) # 5-fold cross-validation.

In [263]:
dttvecgs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('dt', DecisionTreeClassifier())]),
             param_grid={'dt__ccp_alpha': [0, 1], 'dt__max_depth': [5, 6, 7],
                         'dt__min_samples_split': [2, 3],
                         'tvec__max_df': [0.25, 0.3, 0.5],
                         'tvec__max_features': [2000, 3000, 4000],
                         'tvec__min_df': [4, 5, 6],
                         'tvec__ngram_range': [(1, 1), (1, 2)]})

In [264]:
dttvecgs.best_params_

{'dt__ccp_alpha': 0,
 'dt__max_depth': 6,
 'dt__min_samples_split': 3,
 'tvec__max_df': 0.3,
 'tvec__max_features': 3000,
 'tvec__min_df': 5,
 'tvec__ngram_range': (1, 1)}

In [265]:
dttvecgs_bestmodel = dttvecgs.best_estimator_

In [266]:
dttvecgs_bestmodel.score(X_train,y_train)

0.7466666666666667

In [267]:
dttvecgs_bestmodel.score(X_test,y_test)

0.644

#### Decision Tree Tfid Bagging Classifier

In [280]:
bagtvecpipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('bag', BaggingClassifier(DecisionTreeClassifier()))
])

In [281]:
bagtvecpipe.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()),
  ('bag', BaggingClassifier(base_estimator=DecisionTreeClassifier()))],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'bag': BaggingClassifier(base_estimator=DecisionTreeClassifier()),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'bag__base_estimator__ccp_alpha': 0.0,
 'bag__base_estimator__class_weight': None,
 'bag__base_estimator__criterion': 'gini',
 'bag__base_estimator__max_depth': None,
 'bag

In [282]:
bagtvecpipe_params={'tvec__max_df': [0.25, 0.3, 0.5],
                    'tvec__max_features': [2000, 3000, 4000],
                    'tvec__min_df': [4, 5, 6],
                    'tvec__ngram_range': [(1, 1), (1, 2)],
                    'bag__base_estimator__ccp_alpha': [0, 1],
                    'bag__base_estimator__max_depth': [5, 6, 7],
                    'bag__base_estimator__min_samples_split': [2, 3, 4],
                    'bag__n_estimators': [30]}

In [283]:
bagtvecgs = GridSearchCV(
    bagtvecpipe, # what object are we optimizing?
    param_grid = bagtvecpipe_params,
    cv=5) # what parameters values are we searching) # 5-fold cross-validation.

In [284]:
bagtvecgs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('bag',
                                        BaggingClassifier(base_estimator=DecisionTreeClassifier()))]),
             param_grid={'bag__base_estimator__ccp_alpha': [0, 1],
                         'bag__base_estimator__max_depth': [5, 6, 7],
                         'bag__base_estimator__min_samples_split': [2, 3, 4],
                         'bag__n_estimators': [10, 20, 30],
                         'tvec__max_df': [0.25, 0.3, 0.5],
                         'tvec__max_features': [2000, 3000, 4000],
                         'tvec__min_df': [4, 5, 6],
                         'tvec__ngram_range': [(1, 1), (1, 2)]})

In [285]:
bagtvecgs.best_params_

{'bag__base_estimator__ccp_alpha': 0,
 'bag__base_estimator__max_depth': 7,
 'bag__base_estimator__min_samples_split': 4,
 'bag__n_estimators': 30,
 'tvec__max_df': 0.3,
 'tvec__max_features': 3000,
 'tvec__min_df': 5,
 'tvec__ngram_range': (1, 1)}

In [286]:
bagtvecgs_bestmodel = bagtvecgs.best_estimator_

In [287]:
bagtvecgs_bestmodel.score(X_train,y_train)

0.8006666666666666

In [288]:
bagtvecgs_bestmodel.score(X_test,y_test)

0.686