In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
df_sub = pd.read_csv('../data/df_all_subm.csv')
df_com = pd.read_csv('../data/df_all_comm.csv')
df_all = pd.read_csv('../data/df_all_text.csv')

In [3]:
X_sub = df_sub['title']
y_sub = df_sub['is_news']

# split training set and final test set
XS_training, XS_final, yS_training, yS_final = train_test_split(X_sub, y_sub, test_size = 0.1, stratify = y_sub)

XS_training.shape, yS_training.shape, XS_final.shape, yS_final.shape

((1456,), (1456,), (162,), (162,))

In [4]:
# repeat for comments df
X_com = df_com['body']
y_com = df_com['is_news']

XC_training, XC_final, yC_training, yC_final = train_test_split(X_com, y_com, test_size = 0.1, stratify = y_com)

XC_training.shape, yC_training.shape, XC_final.shape, yC_final.shape

((1586,), (1586,), (177,), (177,))

In [5]:
# split training sets into train-test for initial modeling
XS_train, XS_test, yS_train, yS_test = train_test_split(XS_training, yS_training, stratify = yS_training)

XC_train, XC_test, yC_train, yC_test = train_test_split(XC_training, yC_training, stratify = yC_training)

In [6]:
def my_lemma(item):
    lemma = WordNetLemmatizer()
    split_items = item.split(' ')
    lemmatized = []
    for i in split_items:
        lemmatized.append(lemma.lemmatize(i))
    return lemmatized

Xa_training, Xa_final, ya_training, ya_final = train_test_split(X_a, y_a, test_size = 0.1, stratify = y_a)

Xa_train, Xa_test, ya_train, ya_test = train_test_split(Xa_training, ya_training, stratify = ya_training)

---
## Random Forest CVEC/TVEC

In [8]:
rfc_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rfc', RandomForestClassifier())
])

rfc_cvec_params = {
    'cvec__stop_words' : ['english', None],
    'cvec__ngram_range': [(1,1), (2,2), (3,3), (1,2),(1,3)],
    'rfc__n_estimators': [100, 200, 300]
}

gs_rfc_cvec = GridSearchCV(rfc_cvec_pipe, param_grid = rfc_cvec_params)

gs_rfc_cvec.fit(XS_train, yS_train)

GridSearchCV(estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('rfc', RandomForestClassifier())]),
             param_grid={'cvec__ngram_range': [(1, 1), (2, 2), (3, 3), (1, 2),
                                               (1, 3)],
                         'cvec__stop_words': ['english', None],
                         'rfc__n_estimators': [100, 200, 300]})

In [9]:
gs_rfc_cvec.best_params_

{'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None,
 'rfc__n_estimators': 300}

In [10]:
gs_rfc_cvec.score(XS_train, yS_train), gs_rfc_cvec.score(XS_test, yS_test), gs_rfc_cvec.score(XS_final, yS_final)

(1.0, 0.7802197802197802, 0.8641975308641975)

---

In [11]:
rfc_tvec_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rfc', RandomForestClassifier())
])

rfc_tvec_params = {
    'tfidf__stop_words': ['english', None],
    'tfidf__ngram_range': [(1,1), (2,2), (3,3), (1,2),(1,3)],
    'rfc__n_estimators': range(100, 501, 50)
}

gs_rfc_tvec = GridSearchCV(rfc_tvec_pipe, param_grid = rfc_tvec_params)

gs_rfc_tvec.fit(XS_train, yS_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('rfc', RandomForestClassifier())]),
             param_grid={'rfc__n_estimators': range(100, 501, 50),
                         'tfidf__ngram_range': [(1, 1), (2, 2), (3, 3), (1, 2),
                                                (1, 3)],
                         'tfidf__stop_words': ['english', None]})

In [12]:
gs_rfc_tvec.best_params_

{'rfc__n_estimators': 200,
 'tfidf__ngram_range': (1, 1),
 'tfidf__stop_words': None}

In [13]:
gs_rfc_tvec.score(XS_train, yS_train), gs_rfc_tvec.score(XS_test, yS_test), gs_rfc_tvec.score(XS_final, yS_final)

(1.0, 0.7802197802197802, 0.8209876543209876)

In [14]:
gs_rfc_tvec.score(XC_train, yC_train), gs_rfc_tvec.score(XC_test, yC_test), gs_rfc_tvec.score(XC_final, yC_final)

(0.5290159798149706, 0.5214105793450882, 0.5536723163841808)

---
## Logistic CVEC/TVEC

In [15]:
log_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('logr', LogisticRegression())
])

log_cvec_param = {
    'cvec__stop_words' : ['english', None],
    'cvec__ngram_range': [(1,1), (2,2), (3,3), (1,2),(1,3)],
    'logr__max_iter': range(50, 501, 50)
}


gs_log_cvec = GridSearchCV(log_cvec_pipe, param_grid = log_cvec_param)

gs_log_cvec.fit(XS_train, yS_train)

GridSearchCV(estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('logr', LogisticRegression())]),
             param_grid={'cvec__ngram_range': [(1, 1), (2, 2), (3, 3), (1, 2),
                                               (1, 3)],
                         'cvec__stop_words': ['english', None],
                         'logr__max_iter': range(50, 501, 50)})

In [16]:
gs_log_cvec.best_params_

{'cvec__ngram_range': (1, 1), 'cvec__stop_words': None, 'logr__max_iter': 50}

In [17]:
gs_log_cvec.score(XS_train, yS_train), gs_log_cvec.score(XS_test, yS_test), gs_log_cvec.score(XS_final, yS_final)

(0.9935897435897436, 0.8159340659340659, 0.8641975308641975)

In [18]:
gs_log_cvec.score(XC_train, yC_train), gs_log_cvec.score(XC_test, yC_test), gs_log_cvec.score(XC_final, yC_final)

(0.5534062237174096, 0.5415617128463476, 0.576271186440678)

---


In [19]:
log_tvec_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('logr', LogisticRegression())
])

log_tvec_param = {
    'tvec__stop_words' : ['english', None],
    'tvec__ngram_range': [(1,1), (2,2), (3,3), (1,2),(1,3)],
    'logr__max_iter': range(50, 501, 50)
}


gs_log_tvec = GridSearchCV(log_tvec_pipe, param_grid = log_tvec_param)

gs_log_tvec.fit(XS_train, yS_train)

GridSearchCV(estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('logr', LogisticRegression())]),
             param_grid={'logr__max_iter': range(50, 501, 50),
                         'tvec__ngram_range': [(1, 1), (2, 2), (3, 3), (1, 2),
                                               (1, 3)],
                         'tvec__stop_words': ['english', None]})

In [20]:
gs_log_tvec.best_params_

{'logr__max_iter': 50, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': None}

In [21]:
gs_log_tvec.score(XS_train, yS_train), gs_log_tvec.score(XS_test, yS_test), gs_log_tvec.score(XS_final, yS_final)

(0.9010989010989011, 0.7857142857142857, 0.8271604938271605)

In [22]:
gs_log_tvec.score(XC_train, yC_train), gs_log_tvec.score(XC_test, yC_test), gs_log_tvec.score(XC_final, yC_final)

(0.5803195962994113, 0.5667506297229219, 0.6214689265536724)

--- 
## ExtraTrees CVEC/TVEC

In [23]:
ext_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('ext', ExtraTreesClassifier())
])

ext_cvec_param = {
    'cvec__stop_words' : ['english', None],
    'cvec__ngram_range': [(1,1), (2,2), (3,3), (1,2),(1,3)]
}

gs_ext_cvec = GridSearchCV(ext_cvec_pipe, param_grid = ext_cvec_param)

gs_ext_cvec.fit(XS_train, yS_train)

GridSearchCV(estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('ext', ExtraTreesClassifier())]),
             param_grid={'cvec__ngram_range': [(1, 1), (2, 2), (3, 3), (1, 2),
                                               (1, 3)],
                         'cvec__stop_words': ['english', None]})

In [24]:
gs_ext_cvec.best_params_

{'cvec__ngram_range': (1, 1), 'cvec__stop_words': None}

In [25]:
gs_ext_cvec.score(XS_train, yS_train), gs_ext_cvec.score(XS_test, yS_test), gs_ext_cvec.score(XS_final, yS_final)

(1.0, 0.7994505494505495, 0.8641975308641975)

In [26]:
ext_tvec_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('rfc', RandomForestClassifier())
])

ext_tvec_param = {
    'tvec__stop_words' : ['english', None],
    'tvec__ngram_range': [(1,1), (2,2), (3,3), (1,2),(1,3)]
}

gs_ext_tvec = GridSearchCV(ext_tvec_pipe, param_grid = ext_tvec_param)

gs_ext_tvec.fit(XS_train, yS_train)

GridSearchCV(estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('rfc', RandomForestClassifier())]),
             param_grid={'tvec__ngram_range': [(1, 1), (2, 2), (3, 3), (1, 2),
                                               (1, 3)],
                         'tvec__stop_words': ['english', None]})

In [27]:
gs_ext_tvec.best_params_

{'tvec__ngram_range': (1, 1), 'tvec__stop_words': None}

In [28]:
gs_ext_tvec.score(XS_train, yS_train), gs_ext_tvec.score(XS_test, yS_test), gs_ext_tvec.score(XS_final, yS_final)

(1.0, 0.7747252747252747, 0.8024691358024691)

---
## GBoost CVEC/TVEC

In [29]:
gb_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('gb', GradientBoostingClassifier())
])

gb_cvec_param = {
    'cvec__stop_words' : ['english', None],
    'cvec__ngram_range': [(1,1), (2,2), (3,3), (1,2),(1,3)],
    'gb__learning_rate': [0.05, 0.1, 0.15, 0.2],
    'gb__n_estimators': [200, 300]
}

gs_gb_cvec = GridSearchCV(gb_cvec_pipe, param_grid = gb_cvec_param)

gs_gb_cvec.fit(XS_train, yS_train)

GridSearchCV(estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('gb', GradientBoostingClassifier())]),
             param_grid={'cvec__ngram_range': [(1, 1), (2, 2), (3, 3), (1, 2),
                                               (1, 3)],
                         'cvec__stop_words': ['english', None],
                         'gb__learning_rate': [0.05, 0.1, 0.15, 0.2],
                         'gb__n_estimators': [200, 300]})

In [30]:
gs_gb_cvec.best_params_

{'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None,
 'gb__learning_rate': 0.2,
 'gb__n_estimators': 300}

In [31]:
gs_gb_cvec.score(XS_train, yS_train), gs_gb_cvec.score(XS_test, yS_test), gs_gb_cvec.score(XS_final, yS_final)

(0.9926739926739927, 0.7912087912087912, 0.7901234567901234)

---

In [32]:
gb_tvec_pipe = Pipeline([
    ('tvec', CountVectorizer()),
    ('gb', GradientBoostingClassifier())
])

gb_tvec_param = {
    'tvec__tokenizer': [None, my_lemma],
    'tvec__ngram_range': [(1,2),(1,3)],
    'gb__learning_rate': [0.1, 0.2],
    'gb__n_estimators': [150, 200]
}

gs_gb_tvec = GridSearchCV(gb_tvec_pipe, param_grid = gb_tvec_param)

gs_gb_tvec.fit(XS_train, yS_train)

GridSearchCV(estimator=Pipeline(steps=[('tvec', CountVectorizer()),
                                       ('gb', GradientBoostingClassifier())]),
             param_grid={'gb__learning_rate': [0.1, 0.2],
                         'gb__n_estimators': [150, 200],
                         'tvec__ngram_range': [(1, 2), (1, 3)],
                         'tvec__tokenizer': [None,
                                             <function my_lemma at 0x7fc610ad8ee0>]})

In [33]:
gs_gb_tvec.best_params_

{'gb__learning_rate': 0.2,
 'gb__n_estimators': 200,
 'tvec__ngram_range': (1, 3),
 'tvec__tokenizer': <function __main__.my_lemma(item)>}

In [34]:
gs_gb_tvec.score(XS_train, yS_train), gs_gb_tvec.score(XS_test, yS_test), gs_gb_tvec.score(XS_final, yS_final)

(0.9990842490842491, 0.7939560439560439, 0.7777777777777778)