In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
dead = pd.read_csv("./dead.csv")
phish = pd.read_csv("./phish.csv")

In [4]:
dead.drop(columns = [col for col in dead.columns if col not in ["title", "subreddit"]],
          inplace=True)

phish.drop(columns = [col for col in phish.columns if col not in ["title", "subreddit"]],
          inplace=True)

In [14]:
jam = dead.merge(phish, how="outer")
jam["subreddit"] = jam["subreddit"].map({"gratefuldead": 0,
                                         "phish": 1})
jam.rename(columns = {"title": "text"}, inplace=True);

In [27]:
X = jam["text"]
y = jam["subreddit"]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state = 42,
                                                    stratify = y)

cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_cv, y_train)

print(f"number of features: {len(cv.get_feature_names())}")
print("")
print(f"""training accuracy score: {lr.score(X_train_cv, y_train)}
 testing accuracy score: {lr.score(X_test_cv, y_test)}""")

number of features: 7227

training accuracy score: 0.9497251488776912
 testing accuracy score: 0.8210237031947785


In [16]:
len(jam[jam["text"].str.contains("\n")])

0

In [17]:
len(jam[jam["text"].str.contains(r"\[deleted\]")])

0

In [18]:
len(jam[jam["text"].str.contains(r"\[removed\]")])

0

In [19]:
def drop_url(text):
    text_list = text.split()
    url_tags = ["http", ".com", "www.", ".org", ".net", "&amp", "width=", "size=", "width=",
                "height=", "style=", "scrolling=", "allowFullScreen=", "frameborder=", 
                "allowTransparency=", "iframe", "&gt", "&lt"]
    
    filtered_list = [word for word in text_list if any(tag in word for tag in url_tags) == False]
    
    return " ".join(filtered_list)

In [20]:
jam["text"] = jam["text"].map(lambda x: drop_url(x))

In [22]:
len(jam[jam["text"] == ""])

15

In [23]:
jam = jam.drop(jam[jam["text"] == ""].index)
jam.reset_index(inplace=True, drop=True)

In [24]:
X = jam["text"]
y = jam["subreddit"]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state = 42,
                                                    stratify = y)

cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_cv, y_train)

print(f"number of features: {len(cv.get_feature_names())}")
print("")
print(f"""training accuracy score: {lr.score(X_train_cv, y_train)}
 testing accuracy score: {lr.score(X_test_cv, y_test)}""")

number of features: 7227

training accuracy score: 0.9497251488776912
 testing accuracy score: 0.8210237031947785


In [28]:
from sklearn import svm

In [36]:
X = jam["text"]
y = jam["subreddit"]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state = 42,
                                                    stratify = y)

pipe_tv_sv_2 = Pipeline([
    ("tvec", TfidfVectorizer()),
    ("svc", svm.SVC())
])

pipe_params = {
    "tvec__max_features"    : [None],
    "tvec__ngram_range"     : [(1,1), (1,2)],
    "tvec__stop_words"      : ["english"],
    "svc__C"                : [10, 100],
    "svc__kernel"           : ["rbf"],
    "svc__gamma"            : ["scale"]
}

gs_tv_sv_2 = GridSearchCV(pipe_tv_sv_2, param_grid=pipe_params, cv=5)
gs_tv_sv_2.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tvec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tvec__max_features': [None], 'tvec__ngram_range': [(1, 1), (1, 2)], 'tvec__stop_words': ['english'], 'svc__C': [10, 100], 'svc__kernel': ['rbf'], 'svc__gamma': ['scale']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [35]:
gs_tv_sv_2.best_params_

{'svc__C': 10,
 'svc__gamma': 'scale',
 'svc__kernel': 'rbf',
 'tvec__max_features': None,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

In [33]:
gs_tv_sv_2.score(X_train, y_train)

0.9147961520842877

In [34]:
gs_tv_sv_2.score(X_test, y_test)

0.8275506698728959

In [37]:
from sklearn.naive_bayes import MultinomialNB

In [38]:
X = jam["text"]
y = jam["subreddit"]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state = 42,
                                                    stratify = y)

pipe_cv_nb = Pipeline([
    ("cvec", CountVectorizer()),
    ("nb",   MultinomialNB())
])

pipe_params = {
    "cvec__max_features" : [None, 1_000, 2_000, 5_000, 10_000],
    "cvec__ngram_range"  : [(1,1), (1,2)],
    "cvec__stop_words"   : [None, "english"],
    "nb__alpha"          : [0, 1, 0.1, 0.5, 5]
}

gs_cv_nb = GridSearchCV(pipe_cv_nb, param_grid=pipe_params, cv=5)
gs_cv_nb.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'cvec__max_features': [None, 1000, 2000, 5000, 10000], 'cvec__ngram_range': [(1, 1), (1, 2)], 'cvec__stop_words': [None, 'english'], 'nb__alpha': [0, 1, 0.1, 0.5, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
gs_cv_nb.best_params_

{'cvec__max_features': None,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'nb__alpha': 0.5}

In [40]:
gs_cv_nb.score(X_train, y_train)

0.9753779202931745

In [41]:
gs_cv_nb.score(X_test, y_test)

0.8395740295431123