## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt

from bs4 import BeautifulSoup

import os
import re

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction import stop_words
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

import string

from sklearn.model_selection import train_test_split

from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier



In [2]:
# Names of the CSV exports to load. Matching on the '.csv' suffix (instead of
# the substring 'csv' anywhere in the name) skips stray entries such as
# 'csv_notes.txt'. os.listdir returns entries in arbitrary order, so the list
# is sorted to keep the load order -- and therefore the row order that feeds
# the seeded train/test split -- the same on every machine and run.
files = sorted(f for f in os.listdir('./press_releases/') if f.endswith('.csv'))

In [3]:
# Read every press-release CSV into its own frame first, then stack them in a
# single concat. Growing `df` with pd.concat inside the loop re-copied all
# previously loaded rows on each iteration and started from an empty frame.
frames = [pd.read_csv(f'./press_releases/{file}') for file in files]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# CSV files were found (the same result the incremental loop produced).
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df

Unnamed: 0,full_link,time,title,body,html,full_text,year,label
0,https://www.apple.com/newsroom/2021/03/apple-w...,2021-03-01,Apple Women’s Health Study releases preliminar...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGlobal N...,"<!DOCTYPE html>\n<html class=""no-js"" dir=""ltr""...",Apple Women’s Health Study releases preliminar...,2021.0,after
1,https://www.apple.com/newsroom/2021/03/apple-t...,2021-03-01,Apple TV+ announces programming partnership wi...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGlobal N...,"<!DOCTYPE html>\n<html class=""no-js"" dir=""ltr""...",Apple TV+ announces programming partnership wi...,2021.0,after
2,https://www.apple.com/newsroom/2021/03/apple-h...,2021-03-01,Apple Hearing Study shares new insights on hea...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGlobal N...,"<!DOCTYPE html>\n<html class=""no-js"" dir=""ltr""...",Apple Hearing Study shares new insights on hea...,2021.0,after
3,https://www.apple.com/newsroom/2021/03/new-zea...,2021-03-01,New Zealand students prototype their own Samoa...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGlobal N...,"<!DOCTYPE html>\n<html class=""no-js"" dir=""ltr""...",New Zealand students prototype their own Samoa...,2021.0,after
4,https://www.apple.com/newsroom/2021/02/apple-c...,2021-02-01,Apple celebrates Women’s History Month and Int...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGlobal N...,"<!DOCTYPE html>\n<html class=""no-js"" dir=""ltr""...",Apple celebrates Women’s History Month and Int...,2021.0,after
...,...,...,...,...,...,...,...,...
1564,https://press.aboutamazon.com/news-releases/ne...,2021-01-19 00:00:00,Amazon.com to Webcast Fourth Quarter 2020 Fina...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",Amazon.com to Webcast Fourth Quarter 2020 Fina...,2021.0,after
1565,https://press.aboutamazon.com/news-releases/ne...,2021-01-19 00:00:00,Amazon Teams Up with Pharrell Williams’ YELLOW...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",Amazon Teams Up with Pharrell Williams’ YELLOW...,2021.0,after
1566,https://press.aboutamazon.com/news-releases/ne...,2021-01-11 00:00:00,"Amazon Expands Investment in Metro Detroit, Cr...",\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...","Amazon Expands Investment in Metro Detroit, Cr...",2021.0,after
1567,https://press.aboutamazon.com/news-releases/ne...,2021-01-06 00:00:00,Amazon Launches $2 Billion Housing Equity Fund...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",Amazon Launches $2 Billion Housing Equity Fund...,2021.0,after


In [4]:
# Share of each class in the target column; the majority-class share is the
# baseline accuracy (.618228) that any model has to beat
df['label'].value_counts(normalize=True)

before    0.618228
after     0.381772
Name: label, dtype: float64

## Vectorizing

In [5]:
# Model input is the full release text; the target is its before/after label
X, y = df['full_text'], df['label']

# Hold out 30% of the rows for testing, stratified on the label so both
# splits keep the same class proportions; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=47
)

In [14]:
# https://python.plainenglish.io/text-classification-using-python-spacy-7a414abcc83a


# Load the English spaCy pipeline (tokenizer, tagger, parser, NER and word vectors)
nlp = spacy.load('en_core_web_sm')

# ASCII punctuation characters, kept as a single string
punctuations = string.punctuation

# spaCy's English stop-word collection.
# NOTE: this rebinds the name `stop_words` that the imports cell also brings
# in from sklearn.feature_extraction.
stop_words = STOP_WORDS


# Tokenizer callable handed to the vectorizer
def spacy_tokenizer(text):
    """Turn raw text into a list of normalized tokens.

    The text is run through the module-level ``nlp`` pipeline. Each token is
    replaced by its lower-cased, whitespace-stripped lemma, or by the token's
    own lower-cased form when the lemma is the ``-PRON-`` placeholder.
    Entries found in ``stop_words`` or ``punctuations`` are then dropped.
    """
    kept = []
    for token in nlp(text):
        if token.lemma_ == "-PRON-":
            normalized = token.lower_
        else:
            normalized = token.lemma_.lower().strip()

        # `punctuations` is a str, so this is a substring test: besides single
        # punctuation marks it also discards the empty string that
        # whitespace-only tokens collapse to after strip().
        if normalized in stop_words or normalized in punctuations:
            continue
        kept.append(normalized)

    return kept


# Text-cleaning transformer placed at the front of each pipeline

class predictors(TransformerMixin):
    """Pipeline step that normalizes raw text ahead of vectorization.

    The step is stateless: ``fit`` learns nothing, and ``transform`` trims
    surrounding whitespace from each document and lower-cases it.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding each text in ``X`` stripped and lower-cased."""
        return [document.strip().lower() for document in X]

    def get_params(self, deep=True):
        """Report an empty parameter dict."""
        return {}

# TF-IDF features built from spacy_tokenizer's tokens, limited to 2000 terms
tfidf_vector = TfidfVectorizer(
    max_features=2000,
    tokenizer=spacy_tokenizer,
)

## Logistic Regression

In [15]:
# Logistic regression with an L2 penalty on the TF-IDF features

classifier = LogisticRegression(penalty='l2')

# Stages: clean text -> TF-IDF vectorize -> classify
pipe = Pipeline(
    steps=[
        ('cleaner', predictors()),
        ('vectorizer', tfidf_vector),
        ('classifier', classifier),
    ]
)

# Fit on the training split, then print the score on each split
pipe.fit(X_train, y_train)
print('Train:', pipe.score(X_train, y_train))
print('Test:', pipe.score(X_test, y_test))

# baseline is .618228

# Train: 0.8743169398907104
# Test: 0.7834394904458599

Train: 0.8743169398907104
Test: 0.7834394904458599


In [16]:
# Logistic regression with regularization switched off (penalty='none')
# NOTE(review): recent scikit-learn releases reject the string 'none' and
# expect penalty=None instead -- confirm against the installed version
# before upgrading.

classifier = LogisticRegression(penalty='none')

# Stages: clean text -> TF-IDF vectorize -> classify
pipe = Pipeline(
    steps=[
        ('cleaner', predictors()),
        ('vectorizer', tfidf_vector),
        ('classifier', classifier),
    ]
)

# Fit on the training split, then print the score on each split
pipe.fit(X_train, y_train)
print('Train:', pipe.score(X_train, y_train))
print('Test:', pipe.score(X_test, y_test))

# baseline is .618228

# Train: 1.0
# Test: 0.8535031847133758

Train: 1.0
Test: 0.8535031847133758


## KNN

In [17]:
# k-nearest neighbours with k = 8, using all available cores

knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=8)

# Stages: clean text -> TF-IDF vectorize -> classify
pipe = Pipeline(
    steps=[
        ('cleaner', predictors()),
        ('vectorizer', tfidf_vector),
        ('knn', knn),
    ]
)

# Fit on the training split, then print the score on each split
pipe.fit(X_train, y_train)
print('Train:', pipe.score(X_train, y_train))
print('Test:', pipe.score(X_test, y_test))

# baseline is .618228

# Train: 0.7996357012750456
# Test: 0.7643312101910829

Train: 0.7996357012750456
Test: 0.7643312101910829


In [18]:
# k-nearest neighbours with k = 5, using all available cores

knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=5)

# Stages: clean text -> TF-IDF vectorize -> classify
pipe = Pipeline(
    steps=[
        ('cleaner', predictors()),
        ('vectorizer', tfidf_vector),
        ('knn', knn),
    ]
)

# Fit on the training split, then print the score on each split
pipe.fit(X_train, y_train)
print('Train:', pipe.score(X_train, y_train))
print('Test:', pipe.score(X_test, y_test))

# baseline is .618228

# Train: 0.8187613843351548
# Test: 0.7473460721868365

Train: 0.8187613843351548
Test: 0.7473460721868365


In [19]:
# k-nearest neighbours with k = 3, using all available cores

knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=3)

# Stages: clean text -> TF-IDF vectorize -> classify
pipe = Pipeline(
    steps=[
        ('cleaner', predictors()),
        ('vectorizer', tfidf_vector),
        ('knn', knn),
    ]
)

# Fit on the training split, then print the score on each split
pipe.fit(X_train, y_train)
print('Train:', pipe.score(X_train, y_train))
print('Test:', pipe.score(X_test, y_test))

# baseline is .618228

# Train: 0.8588342440801457
# Test: 0.7303609341825902

Train: 0.8588342440801457
Test: 0.7303609341825902
