In [59]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, BatchNormalization
from tensorflow.keras.regularizers import l2

In [2]:
fitness_df = pd.read_csv('../Data/fitness_clean.csv', index_col=0)
bodyweight_df = pd.read_csv('../Data/bodyweight_clean.csv', index_col=0)

all_posts = pd.concat([fitness_df, bodyweight_df], ignore_index=True)

In [3]:
all_posts['subreddit'].value_counts()

bodyweightfitness    7052
Fitness              6711
Name: subreddit, dtype: int64

In [4]:
# null model score
7052 / (7052+6711)

0.512388287437332

In [15]:
X = all_posts['selftext']
y = np.where(all_posts['subreddit'] =='bodyweightfitness', 1, 0)

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [60]:
# Vectorize input
cvect = CountVectorizer(stop_words='english',
                       ngram_range=(1,2),
                       max_features=1000)
X_train_vect = cvect.fit_transform(X_train).toarray()
X_test_vect = cvect.transform(X_test).toarray()

#Scale
sscaler = StandardScaler()
X_train_scaled = sscaler.fit_transform(X_train_vect)
X_test_scaled = sscaler.transform(X_test_vect)

In [69]:
model = Sequential()
model.add(Input(shape=(1000,)))
model.add(BatchNormalization())
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.1)))
model.add(Dense(1, activation='sigmoid'))

In [70]:
model.compile(
    loss='binary_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy'])

In [73]:
history = model.fit(
    X_train_scaled, 
    y_train,
    validation_split=.2,
    epochs=10
)



In [74]:
accuracy_score(y_test, np.where(model.predict(X_test_vect) > .5, 1, 0))

0.7497820401046208

## Custom Preprocessor

In [7]:
# using isalnum method to remove special characters from
# https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string

def my_preprocessor(text):
    text = text.lower()
    text = text.replace("'", '')
    text = ''.join(w if w.isalnum() or w == ' ' else ' ' for w in text)
    
    return text

In [18]:
pipe2 = make_pipeline(
    TfidfVectorizer(preprocessor=my_preprocessor,
                    max_features=400, 
                    ngram_range=(1,2),
                    stop_words='english'), 
    StandardScaler(with_mean=False),
    SVC(C=10, gamma=2)
)

In [19]:
pipe2.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=400, ngram_range=(1, 2),
                                 preprocessor=<function my_preprocessor at 0x7f8aadcc8310>,
                                 stop_words='english')),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('svc', SVC(C=10, gamma=2))])

In [20]:
pipe2.score(X_train, y_train)

0.9968998256151909

In [21]:
pipe2.score(X_test, y_test)

0.5106073815751235