## Imports

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, recall_score, precision_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [2]:
# Location of the dataset CSV, given relative to the notebook's working
# directory; the next cell reads it into a DataFrame
df_path = "../data/final_dataset.csv"

In [3]:
# Load the dataset, then discard the "Unnamed: 0" column
df = pd.read_csv(df_path)
df = df.drop(columns=["Unnamed: 0"])

# Print a quick overview: first rows, column names, row count, column count
print(f"Sample Data :{df.head()}")
print(f"\n--------------------------\n\n Columns : {list(df.columns)}")
print(f"\n--------------------------\n\n Size of the dataset : {len(df)}")
print(f"\n--------------------------\n\n Total number of columns : {len(df.columns)}")

Sample Data :            author                                        description  \
0          TuaAnon  yes, it's an lte watch with data turned on, no...   
1  DemandScary1934  how accurate is the active/total calorie track...   
2         Vinumite  fell very hard blackout drunk and lost watch. ...   
3          Damarou  pls tell me i‘m not the only one who has such ...   
4       ThorNike13  deleted apps in watch app on iphone with weird...   

    subreddit  des_word_count  
0  AppleWatch              48  
1  AppleWatch              10  
2  AppleWatch              59  
3  AppleWatch              15  
4  AppleWatch              15  

--------------------------

 Columns : ['author', 'description', 'subreddit', 'des_word_count']

--------------------------

 Size of the dataset : 5270

--------------------------

 Total number of columns : 4


In [4]:
# Drop the columns that are not used for modelling.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it after the columns are already gone is a
# no-op rather than a KeyError.
df = df.drop(columns=["author", "des_word_count"], errors="ignore")

In [5]:
# Create label column (target column): GalaxyWatch -> 1, AppleWatch -> 0.
# Series.map turns every value missing from the mapping into NaN, so running
# this cell a second time would map the already-encoded 0/1 labels to NaN.
# Only encode while the column is still non-numeric so the cell can be re-run.
if not pd.api.types.is_numeric_dtype(df["subreddit"]):
    df["subreddit"] = df["subreddit"].map({"GalaxyWatch": 1, "AppleWatch": 0})

## Functions

In [6]:
# cite: Got help from Katie Sylvia
def stem_words(text):
    """Return `text` with every token replaced by its Porter stem.

    The text is split with nltk.word_tokenize, each token is passed through
    PorterStemmer.stem, and the stems are joined back with single spaces.
    """
    porter = PorterStemmer()
    return " ".join(porter.stem(token) for token in nltk.word_tokenize(text))

## Baseline Score

In [8]:
# Features are the post descriptions; the target is the subreddit column
X, y = df["description"], df["subreddit"]

# Stratifying on y keeps the class ratio the same in the train and test sets;
# the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)
print(f"X shape ---------- {X.shape}")
print(f"y shape ---------- {y.shape}")

X shape ---------- (5270,)
y shape ---------- (5270,)


### Baseline accuracy

In [9]:
# Share of each class in the test split. The larger share is the baseline
# accuracy: the score of a model that always predicts the majority class.
y_test.value_counts(normalize=True)

0    0.534143
1    0.465857
Name: subreddit, dtype: float64

## Review the Models' Metrics

In [19]:
# Instantiate the pipelines (without Bagged Decision Tree and AdaBoost because of their poor
# performance compared to the rest of their modelling groups)

# Seed shared by the estimators that are given a random_state below, so that
# repeated runs of the notebook fit the same models (same value as the seed
# used for the train/test split)
RANDOM_STATE = 42

# Logistic Regression pipeline
pipe_lr = Pipeline([
    ("cvec", CountVectorizer()), # The model performed better with CountVectorizer
    ("lr", LogisticRegression())
])

# Multinomial Naïve Bayes pipeline
pipe_nb = Pipeline([
    ("cvec", CountVectorizer()), # The model performed better with CountVectorizer
    ("nb", MultinomialNB())
])

# Random Forest pipeline
pipe_rf = Pipeline([
    ("tvec", TfidfVectorizer()), # The model performed better with TfidfVectorizer
    ("rf", RandomForestClassifier(random_state=RANDOM_STATE))
])

# Extra Trees pipeline
pipe_et = Pipeline([
    ("tvec", TfidfVectorizer()), # The model performed better with TfidfVectorizer
    ("et", ExtraTreesClassifier(random_state=RANDOM_STATE))
])

# Gradient Boost pipeline
pipe_gb = Pipeline([
    ("cvec", CountVectorizer()), # The model performed better with CountVectorizer
    ("gb", GradientBoostingClassifier(random_state=RANDOM_STATE))
])

# XGBoost pipeline
pipe_xgb = Pipeline([
    ("cvec", CountVectorizer()), # The model performed better with CountVectorizer
    ("xgb", xgb.XGBClassifier(random_state=RANDOM_STATE))
])

# SVM pipeline
pipe_svc = Pipeline([
    ("tvec", TfidfVectorizer()), # The model performed better with TfidfVectorizer
    ("svc", SVC())
])

# Make a list of pipelines (same order as the definitions above)
pipelines = [pipe_lr, pipe_nb, pipe_rf, pipe_et, pipe_gb, pipe_xgb, pipe_svc]