## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [2]:
# Define dataframe path
df_path = "../data/final_dataset.csv"

In [3]:
# load data
df = pd.read_csv(df_path).drop(columns="Unnamed: 0")

# Get some basic information 
print(f"Sample Data :{df.head()}")
print(f"\n--------------------------\n\n Columns : {[i for i in df.columns]}")
print(f"\n--------------------------\n\n Size of the dataset : {df.shape[0]}")
print(f"\n--------------------------\n\n Total number of columns : {df.shape[1]}")

Sample Data :            author                                        description  \
0          TuaAnon  yes, it's an lte watch with data turned on, no...   
1  DemandScary1934  how accurate is the active/total calorie track...   
2         Vinumite  fell very hard blackout drunk and lost watch. ...   
3          Damarou  pls tell me i‘m not the only one who has such ...   
4       ThorNike13  deleted apps in watch app on iphone with weird...   

    subreddit  des_word_count  
0  AppleWatch              48  
1  AppleWatch              10  
2  AppleWatch              59  
3  AppleWatch              15  
4  AppleWatch              15  

--------------------------

 Columns : ['author', 'description', 'subreddit', 'des_word_count']

--------------------------

 Size of the dataset : 5270

--------------------------

 Total number of columns : 4


In [4]:
# Drop unusefull columns
df.drop(columns =["author", "des_word_count"], inplace=True)

In [5]:
# Create label column (target column)
df["subreddit"] = df["subreddit"].map({"GalaxyWatch": 1, "AppleWatch":0})

## Functions

In [6]:
# cite: Got help form Katie Sylvia
# a function for stemming
def stem_words(text):
    # Initialize the Porter stemmer
    stemmer = PorterStemmer()
    
    # Tokenize the text into individual words
    words = nltk.word_tokenize(text)
    
    # Stem each word and join them back into a string
    stemmed_words = [stemmer.stem(word) for word in words]
    stemmed_text = " ".join(stemmed_words)
    
    return stemmed_text

## Baselin Score

In [7]:
# Define X (features) and y(target)
X = df["description"]
y = df["subreddit"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
print(f"X shape ---------- {X.shape}")
print(f"y shape ---------- {y.shape}")

X shape ---------- (5270,)
y shape ---------- (5270,)


### Baseline accuracy

In [8]:
# Get the ratio of classes
y_test.value_counts(normalize=True)

0    0.534143
1    0.465857
Name: subreddit, dtype: float64

## Review The Models Metrics

In [9]:
# Instantiate the pipelines (without Bagged Decision Tree and AdaBoost because of their poor
# performance comparing to their modelling groups)
# Logistic Regression pipeline
pipe_lr = Pipeline([
    ("tvec", TfidfVectorizer()), # The model performed better with TfidfVectorizer
    ("lr", LogisticRegression(solver="saga"))
])

# Multinomial Naïve Bayes pipeline
pipe_nb = Pipeline([
    ("cvec", CountVectorizer()), # The model performed better with CountVectorizer
    ("nb", MultinomialNB())
])

# Random Forest pipeline
pipe_rf = Pipeline([
    ("tvec", TfidfVectorizer()), # The model performed better with TfidfVectorizer
    ("rf", RandomForestClassifier())
])

# Extra Trees pipeline
pipe_et = Pipeline([
    ("tvec", TfidfVectorizer()), # The model performed better with TfidfVectorizer
    ("et", ExtraTreesClassifier())
])

# Gradient Boost pipeline
pipe_gb = Pipeline([
    ("cvec", CountVectorizer()), # The model performed better with CountVectorizer
    ("gb", GradientBoostingClassifier())
])

# XGBoost pipeline
pipe_xgb = Pipeline([
    ("cvec", CountVectorizer()), # The model performed better with CountVectorizer
    ("xgb", xgb.XGBClassifier())
])

# SVM pipeline
pipe_svc = Pipeline([
    ("tvec", TfidfVectorizer()), # The model performed better with TfidfVectorizer
    ("svc", SVC())
])

# Make a list of pipelines
pipelines = [pipe_lr, pipe_nb, pipe_rf, pipe_et, pipe_gb, pipe_xgb, pipe_svc]

In [10]:
# Define parameters for each pipe

param_lr = {'lr__C': [100],                     # [0.1, 1, 10, 100] tested and 100 was chosen as the best param
            'lr__penalty': ['l1'],              # ["none", "l1", "l2"] tested and l1 was chosen as the best param
            'tvec__max_df': [0.95],             # [.9, .95] tested and 0.95 was chosen as the best param
            'tvec__max_features': [20000],      # [5_000, 10_000, 20_000, 30_000] tested and 20000 was chosen as the best param
            'tvec__min_df': [2],                # [2, 3] tested and 2 was chosen as the best param
            'tvec__ngram_range': [(1, 2)],      # [(1, 1), (1, 2)] tested and (1, 2) was chosen as the best param
            'tvec__stop_words': [None],         # [None, "english"] tested and None was chosen as the best param
            'tvec__tokenizer': [None]}          # [None, stem_words] tested and None was chosen as the best param

param_nb = {'cvec__max_df': [0.9],              # [.9, .95] tested and 0.9 was chosen as the best param
            'cvec__max_features': [10000],      # [5_000, 10_000, 20_000, 30_000] tested and 10000 was chosen as the best param
            'cvec__min_df': [2],                # [2, 3] tested and 2 was chosen as the best param
            'cvec__ngram_range': [(1, 2)],      # [(1, 1), (1, 2)] tested and (1, 2) was chosen as the best param
            'cvec__stop_words': ['english'],    # [None, "english"] tested and 'english' was chosen as the best param
            'cvec__tokenizer': [None]}          # [None, stem_words] tested and None was chosen as the best param

param_rf = {'rf__max_depth': [None],            # [None, 1, 2, 3, 4] tested and None was chosen as the best param
            'rf__n_estimators': [100],          # The only estimator
            'tvec__max_df': [0.95],             # [.9, .95] tested and 0.95 was chosen as the best param
            'tvec__max_features': [30000],      # [5_000, 10_000, 20_000, 30_000] tested and 30000 was chosen as the best param 
            'tvec__min_df': [2],                # [2, 3] tested and 2 was chosen as the best param
            'tvec__ngram_range': [(1, 1)],      # [(1, 1), (1, 2)] tested and (1, 1) was chosen as the best param
            'tvec__stop_words': ['english'],    # [None, "english"] tested and 'english' was chosen as the best param
            'tvec__tokenizer': [None]}          # [None, stem_words] tested and None was chosen as the best param

param_et = {'et__max_depth': [None],            # [None, 1, 2, 3, 4] tested and None was chosen as the best param
            'et__n_estimators': [100],          # The only estimator
            'tvec__max_df': [0.95],             # [.9, .95] tested and 0.95 was chosen as the best param
            'tvec__max_features': [30000],      # [5_000, 10_000, 20_000, 30_000] tested and 30000 was chosen as the best param 
            'tvec__min_df': [2],                # [2, 3] tested and 2 was chosen as the best param
            'tvec__ngram_range': [(1, 1)],      # [(1, 1), (1, 2)] tested and (1, 1) was chosen as the best param
            'tvec__stop_words': ['english'],    # [None, "english"] tested and 'english' was chosen as the best param
            'tvec__tokenizer': [None]}          # [None, stem_words] tested and None was chosen as the best param

param_gb = {'cvec__max_df': [0.9],              # [.9, .95] tested and 0.9 was chosen as the best param
            'cvec__max_features': [5000],       # [5_000, 10_000, 20_000, 30_000] tested and 5000 was chosen as the best param 
            'cvec__min_df': [2],                # [2, 3] tested and 2 was chosen as the best param
            'cvec__ngram_range': [(1, 2)],      # [(1, 1), (1, 2)] tested and (1, 2) was chosen as the best param
            'cvec__stop_words': [None],         # [None, "english"] tested and None was chosen as the best param
            'cvec__tokenizer': [stem_words],    # [None, stem_words] tested and stem_words was chosen as the best param
            'gb__learning_rate': [0.1],         # [0.1, 1, 10] tested and 0.1 was chosen as the best param
            'gb__max_depth': [3],               # [None, 1, 2, 3, 4] tested and 3 was chosen as the best param
            'gb__n_estimators': [100]}          # The only estimator

param_xgb = {'cvec__max_df': [0.9],             # [.9, .95] tested and 0.9 was chosen as the best param
            'cvec__max_features': [5000],       # [5_000, 10_000, 20_000, 30_000] tested and 5000 was chosen as the best param 
            'cvec__min_df': [2],                # [2, 3] tested and 2 was chosen as the best param
            'cvec__ngram_range': [(1, 2)],      # [(1, 1), (1, 2)] tested and (1, 2) was chosen as the best param
            'cvec__stop_words': ['english'],    # [None, "english"] tested and 'english' was chosen as the best param
            'cvec__tokenizer': [stem_words],    # [None, stem_words] tested and stem_words was chosen as the best param
            'xgb__learning_rate': [0.1],        # [0.1, 1, 10] tested and 0.1 was chosen as the best param
            'xgb__max_depth': [4],              # [None, 1, 2, 3, 4] tested and 4 was chosen as the best param
            'xgb__n_estimators': [100]}         # The only estimator

param_svc = {'svc__C': [100],                   # [0.1, 1, 10, 100] tested and 100 was chosen as the best param
            'svc__kernel': ['rbf'],             # 'rbf' and 'poly' (with "degree": [2, 3, 4]) tested and 'rbf' was chosen as the best param
            'tvec__max_df': [0.9],              # [.9, .95] tested and 0.95 was chosen as the best param
            'tvec__max_features': [20000],      # [5_000, 10_000, 20_000, 30_000] tested and 30000 was chosen as the best param 
            'tvec__min_df': [2],                # [2, 3] tested and 2 was chosen as the best param
            'tvec__ngram_range': [(1, 2)],      # [(1, 1), (1, 2)] tested and (1, 1) was chosen as the best param
            'tvec__stop_words': [None],         # [None, "english"] tested and 'english' was chosen as the best param
            'tvec__tokenizer': [None]}          # [None, stem_words] tested and None was chosen as the best param

# Make a list of pipes params
pipe_prarams = [param_lr, param_nb, param_rf, param_et, param_gb, param_xgb, param_svc]

# Make a list of models names
names = ["Logistic Regression Model", "Naïve Bayes Model", "Random Forest Model",
         "Extra Trees", "Gradient Boost", "XGBoost Model", "SVM Model"]

In [11]:
for pipe, pgrid, name in zip(pipelines, pipe_prarams, names):
    
    # Instantiate Gread Search
    gs = GridSearchCV(estimator=pipe, param_grid=pgrid, cv=5, n_jobs=4)
    
    # Fit the model
    gs.fit(X_train, y_train)
    
    # Predict y
    preds = gs.predict(X_test)
    
    # Confusion matrix values
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    
    # Define the path where model should be saved
    model_path = f"../models/{name}.pkl"
    
    # Save the model
    with open(model_path, "wb") as pickle_out:
        pickle_out = pickle.dump(gs, pickle_out)
        
    # Evaluate the model
    print(f" {name} ".center(34, "="))
    print(f"Accuracy  ---------- {accuracy_score(y_test, preds)}")
    print(f"Precision  --------- {precision_score(y_test, preds)}")
    print(f"Sensitivity  ------- {recall_score(y_test, preds)}")
    print(f"Specifity  --------- {tn/(tn + fp)}")
    print(f"F1 Score ----------- {f1_score(y_test, preds)}\n")



=== Logistic Regression Model ====
Accuracy  ---------- 0.8960546282245827
Precision  --------- 0.8929159802306426
Sensitivity  ------- 0.8827361563517915
Specifity  --------- 0.9076704545454546
F1 Score ----------- 0.8877968877968878

Accuracy  ---------- 0.9066767830045523
Precision  --------- 0.8966074313408724
Sensitivity  ------- 0.9039087947882736
Specifity  --------- 0.9090909090909091
F1 Score ----------- 0.900243309002433

Accuracy  ---------- 0.8952959028831563
Precision  --------- 0.9146341463414634
Sensitivity  ------- 0.8550488599348535
Specifity  --------- 0.9303977272727273
F1 Score ----------- 0.8838383838383839

Accuracy  ---------- 0.8998482549317147
Precision  --------- 0.9126712328767124
Sensitivity  ------- 0.8680781758957655
Specifity  --------- 0.9275568181818182
F1 Score ----------- 0.8898163606010018

Accuracy  ---------- 0.8983308042488619
Precision  --------- 0.9166666666666666
Sensitivity  ------- 0.8599348534201955
Specifity  --------- 0.9318181818181818
F1



Accuracy  ---------- 0.8930197268588771
Precision  --------- 0.9042735042735043
Sensitivity  ------- 0.8615635179153095
Specifity  --------- 0.9204545454545454
F1 Score ----------- 0.8824020016680567

Accuracy  ---------- 0.9013657056145675
Precision  --------- 0.9172413793103448
Sensitivity  ------- 0.8664495114006515
Specifity  --------- 0.9318181818181818
F1 Score ----------- 0.8911222780569514

