## Imports

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, recall_score, precision_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [3]:
# Relative path to the CSV file holding the final dataset
df_path = "../data/final_dataset.csv"

In [4]:
# Read the CSV, then discard its "Unnamed: 0" column
df = pd.read_csv(df_path)
df = df.drop(columns="Unnamed: 0")

# Quick overview: first rows, column names, number of rows, number of columns
print(f"Sample Data :{df.head()}")
print(f"\n--------------------------\n\n Columns : {list(df.columns)}")
print(f"\n--------------------------\n\n Size of the dataset : {len(df)}")
print(f"\n--------------------------\n\n Total number of columns : {len(df.columns)}")

Sample Data :            author                                        description  \
0          TuaAnon  yes, it's an lte watch with data turned on, no...   
1  DemandScary1934  how accurate is the active/total calorie track...   
2         Vinumite  fell very hard blackout drunk and lost watch. ...   
3          Damarou  pls tell me i‘m not the only one who has such ...   
4       ThorNike13  deleted apps in watch app on iphone with weird...   

    subreddit  des_word_count  
0  AppleWatch              48  
1  AppleWatch              10  
2  AppleWatch              59  
3  AppleWatch              15  
4  AppleWatch              15  

--------------------------

 Columns : ['author', 'description', 'subreddit', 'des_word_count']

--------------------------

 Size of the dataset : 5270

--------------------------

 Total number of columns : 4


In [5]:
# Keep only the text feature and the target label.
# Selecting the wanted columns (instead of dropping the unwanted ones in place)
# makes this cell safe to re-run: a second in-place drop would raise KeyError
# because "author" and "des_word_count" would already be gone.
# .copy() gives an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
df = df[["description", "subreddit"]].copy()

In [6]:
# Encode the target column: GalaxyWatch -> 1, AppleWatch -> 0
# NOTE: running this cell twice maps the already-encoded 0/1 values to NaN,
# because they are not keys of the mapping.
label_codes = {"GalaxyWatch": 1, "AppleWatch": 0}
df["subreddit"] = df["subreddit"].map(label_codes)

## Functions

In [7]:
# cite: Got help from Katie Sylvia
def stem_words(text):
    """Return ``text`` with every token replaced by its Porter stem.

    The text is split with ``nltk.word_tokenize``, each token is stemmed with
    a ``PorterStemmer`` created for this call, and the stems are joined back
    together with single spaces.

    Parameters
    ----------
    text : str
        Document to stem.

    Returns
    -------
    str
        The stems separated by single spaces. Note that this is one string,
        not a list of tokens.
    """
    porter = PorterStemmer()
    return " ".join(porter.stem(token) for token in nltk.word_tokenize(text))

In [8]:
# Grid search over a pipeline estimator and report how the best model performs
def gridsearching(estimator, param_grid, model_name, transformer_name, cv=5, n_jobs=5):
    """Tune ``estimator`` with ``GridSearchCV`` and print/plot its evaluation.

    The search is fitted on the notebook-level ``X_train``/``y_train`` and
    evaluated on ``X_test``/``y_test``. These four names are read from the
    global namespace, so the train/test split cell must be run first.

    Parameters
    ----------
    estimator : estimator object
        Forwarded to ``GridSearchCV`` (the notebook passes a ``Pipeline``).
    param_grid : dict
        Forwarded to ``GridSearchCV``.
    model_name : str
        Model label used in the printed headline and the plot title.
    transformer_name : str
        Vectorizer label used in the printed headline and the plot title.
    cv : int, default=5
        Forwarded to ``GridSearchCV`` as ``cv``.
    n_jobs : int, default=5
        Forwarded to ``GridSearchCV`` as ``n_jobs``.

    Returns
    -------
    None
        Results are only printed and plotted.
    """
    print(f" Summary of {model_name} Model with {transformer_name} Transformer Evaluation ".center(115, "="))

    # Instantiate the grid search and fit it to the training data
    gs = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=cv, n_jobs=n_jobs)
    gs.fit(X_train, y_train)

    # Best parameter combination found by the search
    print(" The Best Params ".center(34, "="))
    print(gs.best_params_)
    print()

    # Best score reported by the search
    print(" The Best Score ".center(34, "="))
    print(gs.best_score_)
    print()

    # Score of the fitted search on the training data
    print(" Train Score ".center(34, "="))
    print(gs.score(X_train, y_train))
    print()

    # Score of the fitted search on the held-out test data
    print(" Test Score ".center(34, "="))
    print(gs.score(X_test, y_test))
    print()

    # Test-set predictions, reused for every metric below
    preds = gs.predict(X_test)

    # Unpacking four cells assumes a binary target; only the cells of the
    # negative class are needed (for specificity)
    tn, fp, _, _ = confusion_matrix(y_test, preds).ravel()

    # Confusion matrix plot
    plot_confusion_matrix(gs, X_test, y_test, cmap="Purples", values_format="d")
    plt.title(f"The Confusion Matrix of {model_name} with {transformer_name}")

    # Evaluation metrics on the test set
    print(" Evaluation Metrics ".center(34, "="))
    print(f"Accuracy  ---------- {accuracy_score(y_test, preds)}")
    print(f"Precision  --------- {precision_score(y_test, preds)}")
    print(f"Sensitivity  ------- {recall_score(y_test, preds)}")
    print(f"Specificity  ------- {tn / (tn + fp)}")

## Baseline Score

In [9]:
# Features are the post texts, target is the subreddit label
X, y = df["description"], df["subreddit"]

# Stratified split with a fixed seed; no test_size is given, so the default is used
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

print(f"X shape ---------- {X.shape}")
print(f"y shape ---------- {y.shape}")

X shape ---------- (5270,)
y shape ---------- (5270,)


### Baseline accuracy

In [10]:
# Class proportions in the test split; the largest share is the majority-class baseline
y_test.value_counts(normalize=True)

0    0.534143
1    0.465857
Name: subreddit, dtype: float64

**The baseline accuracy is 0.53: the score obtained by always predicting the majority class (0, AppleWatch).**

## AdaBoost Model

In [13]:
# High-bias base learner: a decision tree limited to depth 1, with a fixed seed
tree = DecisionTreeClassifier(max_depth=1, random_state=111)

### AdaBoost with CountVectorizer

In [14]:
# Pipeline: CountVectorizer features feeding an AdaBoost classifier built on `tree`
pipe = Pipeline(steps=[
    ("cvec", CountVectorizer()),
    ("ada", AdaBoostClassifier(base_estimator=tree, algorithm='SAMME.R')),
])

In [16]:
# List every parameter name exposed by the pipeline (these are the valid grid keys)
pipe.get_params()

{'memory': None,
 'steps': [('cvec', CountVectorizer()),
  ('ada',
   AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
                                                            random_state=111)))],
 'verbose': False,
 'cvec': CountVectorizer(),
 'ada': AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
                                                          random_state=111)),
 'cvec__analyzer': 'word',
 'cvec__binary': False,
 'cvec__decode_error': 'strict',
 'cvec__dtype': numpy.int64,
 'cvec__encoding': 'utf-8',
 'cvec__input': 'content',
 'cvec__lowercase': True,
 'cvec__max_df': 1.0,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': None,
 'cvec__strip_accents': None,
 'cvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvec__tokenizer': None,
 'cvec__vocabulary': None,
 'ada__algorithm': 'SAMME.R',
 'ada__base_estimator__ccp_alpha': 0.0,
 'ada__base_estimator__cla

In [None]:
# Hyper-parameter grid for the CountVectorizer + AdaBoost pipeline.
# stem_words maps a string to a string, so it is searched as the vectorizer's
# `preprocessor`. Passed as `tokenizer` (which must return a sequence of
# tokens), its returned string would be iterated character by character.
pipe_params = {
    "cvec__preprocessor": [None, stem_words],
    "cvec__max_features": [5_000, 10_000, 20_000, 30_000],
    "cvec__min_df": [2, 3],
    "cvec__max_df": [.9, .95],
    "cvec__stop_words": [None, "english"],
    "cvec__ngram_range": [(1, 1), (1, 2)],
    "ada__learning_rate": [0.1, 1, 10],  # the comma after this entry was missing (SyntaxError)
    "ada__n_estimators": [100],
}

In [None]:
# Run the grid search over the parameters defined above and report the results
gridsearching(estimator=pipe, param_grid=pipe_params, model_name="AdaBoost", transformer_name="CountVectorizer")

### AdaBoost with TfidfVectorizer

In [25]:
# Pipeline: TfidfVectorizer features feeding an AdaBoost classifier built on `tree`
pipe = Pipeline(steps=[
    ("tvec", TfidfVectorizer()),
    ("ada", AdaBoostClassifier(base_estimator=tree, algorithm='SAMME.R')),
])

In [26]:
# List every parameter name exposed by the pipeline (these are the valid grid keys)
pipe.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()),
  ('ada',
   AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
                                                            random_state=111)))],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'ada': AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
                                                          random_state=111)),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary'

In [27]:
# Hyper-parameter grid for the TfidfVectorizer + AdaBoost pipeline.
# stem_words maps a string to a string, so it is searched as the vectorizer's
# `preprocessor`. Passed as `tokenizer` (which must return a sequence of
# tokens), its returned string would be iterated character by character.
pipe_params = {
    "tvec__preprocessor": [None, stem_words],
    "tvec__max_features": [5_000, 10_000, 20_000, 30_000],
    "tvec__min_df": [2, 3],
    "tvec__max_df": [.9, .95],
    "tvec__stop_words": [None, "english"],
    "tvec__ngram_range": [(1, 1), (1, 2)],
    "ada__learning_rate": [0.1, 1, 10],
    "ada__n_estimators": [100],
}

In [None]:
# Run the grid search over the parameters defined above and report the results
gridsearching(estimator=pipe, param_grid=pipe_params, model_name="AdaBoost", transformer_name="TfidfVectorizer")

### GradientBoost with CountVectorizer

In [31]:
# Pipeline: CountVectorizer features feeding a seeded GradientBoostingClassifier
pipe = Pipeline(steps=[
    ("cvec", CountVectorizer()),
    ("gb", GradientBoostingClassifier(random_state=111)),
])

In [32]:
# List every parameter name exposed by the pipeline (these are the valid grid keys)
pipe.get_params()

{'memory': None,
 'steps': [('cvec', CountVectorizer()),
  ('gb', GradientBoostingClassifier(random_state=111))],
 'verbose': False,
 'cvec': CountVectorizer(),
 'gb': GradientBoostingClassifier(random_state=111),
 'cvec__analyzer': 'word',
 'cvec__binary': False,
 'cvec__decode_error': 'strict',
 'cvec__dtype': numpy.int64,
 'cvec__encoding': 'utf-8',
 'cvec__input': 'content',
 'cvec__lowercase': True,
 'cvec__max_df': 1.0,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': None,
 'cvec__strip_accents': None,
 'cvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvec__tokenizer': None,
 'cvec__vocabulary': None,
 'gb__ccp_alpha': 0.0,
 'gb__criterion': 'friedman_mse',
 'gb__init': None,
 'gb__learning_rate': 0.1,
 'gb__loss': 'deviance',
 'gb__max_depth': 3,
 'gb__max_features': None,
 'gb__max_leaf_nodes': None,
 'gb__min_impurity_decrease': 0.0,
 'gb__min_samples_leaf': 1,
 'gb__min_samples_split': 2,
 'gb__min_

In [33]:
# Hyper-parameter grid for the CountVectorizer + GradientBoosting pipeline.
# Every vectorizer key must use the "cvec__" prefix: this pipeline has no
# "tvec" step, so a "tvec__..." key is rejected as an invalid parameter.
# stem_words maps a string to a string, so it is searched as the vectorizer's
# `preprocessor`. Passed as `tokenizer` (which must return a sequence of
# tokens), its returned string would be iterated character by character.
pipe_params = {
    "cvec__preprocessor": [None, stem_words],
    "cvec__max_features": [5_000, 10_000, 20_000, 30_000],
    "cvec__min_df": [2, 3],
    "cvec__max_df": [.9, .95],
    "cvec__stop_words": [None, "english"],
    "cvec__ngram_range": [(1, 1), (1, 2)],
    "gb__learning_rate": [0.1, 1, 10],
    "gb__max_depth": [None, 1, 2, 3, 4],
    "gb__n_estimators": [100],
}

In [None]:
# Run the grid search over the parameters defined above and report the results
gridsearching(estimator=pipe, param_grid=pipe_params, model_name="GradientBoost", transformer_name="CountVectorizer")

### GradientBoost with TfidfVectorizer

In [20]:
# Pipeline: TfidfVectorizer features feeding a seeded GradientBoostingClassifier
pipe = Pipeline(steps=[
    ("tvec", TfidfVectorizer()),
    ("gb", GradientBoostingClassifier(random_state=111)),
])

In [21]:
# List every parameter name exposed by the pipeline (these are the valid grid keys)
pipe.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()),
  ('gb', GradientBoostingClassifier(random_state=111))],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'gb': GradientBoostingClassifier(random_state=111),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'gb__ccp_alpha': 0.0,
 'gb__criterion': 'friedman_mse',
 'gb__init': None,
 'gb__learning_rate': 0.1,
 'gb__loss': 'deviance',
 'gb__max_depth': 3,
 'gb__max_features': None,
 'gb__max_leaf_nodes': None

In [24]:
# Set pipeline params by cvec
pipe_params = {
    "tvec__tokenizer": [None, stem_words],
    "tvec__max_features": [5_000, 10_000, 20_000, 30_000],
    "tvec__min_df": [2, 3],
    "tvec__max_df": [.9, .95],
    "tvec__stop_words": [None, "english"],
    "tvec__ngram_range": [(1, 1), (1, 2)],
    "gb__learning_rate": [0.1, 1, 10],
    "gb__max_depth": [None, 1, 2, 3, 4],
    "gb__n_estimators": [100]
}

In [None]:
# Run the grid search over the parameters defined above and report the results
gridsearching(estimator=pipe, param_grid=pipe_params, model_name="GradientBoost", transformer_name="TfidfVectorizer")