## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, recall_score, precision_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [None]:
# Relative path to the final dataset CSV (resolved against the notebook's working directory)
df_path = "../data/final_dataset.csv"

In [None]:
# Read the CSV, then discard its "Unnamed: 0" column
df = pd.read_csv(df_path)
df = df.drop(columns="Unnamed: 0")

# Quick overview: first rows, column names, row count and column count
print(f"Sample Data :{df.head()}")
print(f"\n--------------------------\n\n Columns : {df.columns.tolist()}")
print(f"\n--------------------------\n\n Size of the dataset : {len(df)}")
print(f"\n--------------------------\n\n Total number of columns : {len(df.columns)}")

In [None]:
# Drop columns that are not used for modeling.
# Reassigning with errors="ignore" (instead of an in-place drop) keeps this cell
# idempotent: re-running it once the columns are gone no longer raises KeyError.
df = df.drop(columns=["author", "des_word_count"], errors="ignore")

In [None]:
# Create label column (target column): GalaxyWatch -> 1, AppleWatch -> 0.
# Encode only while the column is still non-numeric: calling .map() a second
# time on the already-encoded 1/0 values would turn every label into NaN.
label_map = {"GalaxyWatch": 1, "AppleWatch": 0}
if not pd.api.types.is_numeric_dtype(df["subreddit"]):
    df["subreddit"] = df["subreddit"].map(label_map)

## Functions

In [None]:
# cite: Got help from Katie Sylvia
# Stemming helper
def stem_words(text):
    """Return ``text`` with every token replaced by its Porter stem.

    The text is split with ``nltk.word_tokenize``, each token is stemmed, and
    the stems are joined with single spaces. The result is one string, not a
    list of tokens.
    """
    porter = PorterStemmer()
    tokens = nltk.word_tokenize(text)
    return " ".join(porter.stem(token) for token in tokens)

In [None]:
# Grid search over a pipeline estimator
def gridsearching(estimator, param_grid, model_name, transformer_name):
    """Run a 5-fold grid search, print an evaluation report and return the search.

    The data is not passed in: the function reads the notebook-level globals
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``, so the train/test split
    cell must have been executed before this function is called.

    Parameters
    ----------
    estimator : estimator object
        Estimator handed to ``GridSearchCV``.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    model_name : str
        Model name shown in the report header and the plot title.
    transformer_name : str
        Transformer name shown in the report header and the plot title.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can inspect ``best_estimator_`` or
        ``cv_results_`` without refitting. Callers that ignore the return value
        keep working as before.
    """
    print(f" Summary of {model_name} Model with {transformer_name} Transformer Evaluation ".center(115, "="))

    # Instantiate a GridSearch model
    gs = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5, n_jobs=5)

    # Fit the model to training data
    gs.fit(X_train, y_train)

    # Report which params were the best ones
    print(" The Best Params ".center(34, "="))
    print(gs.best_params_)
    print()

    # Report the best cross-validated score
    print(" The Best Score ".center(34, "="))
    print(gs.best_score_)
    print()

    # Train data score
    print(" Train Score ".center(34, "="))
    print(gs.score(X_train, y_train))
    print()

    # Test data score
    print(" Test Score ".center(34, "="))
    print(gs.score(X_test, y_test))

    # Get predictions
    preds = gs.predict(X_test)

    # Only the true-negative and false-positive counts are needed (for specificity)
    tn, fp, _, _ = confusion_matrix(y_test, preds).ravel()

    # Confusion matrix plot
    # NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
    # removed in 1.2 - confirm the environment pins an older release.
    plot_confusion_matrix(gs, X_test, y_test, cmap="Purples", values_format="d")

    plt.title(f"The Confusion Matrix of {model_name} with {transformer_name}")

    # Evaluate the model
    print(" Evaluation Metrics ".center(34, "="))
    print(f"Accuracy  ---------- {accuracy_score(y_test, preds)}")
    print(f"Precision  --------- {precision_score(y_test, preds)}")
    print(f"Sensitivity  ------- {recall_score(y_test, preds)}")
    print(f"Specifity  --------- {tn/(tn + fp)}")

    return gs

In [None]:
# Cite: Got that function from GA 603-lesson_random_forest (Katie Sylvia)
def plot_feature_importance(importance, names, model_type):
    """Draw a horizontal bar chart of the 20 largest feature importances.

    Parameters
    ----------
    importance : array-like
        Importance value for each feature.
    names : array-like
        Feature names, aligned element-wise with ``importance``.
    model_type : str
        Text prepended to " Feature Importance" in the chart title.
    """
    # Pair names with importances and keep the 20 rows with the highest values
    ranked = pd.DataFrame({"feature_names": names, "feature_importance": importance})
    ranked = ranked.sort_values(by=["feature_importance"], ascending=False)
    top_features = ranked.head(20)

    # Define size of bar plot
    plt.figure(figsize=(10, 8))

    # One "Purples" shade per bar, applied in reversed palette order
    bar_colors = list(reversed(sns.color_palette("Purples", len(top_features))))

    # Plot Seaborn bar chart
    sns.barplot(
        x=top_features["feature_importance"],
        y=top_features["feature_names"],
        palette=bar_colors,
    )

    # Add chart labels
    plt.title(model_type + " Feature Importance", fontsize=14)
    plt.xlabel("Feature Importance", fontsize=12)
    plt.ylabel("Feature Names", fontsize=12)

## Modeling

In [None]:
# Features are the post descriptions; the target is the subreddit column
X, y = df["description"], df["subreddit"]

# Stratified split with a fixed seed so the class ratio and the split are reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

print(f"X shape ---------- {X.shape}")
print(f"y shape ---------- {y.shape}")

### Baseline accuracy

In [None]:
# Class proportions in the test set; the largest share is the accuracy a
# majority-class predictor would reach, i.e. the baseline to beat
y_test.value_counts(normalize=True)

**The baseline accuracy is 0.53, i.e. the proportion of the majority class in the test set.**

## Logistic Regression

### Logistic Regression with CountVectorizer

In [None]:
# Two-step pipeline: CountVectorizer features ("cvec") feeding Logistic Regression ("lr")
pipe = Pipeline(steps=[("cvec", CountVectorizer()), ("lr", LogisticRegression())])

In [None]:
# List every parameter of the pipeline; the "<step>__<param>" names shown
# here are the keys a grid-search param_grid has to use
pipe.get_params()

In [None]:
# Set pipeline params by cvec
pipe_params = {
    "cvec__tokenizer": [None, stem_words],
    "cvec__max_features": [5_000, 10_000, 20_000, 30_000],
    "cvec__min_df": [2, 3],
    "cvec__max_df": [.9, .95],
    "cvec__stop_words": [None, "english"],
    "cvec__ngram_range": [(1, 1), (1, 2)],
    "lr__penalty": ["l1", "l2"],
    "lr__C": np.logspace(-3, 0, 1000)
}

In [None]:
# Run the grid search for the CountVectorizer pipeline and print its evaluation report
gridsearching(
    estimator=pipe,
    param_grid=pipe_params,
    model_name="Logistic Regression",
    transformer_name="CountVectorizer",
)

### Logistic Regression with TfidfVectorizer

In [None]:
# Two-step pipeline: TfidfVectorizer features ("tvec") feeding Logistic Regression ("lr")
pipe = Pipeline(steps=[("tvec", TfidfVectorizer()), ("lr", LogisticRegression())])

In [None]:
# List every parameter of the pipeline; the "<step>__<param>" names shown
# here are the keys a grid-search param_grid has to use
pipe.get_params()

In [None]:
# Set pipeline params by cvec
pipe_params = {
    "tvec__tokenizer": [None, stem_words],
    "tvec__max_features": [5_000, 10_000, 20_000, 30_000],
    "tvec__min_df": [2, 3],
    "tvec__max_df": [.9, .95],
    "tvec__stop_words": [None, "english"],
    "cvec__ngram_range": [(1, 1), (1, 2)],
    "lr__penalty": ["l1", "l2"],
    "lr__C": np.logspace(-3, 0, 1000)
}

In [None]:
# Run the grid search for the TfidfVectorizer pipeline and print its evaluation report
gridsearching(
    estimator=pipe,
    param_grid=pipe_params,
    model_name="Logistic Regression",
    transformer_name="TfidVectorizer",
)