In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, \
                            accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
                             GradientBoostingClassifier, AdaBoostClassifier, \
                             VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from nltk.corpus import stopwords

In [2]:
combined = pd.read_csv('../datasets/combined_clean.csv')

In [3]:
combined.head()

Unnamed: 0,is_headphones,post,post_clean
0,0,Right earbud on Tozo NC9s won’t connect They c...,right earbud tozo nc connect connected fine da...
1,0,Choosing The Best Wireless Earbud Headphones I...,choosing best wireless earbud headphone set go...
2,0,Bought a pair of Soundcore Liberty Air 2 and I...,bought pair soundcore liberty air issue right ...
3,0,Desperately looking for waterproof earbuds wit...,desperately waterproof earbuds physical button...
4,0,Mi basic 2 issue The button on the right ear d...,mi basic issue button right ear work turn paus...


In [73]:
X = combined['post']
y = combined['is_headphones']

In [74]:
y.value_counts(normalize = True)

0    0.5
1    0.5
Name: is_headphones, dtype: float64

In [75]:
# Split the data into the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=42)

In [76]:
display(X_train.shape)
display(y_test.shape)

(1472,)

(726,)

In [14]:
# Instantiate a CountVectorizer.
cvec = CountVectorizer()

# Fit the vectorizer on our corpus.
cvec.fit(X_train)

# Transform the corpus.
X_train = cvec.transform(X_train)

In [15]:
X_train.shape

(1472, 7898)

In [91]:
pipe_params = {
    # Setting a limit of n-number of features included/vocab size
    'cvec__max_features': [2000, 4000],

    # Setting a minimum number of times the word/token has to appear in n-documents
    'cvec__min_df':[2, 3, 4],
    
    # Setting an upper threshold/max percentage of n% of documents from corpus 
    'cvec__max_df': [0.2, 0.3, 0.4],
    
    # With stopwords
    'cvec__stop_words': ['english'],
    
    # Testing with unigrams and bigrams
    'cvec__ngram_range':[(1,1), (1,2)],
    
    # Trying different types of regularization
    'lr__penalty':['l2'],

     # Trying different alphas of: 10, 1, 0.1 (C = 1/alpha)
    'lr__C':[0.1, 1, 10]
}

In [92]:
pipe1 = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])



In [97]:
import warnings
warnings.filterwarnings("ignore")
# Instantiate GridSearchCV.

gs = GridSearchCV(pipe1, 
                  pipe_params, # what parameters values are we searching?
                  cv = 3, verbose = 1) # 5-fold cross-validation.
gs.fit(X_train, y_train)


Fitting 3 folds for each of 108 candidates, totalling 324 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('lr', LogisticRegression())]),
             param_grid={'cvec__max_df': [0.2, 0.3, 0.4],
                         'cvec__max_features': [2000, 4000],
                         'cvec__min_df': [2, 3, 4],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'cvec__stop_words': ['english'], 'lr__C': [0.1, 1, 10],
                         'lr__penalty': ['l2']},
             verbose=1)

In [98]:
gs.best_params_

{'cvec__max_df': 0.3,
 'cvec__max_features': 2000,
 'cvec__min_df': 4,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'lr__C': 0.1,
 'lr__penalty': 'l2'}

In [28]:
tuning_list = []
pipe = Pipeline([
            ('cvec', CountVectorizer()),
            ('lr', LogisticRegression())
            ])
pipe.fit(X_train)
#gs = GridSearchCV(pipe, param_grid = {**cvec_params, **lr_params}, cv=5, verbose=1, n_jobs=-1)
#gs.fit(X_train, y_train)

AttributeError: lower not found

In [101]:
# Instantiate vectorizers
vectorizers = {'cvec': CountVectorizer(),
               'tvec': TfidfVectorizer()}

# Instiantiate models
models = {'lr': LogisticRegression(max_iter = 1_000, random_state = 42),
          'rf': RandomForestClassifier(random_state = 42),
          'gb': GradientBoostingClassifier(random_state = 42),
          'et': ExtraTreesClassifier(random_state = 42),
          'ada': AdaBoostClassifier(random_state = 42),
          'nb': MultinomialNB(),
          'svc': SVC(random_state = 42)}

stop_words = stopwords.words('english')
extra_words = ['like', 'one', 'x', 'would', 'get', 'really', 'use', 'https', 'good', 'also', 'better', 'know', 
              'new', 'looking', 'removed', 'jpg', 'width', 'format', 'pjpg', 'xx', 'www', 'reddit', 'redd', 
              'preview', 'webp', 'view', 'poll', 'com', 'png']
stop_words.extend(extra_words)

In [105]:
cvec_params = {
    # Setting a limit of n-number of features included/vocab size
    'cvec__max_features': [None, 12_000],

    # Setting a minimum number of times the word/token has to appear in n-documents
    'cvec__min_df':[2, 3, 4],
    
    # Setting an upper threshold/max percentage of n% of documents from corpus 
    'cvec__max_df': [0.2, 0.3, 0.4],
    
    # With stopwords
    'cvec__stop_words': [stop_words],
    
    # Testing with unigrams and bigrams
    'cvec__ngram_range':[(1,1), (1,2)]
}

tvec_params = {
    'tvec__max_features': [None],
    'tvec__min_df':[3, 4, 5],
    'tvec__max_df': [0.2, 0.3, 0.4],
    'tvec__stop_words': ['english'],
    'tvec__ngram_range':[(1,1), (1,2)]
}

lr_params = {
    # Trying different types of regularization
    #'lr__penalty':['l2'],

     # Trying different alphas of: 10, 1, 0.1 (C = 1/alpha)
    'lr__C':[0.1, 1, 10]
}

nb_params = {
    'nb__fit_prior': [True, False],
    'nb__alpha': [0, 0.4, 0.8]
}

svc_params = {
    'svc__C':[0.1, 1, 10],
    'svc__gamma':[0.01, 0.1, 0.3], 
    'svc__kernel':['linear','rbf']
}

In [106]:
# Function to run model -- input vectorizer and model
def run_model(vec, mod, vec_params={}, mod_params={}, grid_search=False):
    
    results = {}
    
    pipe = Pipeline([
            (vec, vectorizers[vec]),
            (mod, models[mod])
            ])
    
    if grid_search:
        gs = GridSearchCV(pipe, param_grid = {**vec_params, **mod_params}, cv=5, verbose=1, n_jobs=-1)
        gs.fit(X_train, y_train)
        pipe = gs
        
    else:
        pipe.fit(X_train, y_train)
    
    # Retrieve metrics
    results['model'] = mod
    results['vectorizer'] = vec
    results['train'] = pipe.score(X_train, y_train)
    results['test'] = pipe.score(X_test, y_test)
    predictions = pipe.predict(X_test)
    results['roc'] = roc_auc_score(y_test, predictions)
    results['precision'] = precision_score(y_test, predictions)
    results['recall'] = recall_score(y_test, predictions)
    results['f_score'] = f1_score(y_test, predictions)
    
    if grid_search:
        tuning_list.append(results)
        print('### BEST PARAMS ###')
        display(pipe.best_params_)
        
    else:
        eval_list.append(results)
    
    print('### METRICS ###')
    display(results)
    
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}")
    
    return pipe

In [107]:
cvec_lr_gs = run_model('cvec', 'lr', vec_params=cvec_params, mod_params=lr_params, grid_search=True)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
### BEST PARAMS ###


{'cvec__max_df': 0.4,
 'cvec__max_features': None,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',

### METRICS ###


{'model': 'lr',
 'vectorizer': 'cvec',
 'train': 0.9952445652173914,
 'test': 0.8898071625344353,
 'roc': 0.8898071625344351,
 'precision': 0.8546365914786967,
 'recall': 0.9393939393939394,
 'f_score': 0.8950131233595799}

True Negatives: 305
False Positives: 58
False Negatives: 22
True Positives: 341
