In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

In [2]:
# Load the cleaned post data from the project's data folder and preview it.
snowski = pd.read_csv(filepath_or_buffer='../data/Clean/snow_ski2.csv')
snowski.head(n=5)

Unnamed: 0,created_utc,author,score,upvote_ratio,num_comments,subreddit,text,lem_text,stem_text,post_length,post_word_count
0,1686844000.0,bas1cred,26,0.91,19,snowboardingnoobs,my first board. ready for the upcoming season!!,my first board ready for the upcoming season,my first board readi for the upcom season,48,8
1,1686786000.0,Pooffios,3,1.0,12,snowboardingnoobs,"outer side foot pain hi all, i was hoping to g...",outer side foot pain hi all i wa hoping to get...,outer side foot pain hi all i wa hope to get s...,301,60
2,1686781000.0,trips69420,2,0.75,15,snowboardingnoobs,first board? looking to probably grab this as ...,first board looking to probably grab this a my...,first board look to probabl grab thi as my fir...,274,53
3,1686779000.0,twinbee,11,0.87,0,snowboardingnoobs,i did a front-side 180 today! more than one in...,i did a front side 180 today more than one in ...,i did a front side 180 today more than one in ...,1565,289
4,1686767000.0,Madden_Stephen,2,0.67,24,snowboardingnoobs,step-on boot recommendation? i’ve done a decen...,step on boot recommendation i ve done a decent...,step on boot recommend i ve done a decent bit ...,686,122


In [3]:
# Share of posts per subreddit (fractions, not raw counts).
snowski['subreddit'].value_counts(normalize=True)

skiing               0.51242
snowboardingnoobs    0.48758
Name: subreddit, dtype: float64

In [4]:
# Encode the target as integers: 1 = snowboardingnoobs, 0 = skiing.
#
# Series.map turns every value that is missing from the dict into NaN, so
# running this cell a second time on the already-encoded column would wipe
# out all labels. The dtype guard makes the cell safe to re-execute.
subreddit_codes = {'snowboardingnoobs': 1, 'skiing': 0}
if not pd.api.types.is_numeric_dtype(snowski['subreddit']):
    snowski['subreddit'] = snowski['subreddit'].map(subreddit_codes)

# Fail loudly here rather than later inside model fitting if a subreddit
# outside the mapping (or a missing value) is present.
assert snowski['subreddit'].notna().all(), 'unmapped subreddit label found'
snowski.head()

Unnamed: 0,created_utc,author,score,upvote_ratio,num_comments,subreddit,text,lem_text,stem_text,post_length,post_word_count
0,1686844000.0,bas1cred,26,0.91,19,1,my first board. ready for the upcoming season!!,my first board ready for the upcoming season,my first board readi for the upcom season,48,8
1,1686786000.0,Pooffios,3,1.0,12,1,"outer side foot pain hi all, i was hoping to g...",outer side foot pain hi all i wa hoping to get...,outer side foot pain hi all i wa hope to get s...,301,60
2,1686781000.0,trips69420,2,0.75,15,1,first board? looking to probably grab this as ...,first board looking to probably grab this a my...,first board look to probabl grab thi as my fir...,274,53
3,1686779000.0,twinbee,11,0.87,0,1,i did a front-side 180 today! more than one in...,i did a front side 180 today more than one in ...,i did a front side 180 today more than one in ...,1565,289
4,1686767000.0,Madden_Stephen,2,0.67,24,1,step-on boot recommendation? i’ve done a decen...,step on boot recommendation i ve done a decent...,step on boot recommend i ve done a decent bit ...,686,122


In [5]:
# Function for Lemmatizing
def lemmatize_txt(text):
    """Lemmatize every word token in ``text`` and rejoin the tokens with spaces.

    Parameters
    ----------
    text : str
        Document text to process.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces. Characters that are
        not matched by the word-character pattern (punctuation, whitespace)
        are dropped by the tokenizer.
    """
    # Raw string: '\w' inside a normal literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    tokenizer = RegexpTokenizer(r'\w+')
    split_txt = tokenizer.tokenize(text)

    # Instantiate lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Lemmatize and rejoin. No part-of-speech tag is passed, so the
    # lemmatizer's default applies to every token.
    return ' '.join(lemmatizer.lemmatize(word) for word in split_txt)

In [6]:
# Function for Stemming
def stem_txt(text):
    """Porter-stem every word token in ``text`` and rejoin the tokens with spaces.

    Parameters
    ----------
    text : str
        Document text to process.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces. Characters that are not
        matched by the word-character pattern (punctuation, whitespace) are
        dropped by the tokenizer.
    """
    # Raw string: '\w' inside a normal literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    tokenizer = RegexpTokenizer(r'\w+')
    split_txt = tokenizer.tokenize(text)

    # Instantiate stemmer
    p_stemmer = PorterStemmer()

    # Stem and rejoin
    return ' '.join(p_stemmer.stem(word) for word in split_txt)

In [7]:
# Features are the post text; the target is the encoded subreddit column.
X, y = snowski['text'], snowski['subreddit']

In [8]:
# Hold out a quarter of the rows for testing (scikit-learn's default
# test_size, spelled out here); the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1527,  # Jokic and Jamal
)

In [9]:
# Sanity-check the split sizes: train/test features, then train/test targets.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(4227,)
(1409,)
(4227,)
(1409,)


In [22]:
# Build Models via Pipelines

In [12]:
# Two-step pipeline: a vectorizer slot followed by logistic regression.
# The 'vec' step is left empty (None) so a concrete vectorizer can be
# swapped in through the parameter grid.
pipe_log = Pipeline(
    steps=[
        ('vec', None),
        ('logr', LogisticRegression(solver='liblinear')),
    ]
)

In [15]:
# List every tunable parameter name of the pipeline (including nested ones).
# NOTE(review): the saved output for this cell reports the default 'lbfgs'
# solver, which does not match the liblinear pipeline defined above —
# re-run after a kernel restart to refresh it.
pipe_log.get_params(deep=True)

{'memory': None,
 'steps': [('vec', None), ('logr', LogisticRegression())],
 'verbose': False,
 'vec': None,
 'logr': LogisticRegression(),
 'logr__C': 1.0,
 'logr__class_weight': None,
 'logr__dual': False,
 'logr__fit_intercept': True,
 'logr__intercept_scaling': 1,
 'logr__l1_ratio': None,
 'logr__max_iter': 100,
 'logr__multi_class': 'auto',
 'logr__n_jobs': None,
 'logr__penalty': 'l2',
 'logr__random_state': None,
 'logr__solver': 'lbfgs',
 'logr__tol': 0.0001,
 'logr__verbose': 0,
 'logr__warm_start': False}

In [18]:
# Inspect the parameter names/defaults exposed by a CountVectorizer step.
cvec_params = Pipeline(steps=[('cvec', CountVectorizer())])
cvec_params.get_params(deep=True)

{'memory': None,
 'steps': [('cvec', CountVectorizer())],
 'verbose': False,
 'cvec': CountVectorizer(),
 'cvec__analyzer': 'word',
 'cvec__binary': False,
 'cvec__decode_error': 'strict',
 'cvec__dtype': numpy.int64,
 'cvec__encoding': 'utf-8',
 'cvec__input': 'content',
 'cvec__lowercase': True,
 'cvec__max_df': 1.0,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': None,
 'cvec__strip_accents': None,
 'cvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvec__tokenizer': None,
 'cvec__vocabulary': None}

In [20]:
# Inspect the parameter names/defaults exposed by a TfidfVectorizer step.
tvec_params = Pipeline(steps=[('tvec', TfidfVectorizer())])
tvec_params.get_params(deep=True)

{'memory': None,
 'steps': [('tvec', TfidfVectorizer())],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None}

In [22]:
# Parameter grid for GridSearchCV over `pipe_log`.
#
# Nested parameter keys must be prefixed with the *pipeline step name*
# ('vec' / 'logr'), not with an arbitrary alias such as 'cvec' or 'tvec';
# otherwise set_params rejects them as invalid parameters.

# Vectorizer options shared by both vectorizer types.
# NOTE(review): a custom preprocessor replaces the vectorizer's built-in
# preprocessing step (lowercasing / accent stripping) — confirm the text is
# already normalized. scikit-learn may also warn that the 'english' stop
# list is inconsistent with stemmed/lemmatized tokens.
vec_grid = {
    'vec__stop_words': [None, 'english'],
    'vec__max_features': [100, 200, 300],
    'vec__preprocessor': [None, lemmatize_txt, stem_txt],
}

# Logistic-regression options. Each C candidate must be a scalar, so the
# linspace values are listed directly instead of wrapping the whole array
# in a one-element list.
logr_grid = {
    'logr__C': np.linspace(.01, 3, 5).tolist(),
    'logr__penalty': ['l1', 'l2'],
}

# Every sub-grid supplies a concrete vectorizer for the 'vec' slot; a grid
# without one would leave the slot empty and feed raw strings to the
# classifier. Each vectorizer type is crossed with all vectorizer and
# classifier options (2 * 18 * 10 = 360 candidates).
pgrid = [
    {'vec': [CountVectorizer()], **vec_grid, **logr_grid},
    {'vec': [TfidfVectorizer()], **vec_grid, **logr_grid},
]

In [None]:
%%time
gs = GridSearchCV(pipe_log, pgrid, cv=5)
gs.fit(X_train, y_train)

In [None]:
# Score the refit best estimator on both splits, then show the winning
# parameter combination.
for label, features, target in (
    ('Train:', X_train, y_train),
    ('Test:', X_test, y_test),
):
    print(label, gs.score(features, target))
print('Best Params: ', gs.best_params_)