In [1]:
import pandas as pd
import regex as re
import nltk


from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

pd.set_option('max_colwidth', 100000)

In [2]:
www = pd.read_csv('./www_clean.csv')
nomil = pd.read_csv('./nomil_clean.csv')

In [3]:
result = pd.merge(www, nomil,how='outer')

In [4]:
result.count()

selftext     1900
subreddit    1900
dtype: int64

def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].str.replace(r"\(https?:\/\/.*[\r\n]*", " ")
    df[text_field] = df[text_field].str.replace(r"[^\w\s]", " ")
    df[text_field] = df[text_field].str.replace(r"[\n]", " ")
    df[text_field] = df[text_field].str.replace(r"[\d+]", " ")
    #df[text_field] = df[text_field].str.replace(r"http\S+", "")
    #df[text_field] = df[text_field].str.replace(r"http", "")
    #df[text_field] = df[text_field].str.replace(r"@\S+", "")
    #df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    #df[text_field] = df[text_field].str.replace(r"@", "at")
    return df

In [5]:
def clean_text(text):
    # remove HTML tags and URLs
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*','',text)
    
    # keep only text without punctuation
    text = re.sub(r'[^\w\s]','',text)
    
    # convert text to lowercase
    text = text.strip().lower()

    # split text into a list of words
    token_text = re.split('\W+',text) #W+ --> word chars and dashes permitted
    
    return token_text

In [6]:
#result = clean_text(result['selftext'])
#result.selftext.head(1)

result['token'] = result['selftext'].apply(lambda x: clean_text(x))
result.head(1)

Unnamed: 0,selftext,subreddit,token
0,"What's your favorite feat of all time in all of fiction, ***and why***?\n\nJumping-off point aside, new Featured Character/Team sign-ups should be coming soon (this week easily), any other pieces of important business people are dying to ask about, feel free to do so here!\n\nBeyond that, business as usual, use this thread to discuss and-and-everything!!",whowouldwin,"[whats, your, favorite, feat, of, all, time, in, all, of, fiction, and, why, jumpingoff, point, aside, new, featured, characterteam, signups, should, be, coming, soon, this, week, easily, any, other, pieces, of, important, business, people, are, dying, to, ask, about, feel, free, to, do, so, here, beyond, that, business, as, usual, use, this, thread, to, discuss, andandeverything]"


In [7]:
stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
result['token_stop'] = result['token'].apply(lambda x: [item for item in x if item not in stop])
result.head(1)

Unnamed: 0,selftext,subreddit,token,token_stop
0,"What's your favorite feat of all time in all of fiction, ***and why***?\n\nJumping-off point aside, new Featured Character/Team sign-ups should be coming soon (this week easily), any other pieces of important business people are dying to ask about, feel free to do so here!\n\nBeyond that, business as usual, use this thread to discuss and-and-everything!!",whowouldwin,"[whats, your, favorite, feat, of, all, time, in, all, of, fiction, and, why, jumpingoff, point, aside, new, featured, characterteam, signups, should, be, coming, soon, this, week, easily, any, other, pieces, of, important, business, people, are, dying, to, ask, about, feel, free, to, do, so, here, beyond, that, business, as, usual, use, this, thread, to, discuss, andandeverything]","[whats, favorite, feat, time, fiction, jumpingoff, point, aside, new, featured, characterteam, signups, coming, soon, week, easily, pieces, important, business, people, dying, ask, feel, free, beyond, business, usual, use, thread, discuss, andandeverything]"


In [9]:
result.columns
feature = ['token_stop', 'subreddit']
result = result[feature]
result.head(3)

Unnamed: 0,token_stop,subreddit
0,"[whats, favorite, feat, time, fiction, jumpingoff, point, aside, new, featured, characterteam, signups, coming, soon, week, easily, pieces, important, business, people, dying, ask, feel, free, beyond, business, usual, use, thread, discuss, andandeverything]",whowouldwin
1,"[read, whole, post, signing, every, week, subreddit, features, character, every, two, weeks, feature, team, posts, written, users, charactersteams, feel, like, shown, love, subreddit, posts, question, sort, like, shorter, respect, threads, sign, post, signups, july, september, session, stay, around, week, give, take, therefore, signups, close, 1159, pm, est, monday, june, 17th, give, take, sign, easy, worries, post, listing, either, character, team, youre, planning, point, signups, close, send, us, proof, concept, modmail, proof, concept, anything, draft, thread, already, done, respect, thread, imgur, album, feats, long, know, something, ready, going, vetting, past, post, doesnt, meet, standards, info, rules, section, work, get, ready, submission, ...]",whowouldwin
2,"[latest, 616, rebirth, versions, unless, stated, otherwise, arent, 100, evil, aligned, villains, scenario, one, stated, title, side, wins, rounds, overall, scenario, two, ladder, mode, whoever, wins, given, round, allowed, pass, next, rung, help, character, universe, get, 24, hours, rest, rounds, prep, unless, round, theyre, entering, specifically, allows, prep, time, ie, theyll, recuperate, hospital, setting, whatever, cosmic, equivalent, without, knowing, theyre, fight, change, okay, matches, match, one, green, goblin, vs, joker, 120, hours, psychologically, break, opponents, main, enemy, batman, spiderman, respectively, may, kill, anybody, main, target, superheroes, help, bruce, peter, match, two, kingpin, vs, penguin, kingpin, placed, gotham, penguin, placed, new, ...]",whowouldwin


In [10]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    lemma_words = [lemmatizer.lemmatize(word) for word in text]
    return lemma_words

In [11]:
result['token_stop1'] = result['token_stop'].apply(lambda x: lemmatize_words(x))

In [12]:
result.head(1)

Unnamed: 0,token_stop,subreddit,token_stop1
0,"[whats, favorite, feat, time, fiction, jumpingoff, point, aside, new, featured, characterteam, signups, coming, soon, week, easily, pieces, important, business, people, dying, ask, feel, free, beyond, business, usual, use, thread, discuss, andandeverything]",whowouldwin,"[whats, favorite, feat, time, fiction, jumpingoff, point, aside, new, featured, characterteam, signups, coming, soon, week, easily, piece, important, business, people, dying, ask, feel, free, beyond, business, usual, use, thread, discus, andandeverything]"


In [13]:
result['token_stop1'] = result['token_stop1'].apply(lambda x: ",".join(x))

In [14]:
result.head(1)

Unnamed: 0,token_stop,subreddit,token_stop1
0,"[whats, favorite, feat, time, fiction, jumpingoff, point, aside, new, featured, characterteam, signups, coming, soon, week, easily, pieces, important, business, people, dying, ask, feel, free, beyond, business, usual, use, thread, discuss, andandeverything]",whowouldwin,"whats,favorite,feat,time,fiction,jumpingoff,point,aside,new,featured,characterteam,signups,coming,soon,week,easily,piece,important,business,people,dying,ask,feel,free,beyond,business,usual,use,thread,discus,andandeverything"


In [15]:
feature = ['token_stop1', 'subreddit']
result = result[feature]
result.head(1)

Unnamed: 0,token_stop1,subreddit
0,"whats,favorite,feat,time,fiction,jumpingoff,point,aside,new,featured,characterteam,signups,coming,soon,week,easily,piece,important,business,people,dying,ask,feel,free,beyond,business,usual,use,thread,discus,andandeverything",whowouldwin


In [16]:
#Baseline
result['subreddit'].value_counts(normalize=True)

JUSTNOMIL      0.5
whowouldwin    0.5
Name: subreddit, dtype: float64

In [17]:
#Assignment of target
X = result['token_stop1']
y = result['subreddit']

#Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
#Creation of pipeline
pipe = Pipeline([('cv', CountVectorizer()),
                 ('tfidf',TfidfTransformer()),
                 ('lr', LogisticRegression())])
#lr = LogisticRegression()
# Evaluate how your model will perform on unseen data
#cross_val_score(lr, X_train, y_train, cv=3)

# Fit your model
pipe.fit(X_train, y_train)

# Training score
print(pipe.score(X_train, y_train))

# Test score
print(pipe.score(X_test, y_test))


In [None]:
pipe_params = {
    'cv__max_features': [2500, 3000, 3500],
    'cv__min_df': [2, 3],
    'cv__max_df': [.9, .95],
    'cv__ngram_range': [(1,1), (1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [18]:
## Multinomial Naive Bayes model

# set up pipeline

# initialize


pipe = Pipeline([('cv', CountVectorizer()),
                 ('mnb', MultinomialNB())])

In [19]:
# baseline MultinomialNB model

pipe.fit(X_train, y_train)
print('train score:', pipe.score(X_train, y_train))
print('test score:', pipe.score(X_test, y_test))

train score: 0.9929824561403509
test score: 0.9873684210526316


In [23]:
%%time

# gridsearchCV tests cross-validation for the parameters

params = { 
#     'cvec__ngram_range': [(1, 1), (1, 2)]
    'cv__max_features': [2500, 3000, 3500], 
    'cv__ngram_range': [(1, 1),(1,2)], 
    'mnb__alpha': [0, 0.25, 0.5, 0.75, 1]
    # min_df
    # max_df
}
gs = GridSearchCV(pipe, param_grid=params, cv=3, verbose=1)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   46.4s finished


0.983859649122807
{'cv__max_features': 3500, 'cv__ngram_range': (1, 2), 'mnb__alpha': 1}
CPU times: user 46.8 s, sys: 557 ms, total: 47.4 s
Wall time: 47.4 s
