# Time to model!

In [1]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier

In [3]:
import warnings
# Suppress every FutureWarning for the rest of the session. This also hides
# FutureWarning-category deprecation notices emitted by the libraries used below.
warnings.simplefilter(action='ignore', category=FutureWarning)

**Read in the combined Nike and Adidas csv to begin modeling**

In [3]:
# Load the combined Nike/Adidas posts.
# NOTE(review): the path has no separator between 'data' and the file name — confirm it is intended.
df = pd.read_csv(filepath_or_buffer='../dataNike_Adidas_combined.csv')

In [4]:
# Preview the first five rows of the loaded data
df.head(5)

Unnamed: 0.1,Unnamed: 0,comments,created,subscribers,text,title,url,subreddit
0,0,1,2407,11031,Please note that any product from a previous s...,"tip for identifi nike product nike ids, 🔥or🗑? ...",https://www.reddit.com/r/Nike/comments/47fex4/...,1
1,1,6,2554,11031,0,"tip for identifi nike product nike ids, 🔥or🗑? ...",https://i.redd.it/kop0xjccq1521.jpg,1
2,2,0,2554,11031,0,"tip for identifi nike product nike ids, 🔥or🗑? ...",https://i.redd.it/j4oaab0za2521.jpg,1
3,3,7,2554,11031,0,"tip for identifi nike product nike ids, 🔥or🗑? ...",https://i.redd.it/2bk8fp7rnw421.jpg,1
4,4,0,2554,11031,0,"tip for identifi nike product nike ids, 🔥or🗑? ...",https://i.redd.it/6kqs52x3xy421.jpg,1


In [5]:
# Drop the leftover 'Unnamed: 0' column.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Confirm the 'Unnamed: 0' column is no longer present
df.head(n=5)

Unnamed: 0,comments,created,subscribers,text,title,url,subreddit
0,1,2407,11031,Please note that any product from a previous s...,"tip for identifi nike product nike ids, 🔥or🗑? ...",https://www.reddit.com/r/Nike/comments/47fex4/...,1
1,6,2554,11031,0,"tip for identifi nike product nike ids, 🔥or🗑? ...",https://i.redd.it/kop0xjccq1521.jpg,1
2,0,2554,11031,0,"tip for identifi nike product nike ids, 🔥or🗑? ...",https://i.redd.it/j4oaab0za2521.jpg,1
3,7,2554,11031,0,"tip for identifi nike product nike ids, 🔥or🗑? ...",https://i.redd.it/2bk8fp7rnw421.jpg,1
4,0,2554,11031,0,"tip for identifi nike product nike ids, 🔥or🗑? ...",https://i.redd.it/6kqs52x3xy421.jpg,1


## Set X and y

In [7]:
X = df['title']        # feature: post title text
y = df['subreddit']    # target: subreddit label

# Sanity check: a text classifier cannot learn anything if every row carries the
# same title. Warn (rather than fail) so the rest of the notebook still runs.
# NOTE(review): the previews of `df` show the same title prefix on every row —
# verify the upstream step that builds the 'title' column.
if X.nunique() <= 1:
    warnings.warn(
        "Column 'title' has at most one distinct value; every model will score at the baseline.",
        UserWarning,
    )

## Baseline Score

**The baseline score is the accuracy obtained by always predicting the majority class. Since the majority class is Nike (`1`), the baseline accuracy is about `.502`.**

In [8]:
# Class proportions, largest first; the top value is the majority-class baseline
y.value_counts(normalize=True, sort=True, ascending=False)

1    0.501758
0    0.498242
Name: subreddit, dtype: float64

## Train Test Split

In [9]:
# Stratified 75/25 split (test_size=0.25 is the library default, spelled out here)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [65]:
# Inspect one training title by position. A label lookup such as X_train[1800]
# raises KeyError whenever that row ends up in the test split.
# Only the first 500 characters are shown to keep the cell output readable.
str(X_train.iloc[0])[:500]

'tip for identifi nike product nike ids, 🔥or🗑? 3D print jordan 1s by me hardshel backpack / drone camera bag info please!! just bought thi at goodwil and look for more info if available. anoth redditor ask me too find the name of thi tracksuit, do ani of you guy have ani idea what the name might be? can I get an ID on these? jordan concord 11 doe anyon know how legit thi jacket is? doe anyon know thi style of sweatshirt? can’t find it anywher anyon know whi nikeid wont let me put iniesta on the back of a shirt? are these still somewhat available? nike airmax 270 volt $56 J3 black cements. good deal? saw these at the shop today. never seen them before. anyon know what theyr called? how long doe it usual take to get your money back when returning? look at thi rad shoe I made ben simmons\' nike hyperdunk x PE got these for my bday. anyon know ehat colorway these are? can someon ID thi vintag nike hoodie? 123 af1 vs air max 1 vs aj1? full nike utilitarian set up what shoe do they have at t

## CountVectorize our data

In [10]:
# Bag-of-words vectorizer for the post titles
cv = CountVectorizer(
    stop_words='english',   # remove scikit-learn's built-in English stop words
    lowercase=True,         # lowercase the text before tokenizing
    min_df=5,               # keep only terms that appear in at least 5 documents
    ngram_range=(1, 4),     # unigrams through 4-grams
)

In [19]:
# Learn the vocabulary from the training titles and encode them
cv_train = cv.fit_transform(X_train)
# Encode the test titles with that same vocabulary (transform only, no refit)
cv_test = cv.transform(X_test)

In [22]:
# Create cv_train dataframe
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
cv_feature_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
cv_train_df = pd.DataFrame(cv_train.toarray(), columns=cv_feature_names())
cv_train_df.head()

Unnamed: 0,00,00 adida,00 adida origin,00 adida origin instagram,00 tax,00 tax includ,00 tax includ help,00 thoughts,00 thoughts way,00 thoughts way purpl,...,zx 500 rm boost,zx 500 rm size,zx flux,zx flux some1,zx flux some1 help,zx flux xeno,zx flux xeno worn,zx need,zx need help,zx need help id
0,3,1,1,1,1,1,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1
1,3,1,1,1,1,1,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1
2,3,1,1,1,1,1,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1
3,3,1,1,1,1,1,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1
4,3,1,1,1,1,1,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1


In [23]:
# Create cv_test dataframe
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
cv_feature_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
cv_test_df = pd.DataFrame(cv_test.toarray(), columns=cv_feature_names())
cv_test_df.head()

Unnamed: 0,00,00 adida,00 adida origin,00 adida origin instagram,00 tax,00 tax includ,00 tax includ help,00 thoughts,00 thoughts way,00 thoughts way purpl,...,zx 500 rm boost,zx 500 rm size,zx flux,zx flux some1,zx flux some1 help,zx flux xeno,zx flux xeno worn,zx need,zx need help,zx need help id
0,3,1,1,1,1,1,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1
1,3,1,1,1,1,1,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1
2,3,1,1,1,1,1,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1
3,3,1,1,1,1,1,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1
4,3,1,1,1,1,1,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1


In [24]:
# (rows, vocabulary size) for the train and test count matrices
for count_matrix in (cv_train, cv_test):
    print(count_matrix.shape)

(1493, 32542)
(498, 32542)


### Let's try different models and then optimize the one that works best

### Logistic Regression Model

In [31]:
# L1-penalized logistic regression on the count features.
# The solver is named explicitly: 'liblinear' supports the L1 penalty, whereas
# scikit-learn versions whose default solver is 'lbfgs' reject penalty='l1'.
lr = LogisticRegression(penalty='l1', solver='liblinear', tol=.0001, C=1, random_state=42)

lr.fit(cv_train, y_train)

# Accuracy on the training split, blank line, accuracy on the test split
print(lr.score(cv_train, y_train))
print('')
print(lr.score(cv_test, y_test))

0.5016744809109176

0.5020080321285141


In [61]:
# Coefficients of the fitted logistic regression, smallest to largest
pd.Series(lr.coef_[0]).sort_values(ascending=True)

0        0.000000e+00
21701    0.000000e+00
21700    0.000000e+00
21699    0.000000e+00
21698    0.000000e+00
21697    0.000000e+00
21696    0.000000e+00
21695    0.000000e+00
21694    0.000000e+00
21693    0.000000e+00
21692    0.000000e+00
21691    0.000000e+00
21690    0.000000e+00
21689    0.000000e+00
21688    0.000000e+00
21687    0.000000e+00
21686    0.000000e+00
21685    0.000000e+00
21684    0.000000e+00
21683    0.000000e+00
21682    0.000000e+00
21681    0.000000e+00
21680    0.000000e+00
21679    0.000000e+00
21678    0.000000e+00
21677    0.000000e+00
21676    0.000000e+00
21675    0.000000e+00
21702    0.000000e+00
21703    0.000000e+00
             ...     
10854    0.000000e+00
10853    0.000000e+00
10852    0.000000e+00
10851    0.000000e+00
10837    0.000000e+00
10850    0.000000e+00
10860    0.000000e+00
10848    0.000000e+00
10849    0.000000e+00
10847    0.000000e+00
10838    0.000000e+00
10839    0.000000e+00
10840    0.000000e+00
10842    0.000000e+00
10843    0

### Multinomial Naive Bayes Model

In [32]:
# Multinomial Naive Bayes on the count features
mnb = MultinomialNB()
mnb.fit(cv_train, y_train)

# Accuracy on the training split, blank line, accuracy on the test split
mnb_train_acc = mnb.score(cv_train, y_train)
mnb_test_acc = mnb.score(cv_test, y_test)
print(mnb_train_acc, '', mnb_test_acc, sep='\n')

0.5016744809109176

0.5020080321285141


### Decision Tree Model

In [39]:
# Decision tree on the count features.
# min_impurity_decrease was 2, but Gini impurity for two classes never exceeds 0.5,
# so no split could ever satisfy it and the tree stayed a single majority-class leaf.
# 0.0 (the library default) lets the tree split; tune it upward to regularize.
dt = DecisionTreeClassifier(criterion='gini', random_state=42, min_impurity_decrease=0.0, max_depth=100)

dt.fit(cv_train, y_train)

# Accuracy on the training split, blank line, accuracy on the test split
print(dt.score(cv_train, y_train))
print('')
print(dt.score(cv_test, y_test))

0.5016744809109176

0.5020080321285141


### Bagged Tree Model

In [40]:
# Bagged decision trees on the count features.
# The base estimator is passed positionally because the `base_estimator` keyword was
# renamed to `estimator` in newer scikit-learn; the positional form works with both.
# random_state makes the bootstrap samples (and therefore the scores) reproducible.
bag = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, random_state=42)

bag.fit(cv_train, y_train)

# Accuracy on the training split, blank line, accuracy on the test split
print(bag.score(cv_train, y_train))
print('')
print(bag.score(cv_test, y_test))

0.5016744809109176

0.5020080321285141


### Extra Trees Model

In [41]:
# Extra-trees ensemble on the count features.
# min_impurity_decrease was 2, but entropy for two classes never exceeds 1.0,
# so no tree could ever split. 0.0 (the library default) lets the trees grow;
# tune it upward to regularize.
et = ExtraTreesClassifier(max_depth=100, min_impurity_decrease=0.0,
                          random_state=42, criterion='entropy', n_estimators=100)

et.fit(cv_train, y_train)

# Accuracy on the training split, blank line, accuracy on the test split
print(et.score(cv_train, y_train))
print('')
print(et.score(cv_test, y_test))

0.5016744809109176

0.5020080321285141


### Let's see how these models compare to a Random Forest Model

### Random Forest Model

In [42]:
# Random forest on the count features.
# min_impurity_decrease was 1, but Gini impurity for two classes never exceeds 0.5,
# so no tree could ever split. 0.0 (the library default) lets the trees grow;
# tune it upward to regularize.
rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=75, min_impurity_decrease=0.0,
                           random_state=42)
rf.fit(cv_train, y_train)

# Accuracy on the training split, blank line, accuracy on the test split
print(rf.score(cv_train, y_train))
print('')
print(rf.score(cv_test, y_test))

0.5016744809109176

0.5020080321285141


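### The best model is: ______ (TODO: all six CountVectorizer models above report identical train/test accuracy of about `.502`, the baseline, so none stands out yet)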
### Let's optimize using GridSearch

In [None]:
# Cross-validated grid search on the count features.
# The original call had no estimator (a syntax error) and an empty grid.
# NOTE(review): no best model has been chosen yet, so logistic regression is used as
# a placeholder estimator — replace it and the grid once a model is selected.
params = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
}

# 'liblinear' supports both penalties in the grid
gs = GridSearchCV(LogisticRegression(solver='liblinear', random_state=42),
                  param_grid=params, cv=5)

gs.fit(cv_train, y_train)
print(gs.best_score_)
# A bare expression in the middle of a cell is not displayed, so print it
print(gs.best_params_)

# Accuracy of the refit best estimator: training split, blank line, test split
print(gs.score(cv_train, y_train))
print('')
print(gs.score(cv_test, y_test))

## TfidfVectorize our data

In [43]:
# TF-IDF vectorizer for the post titles
tf = TfidfVectorizer(
    stop_words='english',   # remove scikit-learn's built-in English stop words
    lowercase=True,         # lowercase the text before tokenizing
    min_df=10,              # keep only terms that appear in at least 10 documents
    ngram_range=(1, 5),     # unigrams through 5-grams
)

In [44]:
# Learn the vocabulary and IDF weights from the training titles and encode them
tf_train = tf.fit_transform(X_train)
# Encode the test titles with the already-fitted TfidfVectorizer (transform only, no refit)
tf_test = tf.transform(X_test)

In [45]:
# Create tf_train dataframe
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
tf_feature_names = getattr(tf, 'get_feature_names_out', None) or tf.get_feature_names
tf_train_df = pd.DataFrame(tf_train.toarray(), columns=tf_feature_names())
tf_train_df.head()

Unnamed: 0,00,00 adida,00 adida origin,00 adida origin instagram,00 adida origin instagram icon,00 tax,00 tax includ,00 tax includ help,00 tax includ help someon,00 thoughts,...,zx flux some1,zx flux some1 help,zx flux some1 help thi,zx flux xeno,zx flux xeno worn,zx flux xeno worn kid,zx need,zx need help,zx need help id,zx need help id shoe
0,0.003459,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,...,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153
1,0.003459,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,...,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153
2,0.003459,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,...,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153
3,0.003459,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,...,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153
4,0.003459,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,...,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153


In [46]:
# Create tf_test dataframe
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
tf_feature_names = getattr(tf, 'get_feature_names_out', None) or tf.get_feature_names
tf_test_df = pd.DataFrame(tf_test.toarray(), columns=tf_feature_names())
tf_test_df.head()

Unnamed: 0,00,00 adida,00 adida origin,00 adida origin instagram,00 adida origin instagram icon,00 tax,00 tax includ,00 tax includ help,00 tax includ help someon,00 thoughts,...,zx flux some1,zx flux some1 help,zx flux some1 help thi,zx flux xeno,zx flux xeno worn,zx flux xeno worn kid,zx need,zx need help,zx need help id,zx need help id shoe
0,0.003459,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,...,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153
1,0.003459,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,...,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153
2,0.003459,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,...,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153
3,0.003459,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,...,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153
4,0.003459,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,...,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153,0.001153


In [47]:
# (rows, vocabulary size) for the train and test TF-IDF matrices
for tfidf_matrix in (tf_train, tf_test):
    print(tfidf_matrix.shape)

(1493, 43424)
(498, 43424)


### Let's try different models and then optimize the one that works best

### Logistic Regression Model

In [48]:
# L2-penalized logistic regression on the TF-IDF features
log = LogisticRegression(
    penalty='l2',
    tol=.0001,
    C=5,
    random_state=42,
)
log.fit(tf_train, y_train)

# Accuracy on the training split, blank line, accuracy on the test split
log_train_acc = log.score(tf_train, y_train)
log_test_acc = log.score(tf_test, y_test)
print(log_train_acc, '', log_test_acc, sep='\n')

0.5016744809109176

0.5020080321285141


### Multinomial Naive Bayes Model

In [49]:
# Multinomial Naive Bayes on the TF-IDF features
mn = MultinomialNB()
mn.fit(tf_train, y_train)

# Accuracy on the training split, blank line, accuracy on the test split
mn_train_acc = mn.score(tf_train, y_train)
mn_test_acc = mn.score(tf_test, y_test)
print(mn_train_acc, '', mn_test_acc, sep='\n')

0.5016744809109176

0.5020080321285141


### Decision Tree Model

In [50]:
# Decision tree on the TF-IDF features.
# min_impurity_decrease was 2, but Gini impurity for two classes never exceeds 0.5,
# so no split could ever satisfy it and the tree stayed a single majority-class leaf.
# 0.0 (the library default) lets the tree split; tune it upward to regularize.
dT = DecisionTreeClassifier(criterion='gini', random_state=42, min_impurity_decrease=0.0, max_depth=100)

dT.fit(tf_train, y_train)

# Accuracy on the training split, blank line, accuracy on the test split
print(dT.score(tf_train, y_train))
print('')
print(dT.score(tf_test, y_test))

0.5016744809109176

0.5020080321285141


### Bagged Tree Model

In [51]:
# Bagged decision trees on the TF-IDF features.
# The base estimator is passed positionally because the `base_estimator` keyword was
# renamed to `estimator` in newer scikit-learn; the positional form works with both.
# random_state on the ensemble makes the bootstrap samples reproducible.
bagged = BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=100, random_state=42)

bagged.fit(tf_train, y_train)

# Accuracy on the training split, blank line, accuracy on the test split
print(bagged.score(tf_train, y_train))
print('')
print(bagged.score(tf_test, y_test))

0.5016744809109176

0.5020080321285141


### Extra Trees Model

In [None]:
# Extra-trees ensemble on the TF-IDF features.
# min_impurity_decrease was 2, but entropy for two classes never exceeds 1.0,
# so no tree could ever split. 0.0 (the library default) lets the trees grow;
# tune it upward to regularize.
eT = ExtraTreesClassifier(max_depth=100, min_impurity_decrease=0.0,
                          random_state=42, criterion='entropy', n_estimators=100)

eT.fit(tf_train, y_train)

# Accuracy on the training split, blank line, accuracy on the test split
print(eT.score(tf_train, y_train))
print('')
print(eT.score(tf_test, y_test))

### Let's see how these models compare to a Random Forest Model

### Random Forest Model

In [None]:
# Random forest on the TF-IDF features.
# min_impurity_decrease was 1, but Gini impurity for two classes never exceeds 0.5,
# so no tree could ever split. 0.0 (the library default) lets the trees grow;
# tune it upward to regularize.
rF = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=75, min_impurity_decrease=0.0,
                           random_state=42)
rF.fit(tf_train, y_train)

# Accuracy on the training split, blank line, accuracy on the test split
print(rF.score(tf_train, y_train))
print('')
print(rF.score(tf_test, y_test))

### The best model is: ______
### Let's optimize using GridSearch

In [None]:
# Cross-validated grid search on the TF-IDF features.
# The original call had no estimator (a syntax error) and an empty grid.
# NOTE(review): no best model has been chosen yet, so logistic regression is used as
# a placeholder estimator — replace it and the grid once a model is selected.
params = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
}

# 'liblinear' supports both penalties in the grid
gS = GridSearchCV(LogisticRegression(solver='liblinear', random_state=42),
                  param_grid=params, cv=5)

gS.fit(tf_train, y_train)
# Report this search's own score; the original printed `gs.best_score_`, which
# belongs to the CountVectorizer grid search.
print(gS.best_score_)
# A bare expression in the middle of a cell is not displayed, so print it
print(gS.best_params_)

# Accuracy of the refit best estimator: training split, blank line, test split
print(gS.score(tf_train, y_train))
print('')
print(gS.score(tf_test, y_test))