In [2]:
import time
import requests
import pandas as pd
import numpy as np
import enchant
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [3]:
df = pd.read_csv('cleaned_df.csv')

In [4]:
df

Unnamed: 0.1,Unnamed: 0,text,iphone_subreddit
0,0,What's App Slow on iPhone compared to Android,1
1,1,Iphone X expansive offer (Email Submit).,1
2,2,"Got my first job, saved up for over a year, an...",1
3,3,Get S&amp; P iPhone XS,1
4,4,Iphone 12 mini standby issues,1
...,...,...,...
19995,19995,[SELF] Recomendo - The app that gives tv shows...,0
19996,19996,Ticwatch Pro 3 Review: Wear OS Finally Works!,0
19997,19997,[Mr. Mobile] Ticwatch Pro 3 Review: Wear OS Fi...,0
19998,19998,Ticwatch Pro 3 Review: Wear OS Finally Works!,0


In [5]:
df = df.drop(columns='Unnamed: 0')

In [6]:
df

Unnamed: 0,text,iphone_subreddit
0,What's App Slow on iPhone compared to Android,1
1,Iphone X expansive offer (Email Submit).,1
2,"Got my first job, saved up for over a year, an...",1
3,Get S&amp; P iPhone XS,1
4,Iphone 12 mini standby issues,1
...,...,...
19995,[SELF] Recomendo - The app that gives tv shows...,0
19996,Ticwatch Pro 3 Review: Wear OS Finally Works!,0
19997,[Mr. Mobile] Ticwatch Pro 3 Review: Wear OS Fi...,0
19998,Ticwatch Pro 3 Review: Wear OS Finally Works!,0


In [7]:
### Setting up Porter stemmer on our df['text'] column to filter down word vectors in tf-idf

In [8]:
stop = stopwords.words('english')

In [9]:
port = PorterStemmer()
stemmed_list = []
for value in df['text']:
    stemmed_ = ''
    for word in value.split(' '):
        if word in stop:
            continue 
        else:
            stem = port.stem(word.strip('.!?,')) + ' '
            stemmed_ += stem
    stemmed_list.append(stemmed_)

In [10]:
df['text'] = stemmed_list

In [11]:
df['text']

0                     what' app slow iphon compar android 
1                     iphon X expans offer (email submit) 
2        got first job save year voila My humbl littl h...
3                                   get s&amp; P iphon XS 
4                              iphon 12 mini standbi issu 
                               ...                        
19995    [self] recomendo - the app give tv show recomm...
19996           ticwatch pro 3 review: wear OS final work 
19997    [mr mobile] ticwatch pro 3 review: wear OS fin...
19998           ticwatch pro 3 review: wear OS final work 
19999    [mr mobile] ticwatch pro 3 review: wear OS fin...
Name: text, Length: 20000, dtype: object

In [12]:
### Setting up Tf-Idf vectorizer for word count breakdown for posts

In [13]:
tvec = TfidfVectorizer(stop_words='english')
tvec.fit(df['text'])
df_words  = pd.DataFrame(tvec.transform(df['text']).todense(),
                   columns=tvec.get_feature_names())

In [14]:
df_words

Unnamed: 0,00,000,000mah,002,00am,00srock,01,02,03,04,...,रह,你妈了个逼,단독,𝗔𝗡𝗗𝗥𝗢𝗜𝗗,𝗔𝗣𝗣,𝗘𝗔𝗥𝗧𝗛,𝗠𝗘𝗦𝗦𝗔𝗚𝗜𝗡𝗚,𝗠𝗢𝗦𝗧,𝗢𝗡,𝗦𝗘𝗖𝗨𝗥𝗘
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Josh shared idea to use pyenchant package to filter out non english words from our dataset.
d = enchant.Dict("en_US")

In [16]:
drop_word_list = []
for value in df_words.columns:
    if d.check(value.lower()) == False:
        drop_word_list.append(value)

In [17]:
df_words = df_words.drop(columns=drop_word_list)

In [18]:
### Combining vectorized output dataframe with original

In [19]:
df = pd.concat([df, df_words], axis=1)

In [20]:
df

Unnamed: 0,text,iphone_subreddit,ab,abandon,abroad,absorb,absurd,accent,accept,access,...,yield,yo,yoga,young,yr,zero,zip,zone,zoom,zooming
0,what' app slow iphon compar android,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,iphon X expans offer (email submit),1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,got first job save year voila My humbl littl h...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,get s&amp; P iphon XS,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,iphon 12 mini standbi issu,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,[self] recomendo - the app give tv show recomm...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19996,ticwatch pro 3 review: wear OS final work,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19997,[mr mobile] ticwatch pro 3 review: wear OS fin...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19998,ticwatch pro 3 review: wear OS final work,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
iphone_top_words = df.drop(columns='text').loc[df['iphone_subreddit'] == 1].sum().sort_values(ascending=False).head(26)[1:]

In [34]:
android_top_words = df.drop(columns='text').loc[df['iphone_subreddit'] == 0].sum().sort_values(ascending=False).head(26)[1:]

In [35]:
iphone_top_words

pro         373.743505
max         286.076368
case        177.113434
help        174.202439
screen      172.089011
phone       159.912031
app         152.529327
new         152.443291
mini        133.764541
use         120.526791
question    100.149472
photo        98.346595
camera       92.012391
work         87.956883
need         80.166206
charger      78.999650
time         74.972710
switch       71.688281
turn         70.364068
best         69.864026
buy          69.845448
know         69.512537
problem      69.352584
way          68.695265
doe          62.014736
dtype: float64

In [36]:
android_top_words

phone       337.118627
app         291.382309
help        157.320667
pixel       152.965173
new         127.495469
download    115.226010
use         110.398349
best        110.031555
play        101.439538
screen       92.929676
vs           81.968782
need         80.512497
camera       79.986682
ultra        77.361341
note         76.879823
game         76.309010
video        73.941660
review       71.316734
work         69.789489
look         67.702480
question     66.582826
pro          65.091033
music        64.289864
way          63.459994
free         59.383908
dtype: float64

In [39]:
android_top_words.to_csv('android_top_words.csv')

In [40]:
iphone_top_words.to_csv('iphone_top_words.csv')

In [21]:
### Train/Test split on data to evaluate model accuracy and fitting

In [22]:
X = df.drop(columns=['text', 'iphone_subreddit'])
y = df['iphone_subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6, stratify=y)

In [23]:
### Baseline score for our model shown below: .50

In [24]:
y.value_counts(normalize=True)

1    0.5
0    0.5
Name: iphone_subreddit, dtype: float64

In [25]:
### Instantiating, fitting, and evaluating a LogisticRegression Model on our training and testing data

In [26]:
logreg = LogisticRegression()

In [27]:
logreg.fit(X_train, y_train)

LogisticRegression()

In [28]:
logreg.score(X_test, y_test)

0.7572

In [29]:
logreg.score(X_train, y_train)

0.7974

In [30]:
cross_val_score(logreg, X_train, y_train, cv=5).mean()

0.7510666666666667

In [31]:
### Instantiating, fitting, and evaluating a RandomForestClassifier Model on our training and testing data

In [32]:
rf = RandomForestClassifier(n_estimators=200, max_features='sqrt')

In [33]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_features='sqrt', n_estimators=200)

In [34]:
rf.score(X_test, y_test)

0.7476

In [35]:
rf.score(X_train, y_train)

0.9734

In [36]:
cross_val_score(rf, X_train, y_train, cv=5).mean()

0.7333333333333333

In [37]:
### Instantiating, fitting, and evaluating an ExtraTreesClassifier Model on our training and testing data

In [38]:
et = ExtraTreesClassifier(n_estimators=200, max_features='sqrt')

In [39]:
et.fit(X_train, y_train)

ExtraTreesClassifier(max_features='sqrt', n_estimators=200)

In [40]:
et.score(X_test, y_test)

0.7436

In [41]:
et.score(X_train, y_train)

0.9734

In [42]:
cross_val_score(et, X_train, y_train, cv=5).mean()

0.7331333333333333

In [43]:
parameters = {
    'n_estimators': [150, 200, 250],
    'max_depth': [None, 1, 2, 3, 4, 5]
}

In [50]:
### Using GridSearchCV to find best parameters for RandomForestClassifier 

In [44]:
grid = GridSearchCV(rf, param_grid=parameters, cv=5)

In [45]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(max_features='sqrt',
                                              n_estimators=200),
             param_grid={'max_depth': [None, 1, 2, 3, 4, 5],
                         'n_estimators': [150, 200, 250]})

In [46]:
grid.score(X_test, y_test)

0.748

In [47]:
grid.score(X_train, y_train)

0.9734

In [49]:
grid.best_params_

{'max_depth': None, 'n_estimators': 200}

## Modeling Summary

### The best model seemed to be LogisticRegression for the dataset I collected. The RandomForest and ExtraTrees Classifiers both were overfit to the training data and scored just slightly under LogisticRegression for testing scores. Even though LogisticRegression didn't score very well on the training data compared to RFC and ETC, it still scored higher on the testing data than the other two.
