In [1]:
import time
import requests
import pandas as pd
import numpy as np
import enchant
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the cleaned post data from CSV.
# NOTE(review): the file also carries an 'Unnamed: 0' column (dropped in a later
# cell) - presumably an index written by an earlier to_csv; confirm.
df = pd.read_csv('cleaned_df.csv')

In [3]:
# Display the freshly loaded frame to inspect its columns.
df

Unnamed: 0.1,Unnamed: 0,text,iphone_subreddit
0,0,What's App Slow on iPhone compared to Android,1
1,1,Iphone X expansive offer (Email Submit).,1
2,2,"Got my first job, saved up for over a year, an...",1
3,3,Get S&amp; P iPhone XS,1
4,4,Iphone 12 mini standby issues,1
...,...,...,...
19995,19995,[SELF] Recomendo - The app that gives tv shows...,0
19996,19996,Ticwatch Pro 3 Review: Wear OS Finally Works!,0
19997,19997,[Mr. Mobile] Ticwatch Pro 3 Review: Wear OS Fi...,0
19998,19998,Ticwatch Pro 3 Review: Wear OS Finally Works!,0


In [4]:
# Remove the leftover 'Unnamed: 0' column so that only the post text and the
# subreddit label remain.
df = df.drop(labels='Unnamed: 0', axis=1)

In [5]:
# Display the frame again to confirm the column was removed.
df

Unnamed: 0,text,iphone_subreddit
0,What's App Slow on iPhone compared to Android,1
1,Iphone X expansive offer (Email Submit).,1
2,"Got my first job, saved up for over a year, an...",1
3,Get S&amp; P iPhone XS,1
4,Iphone 12 mini standby issues,1
...,...,...
19995,[SELF] Recomendo - The app that gives tv shows...,0
19996,Ticwatch Pro 3 Review: Wear OS Finally Works!,0
19997,[Mr. Mobile] Ticwatch Pro 3 Review: Wear OS Fi...,0
19998,Ticwatch Pro 3 Review: Wear OS Finally Works!,0


In [6]:
### Stemming the df['text'] column with the Porter stemmer to reduce the size of the tf-idf vocabulary

In [7]:
# English stop-word list from NLTK.
# NOTE(review): `stop` is not referenced anywhere else in the visible notebook
# (the TfidfVectorizer is configured with stop_words='english' instead);
# confirm whether it is still needed.
stop = stopwords.words('english')

In [8]:
# Stem every post word by word with the Porter stemmer. A post is split on
# single spaces, each piece has leading/trailing '.', '!', '?' and ',' stripped
# before stemming, and every stem is followed by one space - so each rebuilt
# string ends with a trailing space.
port = PorterStemmer()
stemmed_list = [
    ''.join(port.stem(token.strip('.!?,')) + ' ' for token in post.split(' '))
    for post in df['text']
]

In [9]:
# Overwrite the text column with the stemmed strings; the unstemmed text is no
# longer available in df after this cell.
df['text'] = stemmed_list

In [10]:
# Display the stemmed text column.
df['text']

0               what' app slow on iphon compar to android 
1                     iphon X expans offer (email submit) 
2        got my first job save up for over a year and v...
3                                   get s&amp; P iphon XS 
4                              iphon 12 mini standbi issu 
                               ...                        
19995    [self] recomendo - the app that give tv show r...
19996           ticwatch pro 3 review: wear OS final work 
19997    [mr mobile] ticwatch pro 3 review: wear OS fin...
19998           ticwatch pro 3 review: wear OS final work 
19999    [mr mobile] ticwatch pro 3 review: wear OS fin...
Name: text, Length: 20000, dtype: object

In [11]:
### Setting up a Tf-Idf vectorizer to turn each post into a vector of tf-idf weighted term scores

In [12]:
# Build the tf-idf term matrix for the stemmed posts (English stop words removed)
# and expose it as a DataFrame with one column per vocabulary term.
# NOTE(review): the vectorizer is fitted on the full dataset, before the
# train/test split further down, so IDF weights are learned from test rows too.
tvec = TfidfVectorizer(stop_words='english')
tfidf_matrix = tvec.fit_transform(df['text'])  # single pass instead of fit + transform

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# toarray() yields a regular ndarray (todense() returns the legacy np.matrix type).
# Reusing df.index keeps the rows aligned with df when the two frames are combined.
df_words = pd.DataFrame(tfidf_matrix.toarray(),
                        columns=feature_names,
                        index=df.index)

In [13]:
# Display the tf-idf frame (one column per vocabulary term).
df_words

Unnamed: 0,00,000,000mah,002,00am,00srock,01,02,03,04,...,रह,你妈了个逼,단독,𝗔𝗡𝗗𝗥𝗢𝗜𝗗,𝗔𝗣𝗣,𝗘𝗔𝗥𝗧𝗛,𝗠𝗘𝗦𝗦𝗔𝗚𝗜𝗡𝗚,𝗠𝗢𝗦𝗧,𝗢𝗡,𝗦𝗘𝗖𝗨𝗥𝗘
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Josh shared the idea of using the pyenchant package to filter non-English words out of our dataset.
# `d` is the en_US dictionary used for those checks.
d = enchant.Dict("en_US")

In [15]:
# Collect every vocabulary column whose lower-cased name the en_US dictionary
# rejects.
# NOTE(review): the text was Porter-stemmed before vectorizing, so stems that
# are not dictionary words (e.g. 'iphon') presumably fail this check as well -
# confirm that dropping them is intended.
drop_word_list = [term for term in df_words.columns if not d.check(term.lower())]

In [16]:
# Keep only the columns that passed the dictionary check.
df_words = df_words.drop(columns=drop_word_list)

In [17]:
### Combining vectorized output dataframe with original

In [18]:
# Place the tf-idf columns next to the original columns (axis=1).
# NOTE(review): re-running this cell appends the word columns a second time,
# and a vocabulary term named like an existing column (e.g. 'text') would
# produce a duplicate column label.
df = pd.concat([df, df_words], axis=1)

In [19]:
# Display the combined frame.
df

Unnamed: 0,text,iphone_subreddit,ab,abandon,abroad,absorb,absurd,accent,accept,access,...,yield,yo,yoga,young,yr,zero,zip,zone,zoom,zooming
0,what' app slow on iphon compar to android,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,iphon X expans offer (email submit),1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,got my first job save up for over a year and v...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,get s&amp; P iphon XS,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,iphon 12 mini standbi issu,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,[self] recomendo - the app that give tv show r...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19996,ticwatch pro 3 review: wear OS final work,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19997,[mr mobile] ticwatch pro 3 review: wear OS fin...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19998,ticwatch pro 3 review: wear OS final work,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
### Train/Test split on data to evaluate model accuracy and fitting

In [21]:
# Features are the tf-idf word columns only. They are taken straight from
# df_words instead of dropping 'text'/'iphone_subreddit' from the combined
# frame: drop(columns=...) removes *every* column carrying a given label, so a
# vocabulary term that shares its name with a metadata column (e.g. the word
# 'text') would be silently removed from the features as well.
X = df_words
y = df['iphone_subreddit']

# Stratify on y so both splits keep the class balance; random_state makes the
# split reproducible. test_size is left at the scikit-learn default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6, stratify=y)

In [22]:
### Baseline accuracy for our model (the majority-class share), shown below: 0.50

In [23]:
# Class proportions of the target; the largest share is the baseline accuracy.
y.value_counts(normalize=True)

1    0.5
0    0.5
Name: iphone_subreddit, dtype: float64

In [24]:
### Instantiating, fitting, and evaluating a LogisticRegression Model on our training and testing data

In [25]:
# Logistic regression classifier with scikit-learn's default settings.
logreg = LogisticRegression()

In [26]:
# Fit on the training split only.
logreg.fit(X_train, y_train)

LogisticRegression()

In [27]:
# Accuracy on the held-out test split.
logreg.score(X_test, y_test)

0.7568

In [28]:
# Accuracy on the training split, for comparison with the test score.
logreg.score(X_train, y_train)

0.7948

In [29]:
### Instantiating, fitting, and evaluating a RandomForestClassifier Model on our training and testing data

In [30]:
# Random forest with 200 trees, each split considering sqrt(n_features) candidates.
# random_state is fixed (same seed as the train/test split) so the fitted forest
# and the scores computed from it are reproducible across kernel restarts;
# without it every run bootstraps and samples features differently.
rf = RandomForestClassifier(n_estimators=200, max_features='sqrt', random_state=6)

In [31]:
# Fit the forest on the training split only.
rf.fit(X_train, y_train)

RandomForestClassifier(max_features='sqrt', n_estimators=200)

In [32]:
# Accuracy on the held-out test split.
rf.score(X_test, y_test)

0.7446

In [33]:
# Accuracy on the training split; compare with the test score to gauge overfitting.
rf.score(X_train, y_train)

0.9738