In [1]:
import time
import requests
import pandas as pd
import numpy as np
import enchant
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [2]:
df = pd.read_csv('cleaned_df.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,text,iphone_subreddit
0,0,Please help me understand this SIM lock scenario.,1
1,1,What iPhone should I get for under $800?,1
2,2,Spotlight Not Working,1
3,3,Battery replacement,1
4,4,I accidentally messaged two people at once in ...,1
...,...,...,...
19995,19995,"Vivo IQoo U1 Price In India November 2020, Rea...",0
19996,19996,Got a phone from my dad that needs HELP!,0
19997,19997,Mine Bitcoin on the go Mining on mobile device...,0
19998,19998,Razr 5G,0


In [4]:
df = df.drop(columns='Unnamed: 0')

In [5]:
df

Unnamed: 0,text,iphone_subreddit
0,Please help me understand this SIM lock scenario.,1
1,What iPhone should I get for under $800?,1
2,Spotlight Not Working,1
3,Battery replacement,1
4,I accidentally messaged two people at once in ...,1
...,...,...
19995,"Vivo IQoo U1 Price In India November 2020, Rea...",0
19996,Got a phone from my dad that needs HELP!,0
19997,Mine Bitcoin on the go Mining on mobile device...,0
19998,Razr 5G,0


In [6]:
tvec = TfidfVectorizer(stop_words='english')
tvec.fit(df['text'])
df_words  = pd.DataFrame(tvec.transform(df['text']).todense(),
                   columns=tvec.get_feature_names())

In [7]:
df_words

Unnamed: 0,00,000,000mah,002,00am,00srock,01,02,03,04,...,コスプレ靴,在庫発売,단독,𝗔𝗡𝗗𝗥𝗢𝗜𝗗,𝗔𝗣𝗣,𝗘𝗔𝗥𝗧𝗛,𝗠𝗘𝗦𝗦𝗔𝗚𝗜𝗡𝗚,𝗠𝗢𝗦𝗧,𝗢𝗡,𝗦𝗘𝗖𝗨𝗥𝗘
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Josh shared idea to use pyenchant to filter out non english words.
d = enchant.Dict("en_US")

In [9]:
drop_word_list = []
for value in df_words.columns:
    if d.check(value.lower()) == False:
        drop_word_list.append(value)

In [10]:
df_words = df_words.drop(columns=drop_word_list)

In [11]:
df = pd.concat([df, df_words], axis=1)

In [12]:
df

Unnamed: 0,text,iphone_subreddit,ab,ability,able,abnormal,abnormalities,abnormally,abroad,absentee,...,young,yrs,zero,zip,zombie,zone,zones,zoom,zoomed,zooming
0,Please help me understand this SIM lock scenario.,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,What iPhone should I get for under $800?,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Spotlight Not Working,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Battery replacement,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,I accidentally messaged two people at once in ...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,"Vivo IQoo U1 Price In India November 2020, Rea...",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19996,Got a phone from my dad that needs HELP!,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19997,Mine Bitcoin on the go Mining on mobile device...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19998,Razr 5G,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df_words.sum().sort_values(ascending=False).head(25)

android     505.305574
phone       419.559786
pro         385.295924
help        331.525508
app         328.729286
new         265.599257
max         258.563033
google      249.091493
screen      239.675803
apple       221.409162
battery     201.506684
galaxy      185.470713
apps        168.862746
best        158.913463
camera      154.484032
does        152.722644
question    138.968372
case        138.384727
need        137.355743
just        134.872390
vs          131.140217
way         123.505187
use         122.432009
update      121.309036
pixel       119.411951
dtype: float64

In [48]:
X = df_words
y = df['iphone_subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6, stratify=y)

In [49]:
y.value_counts(normalize=True)

1    0.5
0    0.5
Name: iphone_subreddit, dtype: float64

In [50]:
logreg = LogisticRegression()

In [51]:
logreg.fit(X_train, y_train)

LogisticRegression()

In [52]:
logreg.score(X_test, y_test)

0.7882

In [53]:
logreg.score(X_train, y_train)

0.8441333333333333

In [41]:
rfc = RandomForestClassifier(n_estimators=200, max_features='sqrt')

In [42]:
rfc.fit(X_train, y_train)

RandomForestClassifier(max_features='sqrt', n_estimators=200)

In [43]:
rfc.score(X_test, y_test)

0.784

In [44]:
rfc.score(X_train, y_train)

0.9853333333333333

In [24]:
df_words.sum().sort_values(ascending=False).head(50)

android          505.305574
phone            419.559786
pro              385.295924
help             331.525508
app              328.729286
new              265.599257
max              258.563033
google           249.091493
screen           239.675803
apple            221.409162
battery          201.506684
galaxy           185.470713
apps             168.862746
best             158.913463
camera           154.484032
does             152.722644
question         138.968372
case             138.384727
need             137.355743
just             134.872390
vs               131.140217
way              123.505187
use              122.432009
update           121.309036
pixel            119.411951
mini             119.090090
photos           117.132719
phones           110.989840
download         106.217692
issue            104.778220
using            102.256226
free             102.134889
time              99.879130
know              97.655848
like              96.660518
video             94

In [25]:
df = df.drop(columns='text')

In [26]:
df[df['iphone_subreddit'] == 1].sum().sort_values(ascending=False).head(26)

iphone_subreddit    10000.000000
pro                   319.952614
max                   240.224400
apple                 183.130960
help                  172.306875
screen                151.983554
phone                 150.996506
battery               150.041389
new                   145.778283
case                  129.992048
app                   123.224460
does                  112.182696
mini                  110.742014
just                   89.990880
camera                 85.292271
question               85.184985
issue                  77.910213
photos                 72.890828
need                   70.480377
charger                69.532450
charging               67.959369
way                    66.652626
time                   66.353044
know                   65.986677
issues                 64.022332
apps                   60.383956
dtype: float64

In [27]:
df[df['iphone_subreddit'] == 0].sum().sort_values(ascending=False).head(25)

android       454.207995
phone         268.563280
google        226.179126
app           205.504826
galaxy        175.028487
help          159.218633
new           119.820974
pixel         108.832940
apps          108.478790
best          101.055533
download       91.388262
phones         90.947134
screen         87.692248
ultra          82.989701
vs             82.797291
update         82.627654
play           79.265949
note           73.491536
camera         69.191762
need           66.875366
use            66.251966
pro            65.343309
mobile         62.966375
smartphone     59.585794
free           57.253250
dtype: float64