In [1]:
import time
import requests
import pandas as pd
import numpy as np
import enchant
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [2]:
# Load the cleaned post data from CSV into a DataFrame.
df = pd.read_csv('cleaned_df.csv')

In [3]:
# Preview the frame exactly as loaded, before any columns are removed.
df

Unnamed: 0.1,Unnamed: 0,text,iphone_subreddit
0,0,Please help me understand this SIM lock scenario.,1
1,1,What iPhone should I get for under $800?,1
2,2,Spotlight Not Working,1
3,3,Battery replacement,1
4,4,I accidentally messaged two people at once in ...,1
...,...,...,...
19995,19995,"Vivo IQoo U1 Price In India November 2020, Rea...",0
19996,19996,Got a phone from my dad that needs HELP!,0
19997,19997,Mine Bitcoin on the go Mining on mobile device...,0
19998,19998,Razr 5G,0


In [4]:
# Remove the 'Unnamed: 0' column; only the text and the label are used later.
# errors='ignore' keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Preview the frame after the 'Unnamed: 0' column has been dropped.
df

Unnamed: 0,text,iphone_subreddit
0,Please help me understand this SIM lock scenario.,1
1,What iPhone should I get for under $800?,1
2,Spotlight Not Working,1
3,Battery replacement,1
4,I accidentally messaged two people at once in ...,1
...,...,...
19995,"Vivo IQoo U1 Price In India November 2020, Rea...",0
19996,Got a phone from my dad that needs HELP!,0
19997,Mine Bitcoin on the go Mining on mobile device...,0
19998,Razr 5G,0


In [6]:
# Build a TF-IDF document-term frame over every post, ignoring English stop words.
# NOTE(review): the vectorizer is fit on the full corpus before the train/test
# split further down, so held-out posts influence the vocabulary and IDF weights.
tvec = TfidfVectorizer(stop_words='english')
tfidf_matrix = tvec.fit_transform(df['text'])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(tvec, 'get_feature_names_out'):
    tfidf_terms = tvec.get_feature_names_out()
else:
    tfidf_terms = tvec.get_feature_names()

# toarray() yields a plain ndarray rather than the discouraged np.matrix
# that todense() returns.
df_words = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_terms)

In [7]:
# Preview the TF-IDF frame (one row per post, one column per vocabulary term).
df_words

Unnamed: 0,00,000,000mah,002,00am,00srock,01,02,03,04,...,コスプレ靴,在庫発売,단독,𝗔𝗡𝗗𝗥𝗢𝗜𝗗,𝗔𝗣𝗣,𝗘𝗔𝗥𝗧𝗛,𝗠𝗘𝗦𝗦𝗔𝗚𝗜𝗡𝗚,𝗠𝗢𝗦𝗧,𝗢𝗡,𝗦𝗘𝗖𝗨𝗥𝗘
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Build a raw term-count frame over every post, ignoring English stop words.
cvec = CountVectorizer(stop_words='english')
count_matrix = cvec.fit_transform(df['text'])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(cvec, 'get_feature_names_out'):
    count_terms = cvec.get_feature_names_out()
else:
    count_terms = cvec.get_feature_names()

# toarray() yields a plain ndarray rather than the discouraged np.matrix
# that todense() returns.
df_word_vec = pd.DataFrame(count_matrix.toarray(), columns=count_terms)
df_word_vec

Unnamed: 0,00,000,000mah,002,00am,00srock,01,02,03,04,...,コスプレ靴,在庫発売,단독,𝗔𝗡𝗗𝗥𝗢𝗜𝗗,𝗔𝗣𝗣,𝗘𝗔𝗥𝗧𝗛,𝗠𝗘𝗦𝗦𝗔𝗚𝗜𝗡𝗚,𝗠𝗢𝗦𝗧,𝗢𝗡,𝗦𝗘𝗖𝗨𝗥𝗘
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Total count of each term across all posts; show the 25 most frequent.
(
    df_word_vec
    .sum(axis=0)
    .sort_values(ascending=False)
    .iloc[:25]
)

iphone     5403
android    2454
12         2198
phone      1914
pro        1704
app        1529
new        1138
google     1058
max         989
11          981
apple       966
samsung     961
help        947
screen      938
galaxy      748
battery     714
ios         666
just        646
apps        616
does        599
camera      562
best        558
way         487
case        482
vs          443
dtype: int64

In [10]:
# Count missing values per column of the post frame.
df.isna().sum(axis=0)

text                0
iphone_subreddit    0
dtype: int64

In [11]:
# Grand total of missing cells in the term-count frame (per column, then overall).
df_word_vec.isna().sum(axis=0).sum()

0

In [12]:
# Credit: Josh suggested using pyenchant to filter out non-English words.
# `d` is the en_US dictionary used by the following cells to check each term.
d = enchant.Dict("en_US")

In [13]:
# Collect every term of the count frame that the en_US dictionary does not
# recognise (terms are lower-cased before the lookup).
# NOTE(review): judging by the frequency listings, this also removes tokens
# such as 'iphone' and '12' - confirm that is intended.
drop_words_list = [
    term
    for term in df_word_vec.columns
    if not d.check(term.lower())
]

In [14]:
# Same dictionary filter, applied to the TF-IDF frame's columns: keep the
# names of all terms that en_US does not recognise (lower-cased lookup).
drop_word_list = [
    term
    for term in df_words.columns
    if not d.check(term.lower())
]

In [15]:
# Remove the non-dictionary terms from the count frame.
# errors='ignore' makes the cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df_word_vec = df_word_vec.drop(columns=drop_words_list, errors='ignore')

In [16]:
# Remove the non-dictionary terms from the TF-IDF frame, using the list that
# was built from this frame's own columns (drop_word_list) rather than the
# one derived from the count frame.
# errors='ignore' makes the cell re-runnable without a KeyError.
df_words = df_words.drop(columns=drop_word_list, errors='ignore')

In [17]:
# Features are the filtered TF-IDF columns; the target is the subreddit flag.
X, y = df_words, df['iphone_subreddit']

# Stratified split with a fixed seed; the test size is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=6,
)

In [18]:
# Logistic regression with all hyperparameters left at their defaults.
logreg = LogisticRegression()

In [19]:
# Fit the logistic regression on the training split.
logreg.fit(X_train, y_train)

LogisticRegression()

In [20]:
# Score the fitted logistic regression on the held-out split.
logreg.score(X_test, y_test)

0.7882

In [21]:
# Random forest with default hyperparameters. The seed is fixed (matching the
# one used for the train/test split) so the fitted model and its score are
# reproducible across kernel restarts.
rfc = RandomForestClassifier(random_state=6)

In [22]:
# Fit the random forest on the training split.
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [23]:
# Score the fitted random forest on the held-out split.
rfc.score(X_test, y_test)

0.781

In [24]:
# Term totals after the dictionary filter; show the 50 most frequent.
(
    df_word_vec
    .sum(axis=0)
    .sort_values(ascending=False)
    .iloc[:50]
)

android     2454
phone       1914
pro         1704
app         1529
new         1138
google      1058
max          989
apple        966
help         947
screen       938
galaxy       748
battery      714
just         646
apps         616
does         599
camera       562
best         558
way          487
case         482
vs           443
use          442
amp          437
need         422
phones       419
pixel        399
update       399
using        393
like         388
support      387
note         377
know         375
photos       375
time         361
download     360
free         357
question     348
issue        342
mini         338
video        322
mobile       311
ultra        310
play         300
com          295
buy          292
thread       285
data         280
old          271
charging     270
device       270
music        267
dtype: int64