In [127]:
import ujson
import numpy as np
import re

from tqdm import tqdm_notebook
from glob import glob

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [99]:
def read_tweets(pattern):
    """Load cleaned tweet texts from newline-delimited JSON files.

    Every file matching ``pattern`` is read line by line. Each non-blank line
    is parsed as one JSON object and its ``'body'`` value is kept after
    removing hashtags, @-mentions and any token starting with ``http``.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the input files.

    Returns
    -------
    list of str
        One cleaned text per parsed line; files are visited in sorted path
        order and lines in file order.

    Raises
    ------
    KeyError
        If a parsed object has no ``'body'`` key.
    """
    # Raw string: '\S' inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about. Compiled once, not once per line.
    entity_re = re.compile(r'(#|@|http)\S+')

    tweets = []
    # glob() returns paths in arbitrary order; sorting keeps the row order
    # (and anything derived from it) reproducible across runs and machines.
    for path in sorted(glob(pattern)):
        # Assumes the shards are UTF-8, as JSON normally is; being explicit
        # avoids depending on the platform's default encoding.
        with open(path, encoding='utf-8') as fh:
            for line in tqdm_notebook(fh):
                if not line.strip():
                    # Tolerate blank/trailing lines instead of failing to parse.
                    continue
                tweet = ujson.loads(line)
                tweets.append(entity_re.sub('', tweet['body']))

    return tweets

In [100]:
# Tweet texts used as the rural class below. Only shards whose names match
# 'part-0000*' are read.
# NOTE(review): 'geo-lt10k' presumably means places with population < 10k - confirm.
rural = read_tweets('../../data/geo-lt10k.json/part-0000*')































In [101]:
# Tweet texts used as the urban class below. Only shards whose names match
# 'part-0000*' are read.
# NOTE(review): 'geo-gt1m' presumably means places with population > 1M - confirm.
urban = read_tweets('../../data/geo-gt1m.json/part-0000*')































In [103]:
# Single corpus with all rural texts first, then all urban texts; the label
# vector `y` is built in the same order.
X = rural + urban

In [104]:
# Labels in the same order as `rural + urban`: -1 for each rural text,
# then +1 for each urban text.
y = [-1] * len(rural)
y += [1] * len(urban)

In [105]:
# Total number of labelled texts (rural + urban).
len(y)

8499039

In [108]:
# Bag-of-words token counts, with the vocabulary capped at 5000 terms.
cv = CountVectorizer(max_features=5000)

In [109]:
# Fit the vocabulary and build the document-term matrix straight from the raw
# text lists. Reading from `rural + urban` rather than from `X` keeps this cell
# re-runnable: `X` is rebound to the sparse matrix here, so running
# `cv.fit_transform(X)` a second time would pass the matrix, not the texts.
X = cv.fit_transform(rural + urban)

In [110]:
# Inspect the shape and sparsity of the document-term matrix.
X

<8499039x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 60338933 stored elements in Compressed Sparse Row format>

In [111]:
# Hold out a test set (train_test_split's default keeps 25% for testing).
# Pinning random_state makes the shuffle, and therefore every score computed
# from this split, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [112]:
# Logistic regression with scikit-learn's default settings, trained on the
# training split only.
model = LogisticRegression()
model.fit(X_train, y_train)

In [113]:
# Predicted labels (-1 rural / +1 urban) for the held-out texts.
y_pred = model.predict(X_test)

In [114]:
# F1 with scikit-learn's defaults (average='binary', pos_label=1): this scores
# the urban class (label 1) only, so read it together with the accuracy.
f1_score(y_test, y_pred)

0.85360321764462765

In [128]:
# Fraction of held-out texts whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.74710037839567767

In [115]:
# Learned weights, one per vocabulary column. For this binary -1/+1 problem a
# positive weight pushes toward label 1 (urban), a negative one toward -1 (rural).
model.coef_

array([[-0.44186456, -0.00806768,  0.66545517, ...,  0.0022634 ,
        -0.0267197 , -0.16827428]])

In [116]:
# Vocabulary terms in column order, so names[i] labels model.coef_[0][i].
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(cv, 'get_feature_names_out'):
    names = cv.get_feature_names_out()
else:
    names = cv.get_feature_names()

In [117]:
# Sanity check: one name per vocabulary column (max_features was 5000).
len(names)

5000

In [118]:
# Column indices ordered from the lowest weight upward, so the terms that
# pull hardest toward the rural label (-1) come first.
rural_idxs = np.argsort(model.coef_[0])

In [125]:
# Print weight and term for the 200 lowest-weighted (most rural-leaning) columns.
for idx in rural_idxs[:200]:
    print(model.coef_[0, idx], names[idx])

-2.92395005101 honeoye
-1.75409972724 gust
-1.71168189639 barometer
-1.57260493008 varsity
-1.52280689246 softball
-1.49325144942 jv
-1.46019827957 earning
-1.41942667461 mt
-1.15582969952 county
-1.11825320593 volleyball
-1.08586255935 edt
-1.08231119509 listing
-1.05809574967 hum
-1.0494358308 nc
-0.993028871007 fishing
-0.987780744305 seniors
-0.966698144091 temp
-0.948708919993 tigers
-0.933649193292 mrs
-0.892452710379 kentucky
-0.858793592506 00inches
-0.845805078512 iowa
-0.831447143423 0mph
-0.811833518591 ct
-0.810181157637 forecast
-0.807960805247 creek
-0.802213969697 psa
-0.789060108767 coins
-0.77291222652 serial
-0.752462622716 lions
-0.741123947002 farm
-0.736960075129 unknown
-0.716160286624 visits
-0.710025761661 tn
-0.706479807732 halftime
-0.697658167813 storms
-0.687811255708 ford
-0.674680581052 gaga
-0.669157992492 patch
-0.668649017773 racing
-0.658019023152 michigan
-0.657738614243 regional
-0.656572988387 georgia
-0.656404404718 rewards
-0.650069842053 thankful

In [120]:
# Column indices ordered from the highest weight downward, so the terms that
# pull hardest toward the urban label (+1) come first.
urban_idxs = model.coef_[0].argsort()[::-1]

In [126]:
# Print weight and term for the 200 highest-weighted (most urban-leaning) columns.
for idx in urban_idxs[:200]:
    print(model.coef_[0, idx], names[idx])

4.2619756395 gmt
3.70744543294 helloo
3.57815827875 fornadine
2.80640933171 node
2.77747956847 adderall
1.86321317215 nigerian
1.77560003293 az
1.77491596859 asl
1.75615688029 chicago
1.67602691932 angeles
1.64713557651 philadelphia
1.56568068173 houston
1.53403015353 ca
1.50323156553 lmfaooooo
1.50121767388 lmfaoooo
1.45237107975 dallas
1.43519594317 astros
1.41047493333 wire
1.40044975357 spotted
1.35879233739 dodgers
1.33944279435 startup
1.32980584007 lmfaooo
1.32565975183 lmfaoo
1.29980037498 lmaoooo
1.29906486072 brooklyn
1.29819023793 il
1.25429416094 phoenix
1.24913265807 lmaooo
1.23353600992 trains
1.22454921104 tx
1.20678897133 diego
1.1983502237 lmaoo
1.17770521928 blvd
1.17218758487 shorty
1.15625504389 nyt
1.15215711634 nyc
1.12954496984 ucla
1.12464771154 incident
1.0848002217 mixtape
1.08379746473 arizona
1.08091189882 pill
1.04478482607 articles
1.04274956581 lyft
1.04059798071 tix
1.04013969858 lmaooooo
1.02982078827 snorlax
1.02202568826 submit
1.00060914536 philly
0.