In [1]:
import ujson
import numpy as np
import re

from tqdm import tqdm_notebook
from glob import glob

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
def read_tweets(pattern):
    """Load cleaned tweet texts from newline-delimited JSON files.

    Every file matching ``pattern`` is read line by line. Each non-blank
    line is parsed as one JSON object and its ``'body'`` field is taken as
    the tweet text. Any ``#``, ``@`` or ``http`` followed by one or more
    non-whitespace characters (hashtags, mentions, links) is removed.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the input files.

    Returns
    -------
    list of str
        Cleaned texts, files visited in sorted path order and lines in
        file order.

    Raises
    ------
    KeyError
        If a parsed record has no ``'body'`` field.
    """
    # Raw string: '\S' inside a plain literal is an invalid escape sequence.
    # Compiled once here instead of being re-resolved for every tweet.
    entity_re = re.compile(r'(#|@|http)\S+')

    tweets = []
    # glob() yields paths in arbitrary order; sort so row order is reproducible.
    for path in sorted(glob(pattern)):
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(path, encoding='utf-8') as fh:
            for line in tqdm_notebook(fh):
                # Skip blank lines (e.g. a trailing newline) instead of
                # failing to parse them as JSON.
                if not line.strip():
                    continue
                tweet = ujson.loads(line)
                tweets.append(entity_re.sub('', tweet['body']))

    return tweets

In [3]:
# Tweets used as the "rural" class (labelled -1 further down). Only shards
# matching part-0000* are loaded.
# NOTE(review): `geo-lt10k` presumably means places with population < 10k — confirm.
rural = read_tweets('../../data/geo-lt10k.json/part-0000*')































In [4]:
# Tweets used as the "urban" class (labelled 1 further down). Only shards
# matching part-0000* are loaded.
# NOTE(review): `geo-gt1m` presumably means places with population > 1M — confirm.
urban = read_tweets('../../data/geo-gt1m.json/part-0000*')































In [5]:
# All raw tweet texts, rural first and then urban; the label vector has to be
# built in this same order.
X = rural + urban

In [6]:
# One label per tweet, in the same order as the texts: -1 = rural, 1 = urban.
y = [-1 for _tweet in rural] + [1 for _tweet in urban]

In [8]:
# Sanity check: total number of labels (rural + urban tweets).
len(y)

8499039

In [9]:
# Bag-of-bigrams features: ngram_range=(2, 2) keeps only two-word sequences,
# and max_features=5000 caps the vocabulary at the 5000 most frequent of them.
cv = CountVectorizer(max_features=5000, ngram_range=(2, 2))

In [10]:
# Build the matrix straight from the raw tweet lists rather than from `X`
# itself, so re-running this cell never feeds the sparse matrix back into the
# vectorizer. Row order (rural first, then urban) matches `y`.
X = cv.fit_transform(rural + urban)

In [11]:
# Inspect the document-term matrix: one row per tweet, one column per bigram.
X

<8499039x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 18775499 stored elements in Compressed Sparse Row format>

In [12]:
# Fixed seed so the split, and every score and coefficient derived from it,
# is the same on each run. The default test size is left unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
# Default-configured logistic regression, trained on the training split.
# fit() returns the estimator itself, so `model` ends up as the fitted classifier.
model = LogisticRegression()
model = model.fit(X_train, y_train)

In [14]:
# Predicted labels (-1 rural / 1 urban) for the held-out rows.
y_pred = model.predict(X_test)

In [15]:
# F1 on the held-out split. With f1_score's default pos_label=1 this scores
# the urban class (label 1).
f1_score(y_test, y_pred)

0.85295312919651989

In [16]:
# Learned weights, one per bigram feature (a single row for this two-class model).
model.coef_

array([[ 1.46348103, -2.43904233, -1.07456833, ..., -0.49804595,
        -0.47050567, -0.58310906]])

In [17]:
# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# `get_feature_names_out` and fall back only on older releases. Both results
# support len() and integer indexing, which is all the cells below need.
if hasattr(cv, 'get_feature_names_out'):
    names = cv.get_feature_names_out()
else:
    names = cv.get_feature_names()

In [18]:
# Should equal the vectorizer's max_features.
len(names)

5000

In [19]:
# Feature indices ordered from most negative to most positive weight, so the
# strongest rural (-1) indicators come first.
rural_idxs = np.argsort(model.coef_[0])

In [20]:
# Print weight and bigram for the 200 features that push hardest toward the
# rural label (-1).
weights = model.coef_[0]
for i in rural_idxs[:200]:
    print(weights[i], names[i])

-4.03738189656 wind speed
-3.97953232651 wind mph
-2.94907616178 q106 country
-2.94907616178 on q106
-2.79235234374 honeoye falls
-2.43904232814 00 in
-2.30791088353 falls ny
-1.98082987311 on radio
-1.7096884044 earning in
-1.61619590978 new story
-1.28978556463 severe thunderstorm
-1.26000866021 unknown unknown
-1.24471368045 chance of
-1.15381435744 your vote
-1.14398422344 left in
-1.14306957294 the half
-1.0826556172 rain today
-1.07456833293 00 pm
-1.03757683134 with download
-1.03292384096 lt gt
-1.02925405258 more and
-1.00653715499 lady gaga
-0.989923521926 congratulations to
-0.956384440191 available until
-0.940624135202 appeared available
-0.9315077964 win over
-0.919804553229 temperature is
-0.911046823897 love ya
-0.900920922881 recent follow
-0.875891039981 rain last
-0.862167256578 great job
-0.858970149391 thankful for
-0.850950255267 your profile
-0.849103589672 of july
-0.832282435284 bottom of
-0.824198318109 this afternoon
-0.806720172239 this evening
-0.7730190886

- weather
- lady gaga
- prayer / the lord
- high school sports
- love you / love ya
- best friend
- time of day - "this afternoon", "this evening"

In [21]:
# Same ordering as the ascending argsort, reversed: the strongest urban (1)
# indicators come first.
urban_idxs = model.coef_[0].argsort()[::-1]

In [22]:
# Print weight and bigram for the 200 features that push hardest toward the
# urban label (1).
weights = model.coef_[0]
for i in urban_idxs[:200]:
    print(weights[i], names[i])

4.40770230036 new node
4.3414131493 related articles
4.12172228575 found til
4.08028991105 news search
4.03161973699 flight spotted
3.81339071695 minute guide
3.77052724565 incident on
3.60264243759 trfc collision
2.88681189817 in il
2.72188275326 natural hair
2.54888152125 listen now
2.46595864817 in tx
2.3919713729 air jordan
2.32595846254 has appeared
2.3169274203 tweet also
2.3161983935 been broken
2.298593227 houston tx
2.29316119476 los angeles
2.2836142745 in az
2.27975780934 via twitter
2.27227232952 ca this
2.24601128442 ca check
2.22258920531 ca view
2.21277343363 trending in
2.15239089203 san diego
2.14969645554 pls rt
2.12526693353 dallas cowboys
2.04680095779 our growing
1.98220665741 chicago il
1.97964585606 should surely
1.96308000934 surely interest
1.95851601948 san jose
1.84093295906 ca click
1.75849872445 in chicago
1.74655228496 click link
1.72539682229 san antonio
1.69312355108 in dallas
1.68922710387 via nyt
1.65202313695 to chicago
1.590605582 in houston
1.566563

- nicki minaj
- race - "white people" "black people"