In [1]:
import ujson
import numpy as np
import re

from tqdm import tqdm_notebook
from glob import glob

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
def read_tweets(pattern):
    """Load cleaned tweet texts from every file matching a glob pattern.

    Each matching file is read as newline-delimited JSON: every non-blank
    line is parsed with ``ujson`` and its ``'body'`` field is used as the
    tweet text. Any run of non-whitespace characters that begins at ``#``,
    ``@`` or ``http`` is removed from the text.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the input files.

    Returns
    -------
    list of str
        One cleaned text per parsed line; files are visited in sorted
        path order. Empty if no file matches.

    Raises
    ------
    KeyError
        If a parsed record has no ``'body'`` field.
    ValueError
        If a non-blank line is not valid JSON.
    """
    # Raw string: '\S' inside a plain literal is an invalid escape sequence.
    # Compiled once so the pattern is not re-resolved for every line.
    strip_tokens = re.compile(r'(#|@|http)\S+')

    tweets = []
    # glob() yields paths in arbitrary order; sort so repeated runs build the same list.
    for path in sorted(glob(pattern)):
        # The corpus contains non-ASCII text, so don't depend on the platform's
        # default encoding. Assumes the shards are UTF-8 — TODO confirm.
        with open(path, encoding='utf-8') as fh:
            for line in tqdm_notebook(fh):
                if not line.strip():
                    # Skip blank lines (e.g. a trailing newline) instead of failing to parse.
                    continue
                tweet = ujson.loads(line)
                tweets.append(strip_tokens.sub('', tweet['body']))

    return tweets

In [3]:
# Cleaned tweet texts from the geo-lt10k shards matching part-0000*.
# Presumably tweets geolocated to places with population < 10k — TODO confirm.
rural = read_tweets('../../data/geo-lt10k.json/part-0000*')































In [4]:
# Cleaned tweet texts from the geo-gt1m shards matching part-0000*.
# Presumably tweets geolocated to places with population > 1M — TODO confirm.
urban = read_tweets('../../data/geo-gt1m.json/part-0000*')































In [5]:
# Full corpus: every rural text followed by every urban text (the label list y uses the same order).
X = rural + urban

In [6]:
# Labels aligned with X: -1 for each rural tweet, then +1 for each urban tweet.
y = ([-1] * len(rural)) + ([1] * len(urban))

In [7]:
# Total number of labelled tweets (rural + urban).
len(y)

8499039

In [8]:
# Count features over trigrams only (ngram_range=(3, 3)), capped at 5000 vocabulary entries.
cv = CountVectorizer(max_features=5000, ngram_range=(3, 3))

In [9]:
# Vectorize straight from the raw text lists rather than from X itself: the
# original `X = cv.fit_transform(X)` consumed and rebound X, so running the
# cell a second time fed the sparse matrix back into the vectorizer.
# Row order is unchanged (rural first, then urban), so it still lines up with y.
# NOTE(review): the vocabulary is fit on the whole corpus before the
# train/test split, so held-out tweets influence which trigrams are kept.
X = cv.fit_transform(rural + urban)

In [10]:
# Display the matrix summary (shape, dtype, number of stored elements).
X

<8499039x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 4496483 stored elements in Compressed Sparse Row format>

In [11]:
# Fixed seed so the same rows land in train and test on every run; without it
# the score and coefficients below change each time the notebook is executed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
# Fit a logistic regression with scikit-learn's default hyper-parameters on the training split.
model = LogisticRegression().fit(X_train, y_train)

In [13]:
# Predicted labels (-1 or 1) for the held-out split.
y_pred = model.predict(X_test)

In [14]:
# Binary F1 on the held-out split; with f1_score's default pos_label=1 this
# scores the +1 (urban) class only.
f1_score(y_test, y_pred)

0.85348362906263397

In [15]:
# Learned weights: a single row with one coefficient per trigram feature.
model.coef_

array([[ 0.23780918, -0.33597919,  0.12097766, ..., -0.19204023,
         0.11545383,  3.38003774]])

In [16]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
# Either result supports len() and integer indexing, which is all the
# following cells need.
names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

In [17]:
# Vocabulary size; expected to match the vectorizer's max_features (5000).
len(names)

5000

In [18]:
# Feature indices ordered by ascending coefficient, so the most negative
# weights (pushing towards label -1, rural) come first.
rural_idxs = model.coef_[0].argsort()

In [19]:
# Print the 200 most negative coefficients with their trigrams:
# the strongest indicators of the rural (-1) class.
for idx in rural_idxs[:200]:
    print(model.coef_[0][idx], names[idx])

-6.34903782771 on radio danz
-5.18137840584 page received at
-4.74381515143 what beautiful world
-3.38853001433 article of the
-3.35343438007 pm forecast tonight
-3.33421508969 is on q106
-3.33421508969 on q106 country
-3.31398445897 in falling temperature
-3.19835451982 on jacket radio
-3.19835451982 playing on jacket
-3.1877918099 falls ny weather
-3.1877918099 honeoye falls ny
-3.13240542273 wind speed 0mph
-3.12374535135 twitter weather data
-3.12374535135 example twitter weather
-2.98594332347 in steady temperature
-2.92048223165 wind mph barometer
-2.8873013707 larvitar has appeared
-2.86757628383 rain today 00
-2.4561407031 missed it new
-2.30322245765 love to show
-2.07664119155 less than 20
-1.82824429745 at the half
-1.82573622241 mph barometer 29
-1.81288057325 free entry all
-1.78227977389 remains in effect
-1.74995685881 to your profile
-1.73255755706 national weather service
-1.55091435101 will be closed
-1.54165407966 lt gt gt
-1.52992374593 new photo to
-1.51192724937 h

In [20]:
# Feature indices ordered by descending coefficient, so the most positive
# weights (pushing towards label +1, urban) come first.
urban_idxs = model.coef_[0].argsort()[::-1]

In [21]:
# Print the 200 most positive coefficients with their trigrams:
# the strongest indicators of the urban (+1) class.
for idx in urban_idxs[:200]:
    print(model.coef_[0][idx], names[idx])

5.35465446815 wild larvitar has
4.24406224447 flight spotted at
3.85938020085 own it now
3.76183072078 work in az
3.74743600844 10 minute guide
3.73351771281 clean tune in
3.71310986295 popular on 500px
3.6848837289 miles away traveling
3.60029586379 now on air
3.5737900737 listen now submit
3.49629657934 usa pls rt
3.45816418018 hosted by djs
3.40912378159 abuse unresolved for
3.38003774018 更新 哈利波特 石內卜才新婚娶初戀今病逝
3.34478274323 no prescription needed
3.33987877272 news search businesses
3.27811895811 cleared incident on
3.27703931468 sharing the latest
3.26646380069 available at acc
3.25661880764 new project that
3.24908848236 incident on at
3.22972019726 division responding to
3.14011534491 los angeles california
3.13665565617 this in ca
3.11559665893 on both directions
3.11527336826 san diego ca
2.96907595721 nike air jordan
2.96359334068 this in tx
2.9629897444 opening here ca
2.96210441483 of her at
2.94576797432 it shirt get
2.9273618702 listen now at
2.87294793319 now trending in
2

- nicki minaj