# Urban vs. rural, combined model

In [1]:
import ujson
import numpy as np
import re
import os

from tqdm import tqdm_notebook
from glob import glob

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
def read_tweets(pattern, fcount=None):
    
    paths = glob(pattern)
    
    if fcount:
        paths = paths[:fcount]
    
    tweets = []
    for path in paths:
        
        with open(path) as fh:
            for line in fh:
                tweet = ujson.loads(line)
                text = re.sub('(#|@|http)\S+', '', tweet['body'])
                tweets.append(text)
                
    return tweets

In [3]:
rural = read_tweets('../../data/geo-lt10k.json/*.json', 60)

In [4]:
urban = read_tweets('../../data/geo-gt1m.json/*.json', 20)

In [5]:
rural_ = rural[:5000000]

In [6]:
urban_ = urban[:5000000]

In [7]:
X = rural_ + urban_

In [8]:
y = ([0] * len(rural_)) + ([1] * len(urban_))

In [9]:
def train_model(X, y, ngram_range=(1, 1), max_features=5000):
    
    cv = CountVectorizer(
        ngram_range=ngram_range,
        max_features=max_features,
    )
    
    X = cv.fit_transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    model = LogisticRegression()
    fit = model.fit(X_train, y_train)
    
    y_pred = fit.predict(X_test)
    
    report = classification_report(
        y_test, y_pred, target_names=('rural', 'urban'),
    )
    
    print(report)
    
    return cv, model

In [10]:
def print_rural(model, names, n=100):
    idxs = model.coef_[0].argsort()
    for idx in idxs[:n]:
        print(model.coef_[0][idx], names[idx])

In [11]:
def print_urban(model, names, n=100):
    idxs = np.flip(model.coef_[0].argsort(), 0)
    for idx in idxs[:n]:
        print(model.coef_[0][idx], names[idx])

In [12]:
cv, model = train_model(X, y, (1, 3), 5000)

             precision    recall  f1-score   support

      rural       0.57      0.63      0.60   1250201
      urban       0.59      0.52      0.55   1249799

avg / total       0.58      0.58      0.58   2500000



In [13]:
names = cv.get_feature_names()

In [14]:
print_rural(model, names)

-3.0620563895 wind mph
-2.25352476211 just checked in
-1.92349184241 in humidity
-1.62062621246 varsity
-1.56009539182 softball
-1.53803115317 0mph
-1.51296762932 jv
-1.51062805791 lady gaga
-1.51043106102 honeoye
-1.50552257552 new photo to
-1.4521541186 more and more
-1.3989904313 mt
-1.37000092702 and people unfollowed
-1.35647980027 temp
-1.35128636105 q106 country
-1.35128636105 on q106
-1.35128636105 on q106 country
-1.35128636105 is on q106
-1.34811410741 edt
-1.33945601818 honeoye falls
-1.23880017948 honeoye falls ny
-1.23880017948 falls ny weather
-1.18132507037 00 in
-1.17460225649 listing
-1.16150198451 county
-1.14682421764 gust
-1.1465649779 available until
-1.00746408354 forecast
-0.988593857706 daily thanks to
-0.977892921336 fishing
-0.949679969394 earning
-0.924271719527 mrs
-0.902792611651 more for
-0.848868668746 now playing
-0.837856044103 liked video
-0.836248957158 and one person
-0.827108879237 listen to
-0.802570356508 yasss it time
-0.801439322328 rain today 0

In [15]:
print_urban(model, names)

5.43171810903 flight spotted
2.58226917302 has appeared
1.89327706226 gt lt
1.6822911971 chicago
1.58639506052 az
1.5791550749 ny weather
1.55490872429 houston
1.55121115492 philadelphia
1.53096597677 in ca
1.49252368882 can you recommend
1.45822027227 temperature
1.44261830194 in tx
1.43535342126 incident
1.4023625555 angeles
1.39224071814 dallas
1.38690433386 wire
1.29260366612 gt gt
1.28832214387 il
1.28118779423 brooklyn
1.28043585478 phoenix
1.2381390856 followed me and
1.1735624452 lmaoo
1.16200662445 nyc
1.12209469622 lmaoooo
1.11751729313 tix
1.10435111042 ca
1.08907364759 lmaooo
1.08027349769 arizona
1.05681955496 lakers
1.05146749445 philly
1.03009858074 mixtape
0.991154824973 ave
0.985268724958 bulls
0.951521955975 today 00 in
0.932210441335 spurs
0.930252554106 san diego
0.911461545956 lmfao
0.910003205551 click for details
0.907742240505 cubs
0.906062397312 trending
0.89884979869 mfs
0.896729318412 articles
0.891352801981 submit
0.891065825365 listen
0.889277551982 humidit