# Urban vs. rural, MA

In [1]:
import ujson
import numpy as np
import re
import os

from tqdm import tqdm_notebook
from glob import glob

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
def read_tweets(pattern, fcount=None):
    
    paths = glob(pattern)
    
    if fcount:
        paths = paths[:fcount]
    
    tweets = []
    for path in paths:
        
        with open(path) as fh:
            for line in fh:
                tweet = ujson.loads(line)
                text = re.sub('(#|@|http)\S+', '', tweet['body'])
                tweets.append(text)
                
    return tweets

In [4]:
rural = read_tweets('../../data/ma-rural.json/*.json')

In [5]:
urban = read_tweets('../../data/ma-boston.json/*.json')

In [6]:
X = rural + urban

In [8]:
y = ([0] * len(rural)) + ([1] * len(urban))

In [9]:
len(X)

6479164

In [10]:
def train_model(X, y, ngram_range=(1, 1), max_features=1000):
    
    cv = CountVectorizer(
        ngram_range=ngram_range,
        max_features=max_features,
    )
    
    X = cv.fit_transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    model = LogisticRegression()
    fit = model.fit(X_train, y_train)
    
    y_pred = fit.predict(X_test)
    
    report = classification_report(
        y_test, y_pred, target_names=('rural', 'urban'),
    )
    
    print(report)
    
    return cv, model

In [11]:
def print_rural(model, names, n=100):
    idxs = model.coef_[0].argsort()
    for idx in idxs[:n]:
        print(model.coef_[0][idx], names[idx])

In [12]:
def print_urban(model, names, n=100):
    idxs = np.flip(model.coef_[0].argsort(), 0)
    for idx in idxs[:n]:
        print(model.coef_[0][idx], names[idx])

In [13]:
ng1_cv, ng1_model = train_model(X, y, (1, 1))

             precision    recall  f1-score   support

      rural       0.60      0.09      0.15    589392
      urban       0.65      0.97      0.78   1030399

avg / total       0.63      0.65      0.55   1619791



In [17]:
ng2_cv, ng2_model = train_model(X, y, (2, 2))

             precision    recall  f1-score   support

      rural       0.61      0.07      0.12    589542
      urban       0.65      0.98      0.78   1030249

avg / total       0.63      0.65      0.54   1619791



In [18]:
ng3_cv, ng3_model = train_model(X, y, (3, 3))

             precision    recall  f1-score   support

      rural       0.70      0.04      0.08    590159
      urban       0.64      0.99      0.78   1029632

avg / total       0.67      0.64      0.52   1619791



Why does precision diverge so much in the bigram / trigram models?

# Rural unigrams

In [14]:
names1 = ng1_cv.get_feature_names()

In [15]:
print_rural(ng1_model, names1)

-2.02698872156 inhg
-1.66383952218 tune
-1.52531048562 dew
-1.45348569115 temp
-1.36326584455 meteobridge
-0.952830835429 album
-0.932218472857 automatically
-0.868415076634 00
-0.682956662167 pm
-0.604653906709 posted
-0.57879126007 boys
-0.525841403805 liked
-0.524926712575 rain
-0.505220031029 baro
-0.500786972735 sale
-0.496239287935 school
-0.471438693133 wind
-0.459794781178 article
-0.458780588326 road
-0.457470835251 congratulations
-0.444986053832 town
-0.433731409541 snow
-0.409118232202 shop
-0.407218217147 weather
-0.405409112251 playlist
-0.401209971882 drinking
-0.382123715052 area
-0.379132610414 spring
-0.378165919116 luck
-0.372609548318 lmfao
-0.36049754178 gt
-0.346299699308 august
-0.343608815658 enter
-0.342554760595 center
-0.340608711913 box
-0.334362042465 meeting
-0.332196773896 girls
-0.328813379276 added
-0.326303235192 market
-0.325667207983 local
-0.325270824553 greatest
-0.323993332625 open
-0.314681085411 space
-0.308061147943 2nd
-0.303404389935 visit
-0

# Urban unigrams

In [16]:
print_urban(ng1_model, names1)

3.80907821051 eastern
2.6412464023 wire
1.52931368697 من
1.38820291835 solo
1.15928581508 boston
1.15134329168 bruins
1.0984956611 الله
0.905394573572 recommend
0.820178877674 por
0.733627228102 details
0.718628475466 amazon
0.718367772532 celtics
0.670731439063 nba
0.668023383826 apply
0.585885668126 fit
0.563867117196 sox
0.558563774723 playing
0.547708800489 report
0.529489408663 jersey
0.528208059679 access
0.502443916391 data
0.501130236935 rt
0.486923973541 opening
0.486772367137 listen
0.477635942668 thomas
0.472956419329 york
0.467709984798 train
0.431850171025 marketing
0.418089920125 tweets
0.414858187999 brand
0.368677705268 until
0.340558712349 google
0.337254427541 99
0.334732550943 research
0.321364852326 digital
0.32018591512 tickets
0.313750593826 sales
0.309864852742 tech
0.300210803256 case
0.296698697801 says
0.283155205827 content
0.277900752873 live
0.274708484886 hi
0.274217660092 radio
0.270864441348 men
0.268758868614 la
0.2660689556 latest
0.26027752066 stream


# Rural bigrams

In [19]:
names2 = ng2_cv.get_feature_names()

In [20]:
print_rural(ng2_model, names2)

-4.9340273553 dew point
-2.91335962289 the album
-2.68794271734 pinned to
-2.0978639792 tune in
-1.75339365439 open house
-1.59950809549 00 pm
-1.58551285047 added video
-1.40644332212 automatically checked
-1.35279313274 the market
-1.23196105135 inhg via
-1.23196105135 via meteobridge
-1.2074298267 for sale
-1.17652872951 click for
-0.883216718488 to facebook
-0.850575658896 people in
-0.834859586663 2017 at
-0.830861400954 for chance
-0.672094847698 liked video
-0.671677015551 happy birthday
-0.62814251575 your favorite
-0.627562509675 lt gt
-0.623124384397 high school
-0.621264387894 posted new
-0.604699965477 gt gt
-0.599055457178 baro 29
-0.552269319657 rain 00in
-0.552269319657 00in baro
-0.526375078164 0mph rain
-0.523600216545 see our
-0.516465340455 best of
-0.509924601434 real estate
-0.485164936393 help you
-0.48318813798 the day
-0.480107414414 posted photo
-0.467424307182 ma and
-0.458248176333 more for
-0.437743154587 me automatically
-0.436969153759 click to
-0.42072732

# Urban bigrams

In [21]:
print_urban(ng2_model, names2)

6.12536510851 us eastern
5.54221168615 solo por
5.43281087334 com gt
5.31267103533 via amazon
4.71225490013 report at
4.11133515366 listen live
3.22432609206 boston celtics
2.95267707896 boston bruins
1.57853616109 boston ma
1.47889395342 2017 us
1.46435557936 on amp
1.42935787839 isaiah thomas
1.33202311974 for details
1.18417333517 now playing
1.08801056319 and click
1.0796628803 re click
1.07083180932 via nyt
1.02970817563 ma check
0.944211140802 latest ma
0.883205644034 in boston
0.864872595202 great fit
0.807049147626 live on
0.699560528616 recommend anyone
0.69095971222 red sox
0.616896166718 to boston
0.607276457083 gt lt
0.594191771918 opening here
0.564448485736 here ma
0.560135177117 and on
0.547614018277 checked by
0.546488625127 how are
0.540886374871 ma click
0.533918870695 it here
0.529796424789 the boston
0.499540994821 now on
0.467247919867 chance of
0.453036420769 for work
0.412577489879 daily thanks
0.410070872772 ma view
0.37775239953 see it
0.372320848403 video to
0

# Rural trigrams

In [22]:
names3 = ng3_cv.get_feature_names()

In [23]:
print_rural(ng3_model, names3)

-5.57083507864 on radio danz
-3.63054970132 wind mph barometer
-3.34518541373 rain today 00
-3.280014639 is on q106
-3.280014639 on q106 country
-3.1255147782 falls ny weather
-3.1255147782 honeoye falls ny
-3.06769921745 example twitter weather
-3.06769921745 twitter weather data
-3.01303028592 in steady temperature
-2.88289683454 wind speed 0mph
-2.58945415774 national weather service
-1.99928366957 today 00 in
-1.52307186509 more and more
-1.48913220091 left in the
-1.46045594102 just checked in
-1.39246784115 good luck to
-1.36559091991 and people unfollowed
-1.32264994605 new photo to
-1.25231886349 out my broadcast
-1.24434024686 to your profile
-1.23078147828 lt gt gt
-1.15512673819 cast your vote
-1.14720757242 unit name chet
-1.14720757242 motion detection unit
-1.14720757242 name chet home
-1.14720757242 home date time
-1.14720757242 chet home date
-1.14720757242 detection unit name
-1.10396502787 now available for
-1.09330066176 happy birthday hope
-1.07049714751 video to fa

# Urban trigrams

In [24]:
print_urban(ng3_model, names3)

4.90555537594 flight spotted at
4.7252154362 10 minute guide
4.53995590192 miles away traveling
3.52502750963 airlines flight spotted
3.23136182996 york city news
2.67151760388 work in ca
2.50413651048 in san diego
2.34591697654 in los angeles
2.1258235665 post your tweet
2.09473678644 surely interest you
2.09473678644 should surely interest
2.09473678644 this should surely
2.02063228519 your tweet also
2.02063228519 tweet also at
1.89277433479 can you recommend
1.51547029993 com gt gt
1.45381027246 click to apply
1.42684169082 00 humidity is
1.42684169082 gmt 0000 utc
1.42684169082 current temperature is
1.42684169082 0000 utc current
1.42684169082 utc current temperature
1.39531984685 you are looking
1.23361951049 followed me and
1.13122187042 new video to
1.11397250174 checked in at
1.06937498641 link in bio
0.935476410747 latest opening here
0.928509021713 in with download
0.867254586975 click for details
0.842969446075 00 in humidity
0.784023203742 the 10 minute
0.776872207472 con