In [1]:
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Preprocessing

In [2]:
%%time
db = MongoClient('mongodb://143.215.138.132:27017')['big_data']

matchNE = {'$match': {'lat': {'$gte': 36, '$lte': 50}, 'lon': {'$gte': -99, '$lte': -69}}}
matchSE = {'$match': {'lat': {'$gte': 25, '$lte': 36}, 'lon': {'$gte': -99, '$lte': -69}}}
matchNW = {'$match': {'lat': {'$gte': 36, '$lte': 50}, 'lon': {'$gte': -125, '$lte': -99}}}
matchSW = {'$match': {'lat': {'$gte': 25, '$lte': 36}, 'lon': {'$gte': -125, '$lte': -99}}}

sentence_list = []
location_list = []

limit = {'$limit': 10000}

pipeline = [matchNE, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('NE')

pipeline = [matchSE, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('SE')

CPU times: user 578 ms, sys: 362 ms, total: 940 ms
Wall time: 1.63 s


In [3]:
# Add your own sentence here
your_sentence = "Georgia Tech is in Atlanta"
sentence_list.append(your_sentence)

In [4]:
class Tweet_Tokenizer(object):
    def __init__(self):
        self.wnl = TweetTokenizer()
    def __call__(self, doc):
        return self.wnl.tokenize(doc)

def make_features(corpus):
    vectorizer = CountVectorizer(tokenizer=Tweet_Tokenizer(), analyzer='word', min_df=0)
    return vectorizer.fit_transform(corpus), vectorizer.get_feature_names()

In [5]:
%%time
sentence_vector_list, vector_name_list = make_features(sentence_list)
your_sentence_vector = sentence_vector_list[-1]
sentence_vector_list = sentence_vector_list[:-1]

CPU times: user 1.27 s, sys: 14.5 ms, total: 1.29 s
Wall time: 1.29 s


# Training

In [6]:
training_vectors, test_vectors, training_locations, test_locations =\
    train_test_split(sentence_vector_list, location_list, test_size=0.1, random_state=999)

In [7]:
lr_clf = LogisticRegression(solver='saga', max_iter=5000)

In [8]:
%%time
lr_clf.fit(training_vectors, training_locations)

CPU times: user 8.66 s, sys: 39.2 ms, total: 8.7 s
Wall time: 8.73 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='saga', tol=0.0001,
          verbose=0, warm_start=False)

# Results

In [9]:
%%time
predicted_test_locations = lr_clf.predict(test_vectors)
predicted_your_sentence_location = lr_clf.predict(your_sentence_vector)

CPU times: user 975 µs, sys: 641 µs, total: 1.62 ms
Wall time: 813 µs


In [10]:
print("Test set accuracy: "\
      + str(accuracy_score(test_locations, predicted_test_locations)))

Test set accuracy: 0.603


In [11]:
print("Result of your sentence: " + str(predicted_your_sentence_location))

Result of your sentence: ['SE']


# Analysis

In [12]:
theta = lr_clf.coef_[0]
weights = dict()
for vector, weight in zip(vector_name_list, theta):
    weights[vector] = weight

In [16]:
top_vectors = sorted(weights, key=lambda x:weights[x], reverse=True)[:50]
for vector_name in top_vectors:
    print(str(vector_name) + '\t' + str(weights[vector_name]))

houston	2.03916361516
@taylorswift13	1.93280219125
opening	1.78577776519
tx	1.78154344494
@negrosubversive	1.77534699241
fl	1.76651045191
@gibson326	1.75794252108
texas	1.69164415998
alabama	1.69099897328
#ghc16	1.45043592821
atlanta	1.41609021516
ion	1.3549924556
orleans	1.35029390226
@55mmbae	1.33068500622
polls	1.32811055879
mama	1.28135215608
#okctraffic	1.27782500367
austin	1.270749194
charlotte	1.25774493567
fw	1.25456685261
sc	1.22913620726
nc	1.21543019297
🤒	1.2098019497
beach	1.2004184629
@katramsland	1.19926783846
#1gottago	1.18522323322
@mycfe	1.1843731842
johnson	1.16925587698
hoe	1.16024087735
p	1.15839373215
☀	1.15799621163
whiskey	1.15503614142
folks	1.15084046049
raleigh	1.14302668419
aunt	1.14244413643
@khalifist	1.11998556428
yummy	1.11877133136
control	1.10914421088
@gavgordontogo	1.10389939288
mf	1.0966530289
bama	1.09609304313
police	1.0864919358
@thedonguru	1.08510264994
@iongviewing	1.08482170696
actions	1.0796589564
debate	1.0781678393
https://t.co/pialycdqdn	1.

In [17]:
top_vectors = sorted(weights, key=lambda x:weights[x], reverse=False)[:50]
for vector_name in top_vectors:
    print(str(vector_name) + '\t' + str(weights[vector_name]))

york	-1.57735428298
cubs	-1.57368724741
ny	-1.55998375532
#specialreport	-1.54442470785
bars	-1.44499905775
#nyc	-1.43483473118
incident	-1.40446169209
toronto	-1.38592934861
missin	-1.34203471149
washington	-1.33164338531
within	-1.32072387962
detroit	-1.31480376078
nj	-1.28324261706
easier	-1.27787772916
@dineshdsouza	-1.27743409063
rather	-1.25527050986
ebay	-1.24816515036
howard	-1.237592684
@steff__r	-1.22108434211
wet	-1.20139623436
26	-1.19576524609
blind	-1.19291275996
xo	-1.17857089658
rain	-1.16456081854
@bottomoso	-1.16395997575
dc	-1.16196697097
window	-1.15385707858
waking	-1.13980811632
democratic	-1.13090845181
boone	-1.13035143912
warm	-1.12288546166
virginia	-1.12002321119
philly	-1.11229393236
michigan	-1.10974738037
rainy	-1.10454054819
answers	-1.09252476786
nyc	-1.08823954012
goin	-1.08751517383
purdue	-1.0862995209
#ictfl16	-1.08047203777
nope	-1.0775590726
#happyfriday	-1.07660973141
chuck	-1.07247078973
@dame_lillard	-1.07150029319
hahahaha	-1.07045754172
indian