In [1]:
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Preprocessing

In [2]:
%%time
db = MongoClient('mongodb://143.215.138.132:27017')['big_data']

matchNE = {'$match': {'lat': {'$gte': 36, '$lte': 50}, 'lon': {'$gte': -99, '$lte': -69}}}
matchSE = {'$match': {'lat': {'$gte': 25, '$lte': 36}, 'lon': {'$gte': -99, '$lte': -69}}}
matchNW = {'$match': {'lat': {'$gte': 36, '$lte': 50}, 'lon': {'$gte': -125, '$lte': -99}}}
matchSW = {'$match': {'lat': {'$gte': 25, '$lte': 36}, 'lon': {'$gte': -125, '$lte': -99}}}

sentence_list = []
location_list = []

limit = {'$limit': 100000}

pipeline = [matchNE, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('NE')

pipeline = [matchSE, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('SE')

CPU times: user 5.33 s, sys: 3.43 s, total: 8.76 s
Wall time: 22.3 s


In [3]:
# Add your own sentence here
your_sentence = "Georgia Tech is in Atlanta"
sentence_list.append(your_sentence)

In [4]:
class Tweet_Tokenizer(object):
    def __init__(self):
        self.wnl = TweetTokenizer()
    def __call__(self, doc):
        return self.wnl.tokenize(doc)

def make_features(corpus):
    vectorizer = CountVectorizer(tokenizer=Tweet_Tokenizer(), analyzer='word', min_df=0)
    return vectorizer.fit_transform(corpus), vectorizer.get_feature_names()

In [5]:
%%time
sentence_vector_list, vector_name_list = make_features(sentence_list)
your_sentence_vector = sentence_vector_list[-1]
sentence_vector_list = sentence_vector_list[:-1]

CPU times: user 11 s, sys: 110 ms, total: 11.1 s
Wall time: 11.1 s


# Training

In [6]:
training_vectors, test_vectors, training_locations, test_locations =\
    train_test_split(sentence_vector_list, location_list, test_size=0.1, random_state=999)

In [7]:
lr_clf = LogisticRegression(solver='saga', max_iter=5000)

In [8]:
%%time
lr_clf.fit(training_vectors, training_locations)

CPU times: user 2min 45s, sys: 195 ms, total: 2min 45s
Wall time: 2min 45s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='saga', tol=0.0001,
          verbose=0, warm_start=False)

# Results

In [9]:
%%time
predicted_test_locations = lr_clf.predict(test_vectors)
predicted_your_sentence_location = lr_clf.predict(your_sentence_vector)

CPU times: user 3.02 ms, sys: 3.73 ms, total: 6.75 ms
Wall time: 5.34 ms


In [10]:
print("Test set accuracy: "\
      + str(accuracy_score(test_locations, predicted_test_locations)))

Test set accuracy: 0.6623


In [11]:
print("Result of your sentence: " + str(predicted_your_sentence_location))

Result of your sentence: ['SE']


# Analysis

In [12]:
theta = lr_clf.coef_[0]
weights = dict()
for vector, weight in zip(vector_name_list, theta):
    weights[vector] = weight

In [13]:
top_vectors = sorted(weights, key=lambda x:weights[x], reverse=True)[:50]
for vector_name in top_vectors:
    print(str(vector_name) + '\t' + str(weights[vector_name]))

tx	4.07853329354
#txhsfb	3.38232510753
fl	3.17132681582
@karn33333	3.00236066592
houston	2.9703543303
@negrosubversive	2.92659874814
jacksonville	2.8080186635
#tcprepzone	2.7828139948
#ghc16	2.77569710474
tampa	2.76449383613
#cubevenue	2.67474070071
alabama	2.60666753058
#hometeamfb	2.58899386067
orleans	2.58200502859
#alpreps	2.48388829074
orlando	2.46893300019
#dateline	2.4220914986
#dallas	2.41539083607
ga	2.35231840945
#arpreps	2.34553786214
@__kissesileft	2.30779008797
#hamildocpbs	2.30672097748
@osvarsity	2.2870243328
#memphis	2.27998207796
georgia	2.23822461308
#oklahomacity	2.22140570574
#bloodofsylas	2.22126255519
#gvlfootball	2.18690641335
greenville	2.18547116131
@do_confidence	2.15306502591
br	2.11322241535
#mspreps	2.11157120974
hhn	2.10782879629
#florida	2.10466939442
roos	2.10225661968
#wstc	2.10053095835
texas	2.07922391169
#preds	2.0637977994
fwy	2.05507209141
dallas	2.05359048919
dp	2.04919796911
huffman	2.02922513666
atlanta	2.01419714718
@lookatmechange	2.0128891525

In [14]:
top_vectors = sorted(weights, key=lambda x:weights[x], reverse=False)[:50]
for vector_name in top_vectors:
    print(str(vector_name) + '\t' + str(weights[vector_name]))

#wvprepfb	-3.09928286231
il	-2.98630014062
nj	-2.90409547636
mn	-2.87648988857
#nyc	-2.68685094284
#toronto	-2.59433638934
incident	-2.58722860407
md	-2.49590550853
@thsbluedevils	-2.48198811616
@benpage11benp	-2.46034771432
@ashcrofttom	-2.34807087679
ia	-2.29496947421
#isles	-2.25631913525
rainy	-2.21877424729
toronto	-2.21118013732
salem	-2.20715776867
wi	-2.14873555434
#wisfb	-2.14668510867
@mickersmith20	-2.10185591986
va	-2.09213754566
#chicago	-2.04235528776
#blackhawks	-2.01278413269
#iahsfb	-2.00199307237
@sandplague	-1.99957545543
pa	-1.99559640312
#nashville	-1.99095340082
cincinnati	-1.98503736814
@annaiya_ruffin	-1.95725895692
pennridge	-1.95491294989
@kalamazoowings	-1.94828433865
lebanon	-1.94367524433
@ellis_pemrick20	-1.9268225726
montreal	-1.92010115881
pittsburgh	-1.91339560506
manhattan	-1.91227509705
brooklyn	-1.91150233072
#boston	-1.90868362113
ontario	-1.89892597328
#newyork	-1.89223492544
@kanova	-1.87069255662
ks	-1.85232012596
philadelphia	-1.84920783352
midd

In [15]:
# Find more in Chapter 1 and 2 of the book below
# https://github.com/jacobeisenstein/gt-nlp-class/blob/master/notes/eisenstein-nlp-notes.pdf