In [39]:
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Preprocessing

In [40]:
%%time
db = MongoClient('mongodb://143.215.138.132:27017')['big_data']

matchNE = {'$match': {'lat': {'$gte': 36, '$lte': 50}, 'lon': {'$gte': -99, '$lte': -69}}}
matchSE = {'$match': {'lat': {'$gte': 25, '$lte': 36}, 'lon': {'$gte': -99, '$lte': -69}}}
matchNW = {'$match': {'lat': {'$gte': 36, '$lte': 50}, 'lon': {'$gte': -125, '$lte': -99}}}
matchSW = {'$match': {'lat': {'$gte': 25, '$lte': 36}, 'lon': {'$gte': -125, '$lte': -99}}}

sentence_list = []
location_list = []

limit = {'$limit': 100000}

pipeline = [matchNE, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('NE')

pipeline = [matchSE, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('SE')

pipeline = [matchNW, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('NW')

pipeline = [matchSW, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('SW')

CPU times: user 11.6 s, sys: 7.5 s, total: 19.1 s
Wall time: 38.5 s


In [41]:
# Add your own sentence here
your_sentence = "Georgia Tech is in Atlanta"
sentence_list.append(your_sentence)

In [42]:
class Tweet_Tokenizer(object):
    def __init__(self):
        self.wnl = TweetTokenizer()
    def __call__(self, doc):
        return self.wnl.tokenize(doc)

def make_features(corpus):
    vectorizer = CountVectorizer(tokenizer=Tweet_Tokenizer(), analyzer='word', min_df=0)
    return vectorizer.fit_transform(corpus), vectorizer.get_feature_names()

In [43]:
%%time
sentence_vector_list, vector_name_list = make_features(sentence_list)
your_sentence_vector = sentence_vector_list[-1]
sentence_vector_list = sentence_vector_list[:-1]

CPU times: user 21.8 s, sys: 231 ms, total: 22.1 s
Wall time: 22 s


# Training

In [44]:
training_vectors, test_vectors, training_locations, test_locations =\
    train_test_split(sentence_vector_list, location_list, test_size=0.1, random_state=999)

In [45]:
lr_clf = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=5000)

In [46]:
%%time
lr_clf.fit(training_vectors, training_locations)

CPU times: user 9min 57s, sys: 665 ms, total: 9min 57s
Wall time: 9min 57s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

# Results

In [47]:
%%time
predicted_test_locations = lr_clf.predict(test_vectors)
predicted_your_sentence_location = lr_clf.predict(your_sentence_vector)

CPU times: user 9.58 ms, sys: 2.22 ms, total: 11.8 ms
Wall time: 10.7 ms


In [48]:
print("Test set accuracy: "\
      + str(accuracy_score(test_locations, predicted_test_locations)))

Test set accuracy: 0.4748


In [49]:
print("Result of your sentence: " + str(predicted_your_sentence_location))

Result of your sentence: ['SE']


# Analysis

In [50]:
# Find Top Weights in Logistic Regression Classifier
theta_NE = lr_clf.coef_[0]
theta_NW = lr_clf.coef_[1]
theta_SE = lr_clf.coef_[2]
theta_SW = lr_clf.coef_[3]
weights_NE = dict()
weights_NW = dict()
weights_SE = dict()
weights_SW = dict()

for feature, weight in zip(vector_name_list, theta_NE):
    weights_NE[feature] = weight
for feature, weight in zip(vector_name_list, theta_NW):
    weights_NW[feature] = weight
for feature, weight in zip(vector_name_list, theta_SE):
    weights_SE[feature] = weight
for feature, weight in zip(vector_name_list, theta_SW):
    weights_SW[feature] = weight

In [51]:
print("Top Scoring Logistic Regression Weights\n")
top_features_NE = sorted(weights_NE, key=lambda x:weights_NE[x], reverse=True)[:10]
for word in top_features_NE:
    print(str(word) + '\t\tNE\t\t' + str(weights_NE[word]))

print('\n')
print('################################')
print('\n')

top_features_NW = sorted(weights_NW, key=lambda x:weights_NW[x], reverse=True)[:10]
for word in top_features_NW:
    print(str(word) + '\t\tNW\t\t' + str(weights_NW[word]))

print('\n')
print('################################')
print('\n')

top_features_SE = sorted(weights_SE, key=lambda x:weights_SE[x], reverse=True)[:10]
for word in top_features_SE:
    print(str(word) + '\t\tSE\t\t' + str(weights_SE[word]))

print('\n')
print('################################')
print('\n')

top_features_SW = sorted(weights_SW, key=lambda x:weights_SW[x], reverse=True)[:10]
for word in top_features_SW:
    print(str(word) + '\t\tSW\t\t' + str(weights_SW[word]))

Top Scoring Logistic Regression Weights

#toronto		NE		2.90294848331
il		NE		2.79604968164
#nyc		NE		2.74393936549
@ashcrofttom		NE		2.73272379919
@benpage11benp		NE		2.67721946977
nj		NE		2.67596764088
@thsbluedevils		NE		2.65420123949
#isles		NE		2.61687329474
#wvprepfb		NE		2.61157786911
mn		NE		2.46361035804


################################


#opreps		NW		3.40653149571
#sanfrancisco		NW		3.28263404866
#copreps		NW		3.2588894379
#portland		NW		3.13328463373
#seattle		NW		3.10631816
https://t.co/gxloesa9oo		NW		2.87680990938
#norcalscores		NW		2.80602466871
@cloacamaxima01		NW		2.64797156408
#mtscores		NW		2.63647972689
#goducks		NW		2.58045337141


################################


@karn33333		SE		3.07310910782
@negrosubversive		SE		2.90852836141
#tcprepzone		SE		2.73283463734
jacksonville		SE		2.66601170378
#ghc16		SE		2.62148805616
#cubevenue		SE		2.57452245442
tx		SE		2.54129372599
#wstc		SE		2.50645560027
@__kissesileft		SE		2.48434002082
#florida		SE		2.48276009965


#######

Find more in Chapter 1 and 2 of the book below
https://github.com/jacobeisenstein/gt-nlp-class/blob/master/notes/eisenstein-nlp-notes.pdf