In [1]:
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Preprocessing

In [2]:
%%time
db = MongoClient('mongodb://143.215.138.132:27017')['big_data']

matchNE = {'$match': {'lat': {'$gte': 36, '$lte': 50}, 'lon': {'$gte': -99, '$lte': -69}}}
matchSE = {'$match': {'lat': {'$gte': 25, '$lte': 36}, 'lon': {'$gte': -99, '$lte': -69}}}
matchNW = {'$match': {'lat': {'$gte': 36, '$lte': 50}, 'lon': {'$gte': -125, '$lte': -99}}}
matchSW = {'$match': {'lat': {'$gte': 25, '$lte': 36}, 'lon': {'$gte': -125, '$lte': -99}}}

sentence_list = []
location_list = []

limit = {'$limit': 10000}

pipeline = [matchNE, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('NE')

pipeline = [matchSE, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('SE')

pipeline = [matchNW, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('NW')

pipeline = [matchSW, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('SW')

CPU times: user 1.22 s, sys: 962 ms, total: 2.18 s
Wall time: 26.9 s


In [3]:
# Add your own sentence here
your_sentence = "It's snowing in New York!"
sentence_list.append(your_sentence)

In [4]:
class Tweet_Tokenizer(object):
    def __init__(self):
        self.wnl = TweetTokenizer()
    def __call__(self, doc):
        return self.wnl.tokenize(doc)

from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import dok_matrix, hstack
from nltk.tokenize import TweetTokenizer
import numpy as np    

def make_features(corpus):
#     vectorizer = CountVectorizer(tokenizer=Tweet_Tokenizer(), analyzer='word', min_df=0)
#     return vectorizer.fit_transform(corpus), vectorizer.get_feature_names()

    # Basic BOW
    vectorizer = CountVectorizer(tokenizer=Tweet_Tokenizer(), analyzer='word', min_df=2)
    X_BOW = vectorizer.fit_transform(corpus)

    # Additional Features (need to add more)
    NUM_OF_FEATS = 1
    X_ADD = dok_matrix((len(corpus), NUM_OF_FEATS))
    tt = Tweet_Tokenizer()

    # Find Length Percentage
    num_token_list = np.array([len(tt.__call__(text)) for text in corpus])
    num_token_list = np.argsort(num_token_list)
    length_percentage_dict = {num_token_list[i]: i * 1.0 / len(num_token_list) for i in range(len(num_token_list))}

    for i in range(len(corpus)):
        X_ADD[i, 0] = length_percentage_dict[i]

    # Concatenate
    X = hstack([X_BOW, X_ADD])

    return X, vectorizer.get_feature_names() + ['<length>']

In [5]:
%%time
sentence_vector_list, vector_name_list = make_features(sentence_list)
your_sentence_vector = sentence_vector_list[-1]
sentence_vector_list = sentence_vector_list[:-1]

TypeError: 'coo_matrix' object does not support indexing

# Training

In [38]:
training_vectors, test_vectors, training_locations, test_locations =\
    train_test_split(sentence_vector_list, location_list, test_size=0.1, random_state=999)

In [39]:
lr_clf = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=5000)

In [40]:
%%time
lr_clf.fit(training_vectors, training_locations)

CPU times: user 41.6 s, sys: 75.9 ms, total: 41.7 s
Wall time: 41.7 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

# Results

In [41]:
%%time
predicted_test_locations = lr_clf.predict(test_vectors)
predicted_your_sentence_location = lr_clf.predict(your_sentence_vector)

CPU times: user 1.76 ms, sys: 734 µs, total: 2.5 ms
Wall time: 1.33 ms


In [42]:
print("Test set accuracy: "\
      + str(accuracy_score(test_locations, predicted_test_locations)))

Test set accuracy: 0.4035


In [43]:
print("Result of your sentence: " + str(predicted_your_sentence_location))

Result of your sentence: ['NW']


# Analysis

In [44]:
# Find Top Weights in Logistic Regression Classifier
theta_NE = lr_clf.coef_[0]
theta_NW = lr_clf.coef_[1]
theta_SE = lr_clf.coef_[2]
theta_SW = lr_clf.coef_[3]
weights_NE = dict()
weights_NW = dict()
weights_SE = dict()
weights_SW = dict()

for feature, weight in zip(vector_name_list, theta_NE):
    weights_NE[feature] = weight
for feature, weight in zip(vector_name_list, theta_NW):
    weights_NW[feature] = weight
for feature, weight in zip(vector_name_list, theta_SE):
    weights_SE[feature] = weight
for feature, weight in zip(vector_name_list, theta_SW):
    weights_SW[feature] = weight

In [45]:
print("Top Scoring Logistic Regression Weights\n")
top_features_NE = sorted(weights_NE, key=lambda x:weights_NE[x], reverse=True)[:10]
for word in top_features_NE:
    print(str(word) + '\t\tNE\t\t' + str(weights_NE[word]))

print('\n')
print('################################')
print('\n')

top_features_NW = sorted(weights_NW, key=lambda x:weights_NW[x], reverse=True)[:10]
for word in top_features_NW:
    print(str(word) + '\t\tNW\t\t' + str(weights_NW[word]))

print('\n')
print('################################')
print('\n')

top_features_SE = sorted(weights_SE, key=lambda x:weights_SE[x], reverse=True)[:10]
for word in top_features_SE:
    print(str(word) + '\t\tSE\t\t' + str(weights_SE[word]))

print('\n')
print('################################')
print('\n')

top_features_SW = sorted(weights_SW, key=lambda x:weights_SW[x], reverse=True)[:10]
for word in top_features_SW:
    print(str(word) + '\t\tSW\t\t' + str(weights_SW[word]))

Top Scoring Logistic Regression Weights

ny		NE		1.71115443298
#nyc		NE		1.63361423885
#specialreport		NE		1.61197706748
toronto		NE		1.59365915398
ebay		NE		1.55748699816
ohio		NE		1.52965782532
@bottomoso		NE		1.51828566167
@steff__r		NE		1.50676301868
detroit		NE		1.48532423636
@nicoleee1_		NE		1.46714299836


################################


@cloacamaxima01		NW		2.22805003937
portland		NW		1.95345939666
@russia		NW		1.84209398054
seattle		NW		1.8303613595
bay		NW		1.72858493205
colorado		NW		1.64933731643
francisco		NW		1.61359340601
#job		NW		1.60769815923
wa		NW		1.59610434078
utah		NW		1.53873501498


################################


@gibson326		SE		2.09738893804
fl		SE		2.08730570408
alabama		SE		2.01769496561
@taylorswift13		SE		1.91531519041
@negrosubversive		SE		1.82964847525
@mycfe		SE		1.70256018502
tx		SE		1.59764313345
orleans		SE		1.57303915224
#ghc16		SE		1.55480352796
butter		SE		1.41978836958


################################


az		SW		2.14579573694
ca		SW		2.08

Find more in Chapter 1 and 2 of the book below
https://github.com/jacobeisenstein/gt-nlp-class/blob/master/notes/eisenstein-nlp-notes.pdf