In [1]:
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Preprocessing

In [2]:
%%time
db = MongoClient('mongodb://143.215.138.132:27017')['big_data']

matchNE = {'$match': {'lat': {'$gte': 36, '$lte': 50}, 'lon': {'$gte': -99, '$lte': -69}}}
matchSE = {'$match': {'lat': {'$gte': 25, '$lte': 36}, 'lon': {'$gte': -99, '$lte': -69}}}
matchNW = {'$match': {'lat': {'$gte': 36, '$lte': 50}, 'lon': {'$gte': -125, '$lte': -99}}}
matchSW = {'$match': {'lat': {'$gte': 25, '$lte': 36}, 'lon': {'$gte': -125, '$lte': -99}}}

sentence_list = []
location_list = []

limit = {'$limit': 10000}

pipeline = [matchNE, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('NE')

pipeline = [matchSE, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('SE')

pipeline = [matchNW, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('NW')

pipeline = [matchSW, limit]

for tweet in db.tweet.aggregate(pipeline):
    sentence_list.append(tweet['text'])
    location_list.append('SW')
    
# Add your own sentence here
your_sentence = "It's snowing in New York!"
sentence_list.append(your_sentence)

CPU times: user 1.35 s, sys: 1.08 s, total: 2.42 s
Wall time: 5.78 s


In [None]:
class Tweet_Tokenizer(object):
    def __init__(self):
        self.wnl = TweetTokenizer()
    def __call__(self, doc):
        return self.wnl.tokenize(doc)

### Possible extra features
1. Use hashtags and mentions instead of the whole sentence
2. Try Bi-gram or tri-gram
3. Measure the formality of language

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import dok_matrix, hstack
from nltk.tokenize import TweetTokenizer
import numpy as np

def make_features(corpus):
#     vectorizer = CountVectorizer(tokenizer=Tweet_Tokenizer(), analyzer='word', min_df=0)
#     return vectorizer.fit_transform(corpus), vectorizer.get_feature_names()

    # Basic BOW
    vectorizer = CountVectorizer(tokenizer=Tweet_Tokenizer(), analyzer='word', min_df=2)
    X_BOW = vectorizer.fit_transform(corpus)

    # Additional Features (need to add more)
    NUM_OF_FEATS = 1
    X_ADD = dok_matrix((len(corpus), NUM_OF_FEATS))
    tt = Tweet_Tokenizer()

    # Find Length Percentage
    num_token_list = np.array([len(tt.__call__(text)) for text in corpus])
    num_token_list = np.argsort(num_token_list)
    length_percentage_dict = {num_token_list[i]: i * 1.0 / len(num_token_list) for i in range(len(num_token_list))}

    for i in range(len(corpus)):
        X_ADD[i, 0] = length_percentage_dict[i]

    # Concatenate
    X = hstack([X_BOW, X_ADD])

    return X, vectorizer.get_feature_names() + ['<length>']

In [9]:
%%time

sentence_vector_list, vector_name_list = make_features(sentence_list)

print(type(sentence_vector_list))

# coo_matrix doesn't directly support slicing, so change to csr_matrix first
sentence_vector_list = sentence_vector_list.tocsr()

print(type(sentence_vector_list))

your_sentence_vector = sentence_vector_list[-1]
sentence_vector_list = sentence_vector_list[:-1]

<class 'scipy.sparse.coo.coo_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
CPU times: user 4.62 s, sys: 35.4 ms, total: 4.65 s
Wall time: 4.66 s


# Training

In [10]:
training_vectors, test_vectors, training_locations, test_locations =\
    train_test_split(sentence_vector_list, location_list, test_size=0.1, random_state=999)

In [11]:
lr_clf = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=5000)

In [12]:
%%time
lr_clf.fit(training_vectors, training_locations)

CPU times: user 20.9 s, sys: 33 ms, total: 20.9 s
Wall time: 20.9 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

# Results

In [13]:
%%time
predicted_test_locations = lr_clf.predict(test_vectors)
predicted_your_sentence_location = lr_clf.predict(your_sentence_vector)

CPU times: user 1.65 ms, sys: 1.73 ms, total: 3.38 ms
Wall time: 3.67 ms


In [14]:
print("Test set accuracy: "\
      + str(accuracy_score(test_locations, predicted_test_locations)))

Test set accuracy: 0.407


In [15]:
print("Result of your sentence: " + str(predicted_your_sentence_location))

Result of your sentence: ['NE']


# Analysis

In [16]:
# Find Top Weights in Logistic Regression Classifier
theta_NE = lr_clf.coef_[0]
theta_NW = lr_clf.coef_[1]
theta_SE = lr_clf.coef_[2]
theta_SW = lr_clf.coef_[3]
weights_NE = dict()
weights_NW = dict()
weights_SE = dict()
weights_SW = dict()

for feature, weight in zip(vector_name_list, theta_NE):
    weights_NE[feature] = weight
for feature, weight in zip(vector_name_list, theta_NW):
    weights_NW[feature] = weight
for feature, weight in zip(vector_name_list, theta_SE):
    weights_SE[feature] = weight
for feature, weight in zip(vector_name_list, theta_SW):
    weights_SW[feature] = weight

In [17]:
print("Top Scoring Logistic Regression Weights\n")
top_features_NE = sorted(weights_NE, key=lambda x:weights_NE[x], reverse=True)[:10]
for word in top_features_NE:
    print(str(word) + '\t\tNE\t\t' + str(weights_NE[word]))

print('\n')
print('################################')
print('\n')

top_features_NW = sorted(weights_NW, key=lambda x:weights_NW[x], reverse=True)[:10]
for word in top_features_NW:
    print(str(word) + '\t\tNW\t\t' + str(weights_NW[word]))

print('\n')
print('################################')
print('\n')

top_features_SE = sorted(weights_SE, key=lambda x:weights_SE[x], reverse=True)[:10]
for word in top_features_SE:
    print(str(word) + '\t\tSE\t\t' + str(weights_SE[word]))

print('\n')
print('################################')
print('\n')

top_features_SW = sorted(weights_SW, key=lambda x:weights_SW[x], reverse=True)[:10]
for word in top_features_SW:
    print(str(word) + '\t\tSW\t\t' + str(weights_SW[word]))

Top Scoring Logistic Regression Weights

#nyc		NE		1.85779817614
#specialreport		NE		1.85539005115
ebay		NE		1.76209599626
ny		NE		1.74850212026
ohio		NE		1.72863709174
@bottomoso		NE		1.70721632819
#ictfl16		NE		1.69661452756
nj		NE		1.68112004575
toronto		NE		1.66831803083
@steff__r		NE		1.62920485587


################################


@cloacamaxima01		NW		2.31485372902
portland		NW		2.0079116999
@russia		NW		1.89330674312
seattle		NW		1.85715166493
utah		NW		1.71289058186
colorado		NW		1.69212637619
nv		NW		1.68236195248
bay		NW		1.66897650202
francisco		NW		1.66698843905
stanford		NW		1.65136205396


################################


@gibson326		SE		2.24946795864
fl		SE		2.21889621671
@taylorswift13		SE		2.11677551163
alabama		SE		2.10656323013
@negrosubversive		SE		1.91848620291
@mycfe		SE		1.76701420979
orleans		SE		1.74811972502
#ghc16		SE		1.74038570243
tx		SE		1.6961710089
butter		SE		1.62572447488


################################


az		SW		2.13486938964
@im_beyondgreat		

Find more in Chapter 1 and 2 of the book below
https://github.com/jacobeisenstein/gt-nlp-class/blob/master/notes/eisenstein-nlp-notes.pdf