# Classifying tweets to predict location

In this notebook, we train a Naive Bayes (NB) classifier to predict whether a tweet was posted from New York, London, or Paris, using only the words in the tweet's text.

In [4]:
import pandas as pd
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [9]:
# Load one DataFrame per city. `lines=True` tells pandas each file holds one
# JSON record per line rather than a single JSON document.
new_york_tweets, london_tweets, paris_tweets = (
    pd.read_json(json_path, lines=True)
    for json_path in ('new_york.json', 'london.json', 'paris.json')
)

In [6]:
# Number of New York tweets (rows) that were loaded.
len(new_york_tweets)

4723

In [7]:
# List the fields available on each tweet record; only 'text' is used as a
# model feature later in this notebook.
new_york_tweets.columns

Index(['created_at', 'id', 'id_str', 'text', 'display_text_range', 'source',
       'truncated', 'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place',
       'contributors', 'is_quote_status', 'quote_count', 'reply_count',
       'retweet_count', 'favorite_count', 'entities', 'favorited', 'retweeted',
       'filter_level', 'lang', 'timestamp_ms', 'extended_tweet',
       'possibly_sensitive', 'quoted_status_id', 'quoted_status_id_str',
       'quoted_status', 'quoted_status_permalink', 'extended_entities',
       'withheld_in_countries'],
      dtype='object')

In [8]:
# Peek at the text of the first New York tweet. A single `.loc[row, column]`
# lookup avoids chained indexing, which would first build the whole row as a
# Series just to read one field from it.
new_york_tweets.loc[0, 'text']

'@DelgadoforNY19 Calendar marked.'

In [15]:
# Inspect the location metadata attached to the first New York tweet. A single
# `.loc[row, column]` lookup avoids chained indexing, which would first build
# the whole row as a Series just to read one field from it.
new_york_tweets.loc[0, 'place']

{'id': '01a9a39529b27f36',
 'url': 'https://api.twitter.com/1.1/geo/id/01a9a39529b27f36.json',
 'place_type': 'city',
 'name': 'Manhattan',
 'full_name': 'Manhattan, NY',
 'country_code': 'US',
 'country': 'United States',
 'bounding_box': {'type': 'Polygon',
  'coordinates': [[[-74.026675, 40.683935],
    [-74.026675, 40.877483],
    [-73.910408, 40.877483],
    [-73.910408, 40.683935]]]},
 'attributes': {}}

## Classifying using language

In [16]:
# Pool the tweet texts of all three cities into one flat list, in the order
# New York, London, Paris.
all_tweets_text = [
    text
    for city_tweets in (new_york_tweets, london_tweets, paris_tweets)
    for text in city_tweets['text'].tolist()
]
# Matching integer class labels, one per tweet:
# 0 = New York, 1 = London, 2 = Paris.
labels = [
    city_label
    for city_label, city_tweets in enumerate((new_york_tweets, london_tweets, paris_tweets))
    for _ in range(len(city_tweets))
]

In [11]:
# Total number of tweets across the three cities.
len(all_tweets_text)

12574

## Splitting up the data

In [19]:
# Hold out 20% of the tweets for evaluation. The fixed random_state makes the
# shuffle, and therefore every number reported below, reproducible.
train_data, test_data, train_labels, test_labels = train_test_split(
    all_tweets_text,
    labels,
    test_size=0.2,
    random_state=1,
)

## Making count vectors

In [20]:
# Bag-of-words vectorizer with default settings.
counter = CountVectorizer()

In [21]:
# Learn the vocabulary from the training tweets only, so the held-out test
# tweets play no part in building the feature space.
counter.fit(train_data)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [23]:
# Turn each training tweet into a vector of token counts over the fitted vocabulary.
train_counts = counter.transform(train_data)

In [24]:
# Encode the test tweets with the same vectorizer that was fitted on the training split.
test_counts = counter.transform(test_data)

In [25]:
# Look at one raw training tweet...
train_data[3]

'saying bye is hard. Especially when youre saying bye to comfort.'

In [27]:
# ...and at the non-zero entries of its count vector. Each printed line reads
# (row, vocabulary index) followed by how often that token occurs in the tweet.
print(train_counts[3])

  (0, 5022)	2
  (0, 6371)	1
  (0, 9552)	1
  (0, 12314)	1
  (0, 13903)	1
  (0, 23994)	2
  (0, 27146)	1
  (0, 29397)	1
  (0, 30274)	1


## Training and testing the NB Classifier

In [28]:
# Multinomial Naive Bayes classifier with default hyperparameters.
classifier = MultinomialNB()

In [30]:
# Train on the count vectors of the training tweets and their city labels.
classifier.fit(train_counts, train_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [33]:
# Predict a city label (0, 1 or 2) for every held-out tweet.
predictions = classifier.predict(test_counts)

## Model evaluation

In [34]:
# Fraction of held-out tweets whose predicted city matches the true label.
accuracy_score(test_labels, predictions)

0.6779324055666004

In [41]:
# Cross-tabulate true against predicted city labels
# (0 = New York, 1 = London, 2 = Paris). With scikit-learn's
# (y_true, y_pred) argument order, rows correspond to the true label and
# columns to the predicted label, so the diagonal holds the correct predictions.
confusionMatrix = confusion_matrix(test_labels, predictions)
confusionMatrix

array([[541, 404,  28],
       [203, 824,  34],
       [ 38, 103, 340]])

## Testing tweets

In [45]:
# Hand-written example tweets (French, American and British flavoured) used
# to sanity-check the trained classifier.
tweet_fr, tweet_us, tweet_uk = (
    'ceci est un tweet francais',
    'coming from new york, this is an american tweet',
    'You twat! Do you really think we need that with our tea?',
)

In [47]:
# Encode each example tweet with the fitted vectorizer. `transform` expects a
# collection of documents, hence the one-element list around each tweet.
tweet_fr_counts, tweet_us_counts, tweet_uk_counts = (
    counter.transform([sample_tweet])
    for sample_tweet in (tweet_fr, tweet_us, tweet_uk)
)

In [49]:
# Print the predicted label for each example tweet, in the order
# French, American, British (0 = New York, 1 = London, 2 = Paris).
for sample_counts in (tweet_fr_counts, tweet_us_counts, tweet_uk_counts):
    print(classifier.predict(sample_counts))

[2]
[0]
[1]
