# Viral tweets

Using a K-Nearest Neighbor algorithm to predict if a tweet will go viral.

In [96]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [97]:
# Load the tweets; lines=True makes pandas parse the file as one JSON object per line.
all_tweets = pd.read_json('random_tweets.json', lines=True)

In [98]:
len(all_tweets)

11099

In [99]:
all_tweets.columns

Index(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities',
       'metadata', 'source', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo',
       'coordinates', 'place', 'contributors', 'retweeted_status',
       'is_quote_status', 'retweet_count', 'favorite_count', 'favorited',
       'retweeted', 'lang', 'possibly_sensitive', 'quoted_status_id',
       'quoted_status_id_str', 'extended_entities', 'quoted_status',
       'withheld_in_countries'],
      dtype='object')

In [100]:
# Peek at the nested 'user' record of the tweet labelled 0.
all_tweets.loc[0, 'user']

{'id': 145388018,
 'id_str': '145388018',
 'name': 'Derek Wolkenhauer',
 'screen_name': 'derekw221',
 'location': 'Waterloo, Iowa',
 'description': '',
 'url': None,
 'entities': {'description': {'urls': []}},
 'protected': False,
 'followers_count': 215,
 'friends_count': 335,
 'listed_count': 2,
 'created_at': 'Tue May 18 21:30:10 +0000 2010',
 'favourites_count': 3419,
 'utc_offset': None,
 'time_zone': None,
 'geo_enabled': True,
 'verified': False,
 'statuses_count': 4475,
 'lang': 'en',
 'contributors_enabled': False,
 'is_translator': False,
 'is_translation_enabled': False,
 'profile_background_color': '022330',
 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme15/bg.png',
 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme15/bg.png',
 'profile_background_tile': False,
 'profile_image_url': 'http://pbs.twimg.com/profile_images/995790590276243456/cgxRVviN_normal.jpg',
 'profile_image_url_https': 'https://pbs.twimg.com/profile

### How to define viral tweets

A K-Nearest Neighbors classifier is a supervised algorithm, so we need a dataset with labels. The tweets have no 'viral' label, so we need to analyse the data and find the feature(s) most likely to be linked to viral status, then derive the label from them. Here we look at the retweet count and tag a tweet as viral when its count is above the median.

In [101]:
all_tweets['retweet_count'].describe()

count     11099.000000
mean       2777.956392
std       12180.169923
min           0.000000
25%           0.000000
50%          13.000000
75%         428.500000
max      413719.000000
Name: retweet_count, dtype: float64

In [102]:
all_tweets['retweet_count'].median()

13.0

In [103]:
# Label a tweet as viral (1) when its retweet count is strictly above the median, otherwise 0.
all_tweets['viral_tweet'] = all_tweets['retweet_count'].gt(all_tweets['retweet_count'].median()).astype(int)

In [104]:
all_tweets['viral_tweet'].value_counts()

0    5562
1    5537
Name: viral_tweet, dtype: int64

In [105]:
all_tweets[['text', 'retweet_count', 'viral_tweet']].head(10)

Unnamed: 0,text,retweet_count,viral_tweet
0,RT @KWWLStormTrack7: We are more than a month ...,3,0
1,@hail_ee23 Thanks love its just the feeling of...,0,0
2,RT @TransMediaWatch: Pink News has more on the...,5,0
3,RT @realDonaldTrump: One of the reasons we nee...,11106,1
4,RT @First5App: This hearing of His Word doesn’...,6,0
5,RT @attackerman: This is torture: “The staff t...,195,1
6,Did a demo of our Mobile Prototyping Kit at UX...,0,0
7,RT @itstae13: Stop getting rid of your pets be...,162420,1
8,RT @RealErinCruz: Someone sent me a thought pr...,446,1
9,RT @penis_hernandez: when you ask how a white ...,14411,1


## Making features

We cannot use the number of retweets to predict whether a tweet will go viral, because it is what defines the label. We need to find other features. After analysing the data, there are a few options we could use:
- the followers_count of the tweet's author
- the friends_count of the tweet's author
- the length of the tweet's text

In [106]:
# Pull the follower count out of each tweet's nested 'user' dict.
# Mapping over the single 'user' column avoids building a full row Series per tweet (as apply(axis=1) does).
all_tweets['followers_count'] = all_tweets['user'].map(lambda user: user['followers_count'])

In [107]:
all_tweets['followers_count']

0         215
1         199
2         196
3        3313
4         125
         ... 
11094     509
11095     462
11096     135
11097      59
11098    1563
Name: followers_count, Length: 11099, dtype: int64

In [108]:
# Pull the friends count out of each tweet's nested 'user' dict.
# Mapping over the single 'user' column avoids building a full row Series per tweet (as apply(axis=1) does).
all_tweets['friends_count'] = all_tweets['user'].map(lambda user: user['friends_count'])

In [109]:
all_tweets['friends_count']

0         335
1         203
2         558
3        2272
4         273
         ... 
11094    1323
11095    1033
11096      90
11097     320
11098    1697
Name: friends_count, Length: 11099, dtype: int64

In [110]:
# Character length of each tweet's text, computed with the vectorised string accessor
# instead of a row-by-row apply.
all_tweets['tweet_length'] = all_tweets['text'].str.len()

In [111]:
all_tweets['tweet_length']

0        140
1         77
2        140
3        140
4        140
        ... 
11094    140
11095     75
11096    140
11097    140
11098     75
Name: tweet_length, Length: 11099, dtype: int64

In [112]:
# Feature matrix: the three numeric columns used as inputs to the classifier.
data = all_tweets[['tweet_length', 'followers_count', 'friends_count']]
data

Unnamed: 0,tweet_length,followers_count,friends_count
0,140,215,335
1,77,199,203
2,140,196,558
3,140,3313,2272
4,140,125,273
...,...,...,...
11094,140,509,1323
11095,75,462,1033
11096,140,135,90
11097,140,59,320


In [113]:
# Target vector: the 0/1 viral flag derived from the retweet count.
labels = all_tweets['viral_tweet']

## Normalizing the data

In [114]:
# Standardize each column (axis=0) to zero mean and unit variance so that no single
# feature dominates the distance computation used by K-Nearest Neighbors.
# NOTE(review): the scaling statistics are computed on the full dataset, before the
# train/test split — test rows therefore influence the scaled training data; confirm this is acceptable.
scaled_data = scale(data, axis=0)

In [115]:
scaled_data

array([[ 0.6164054 , -0.02878298, -0.14483305],
       [-1.64577622, -0.02886246, -0.16209787],
       [ 0.6164054 , -0.02887736, -0.11566596],
       ...,
       [ 0.6164054 , -0.02918038, -0.1768776 ],
       [ 0.6164054 , -0.02955792, -0.14679496],
       [-1.71759151, -0.02208668,  0.0333085 ]])

## Splitting the data

In [116]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
train_data, test_data, train_labels, test_labels = train_test_split(
    scaled_data,
    labels,
    test_size=0.2,
    random_state=1,
)

## Using the classifier

In [117]:
# Baseline model: k = 5 neighbors, before searching for a better k.
classifier = KNeighborsClassifier(n_neighbors=5)

In [118]:
classifier.fit(train_data, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [119]:
classifier.score(test_data, test_labels)

0.5882882882882883

## Choosing K

In [120]:
def listScores(numNeighbors):
    """Return the test-split scores of KNN classifiers for k = 1 .. numNeighbors - 1.

    For every k a fresh ``KNeighborsClassifier`` is fitted on the notebook-level
    ``train_data`` / ``train_labels`` and scored on ``test_data`` / ``test_labels``.

    Parameters
    ----------
    numNeighbors : int
        Exclusive upper bound for k (the largest k evaluated is ``numNeighbors - 1``).

    Returns
    -------
    list
        One ``classifier.score`` result per k; element ``i`` corresponds to ``k = i + 1``.
    """
    def score_for(k):
        # Fit and evaluate a single model with k neighbors.
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(train_data, train_labels)
        return model.score(test_data, test_labels)

    return [score_for(k) for k in range(1, numNeighbors)]

In [121]:
# Score every k from 1 to 499 (the upper bound passed to listScores is exclusive).
# NOTE(review): listScores evaluates on the test split, so the "best" k found here is tuned on test data.
scores = listScores(500)

In [122]:
# Build a series that is NaN everywhere except at the k value(s) reaching the best
# score, so the maximum can be overlaid as markers on the score curve.
scores_array = np.asarray(scores)
best_score = scores_array.max()
# Sized from `scores` (not a hard-coded 500) so it lines up one-to-one with the plotted k values.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
maxScore = np.full(scores_array.shape, np.nan)
maxScore[scores_array == best_score] = best_score

In [123]:
# Plot the score for every k and highlight the best-scoring k value(s).
# listScores starts at k = 1, so the i-th score belongs to k = i + 1.
k_values = np.arange(1, len(scores) + 1)

fig = go.Figure()

fig.add_trace(go.Scatter(x=k_values, y=scores, name='Scores'))
fig.add_trace(go.Scatter(x=k_values, y=maxScore, name='Max Score', mode='markers', marker_size=10))
fig.update_layout(
    title='Scores',
    xaxis_title='Number of neighbors (k)',
    yaxis_title='Score on the test split',
)
fig.show()

## Using the classifier with the best K value

In [124]:
# NOTE(review): k = 41 is hard-coded — presumably the best k read off the plot above; verify against `scores`.
classifier = KNeighborsClassifier(n_neighbors=41)

In [125]:
classifier.fit(train_data, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=41, p=2,
                     weights='uniform')

In [126]:
classifier.score(test_data, test_labels)

0.6225225225225225