In [1]:
import os
import re

from kinnara.gather.twitter import TwitterApiWrapper

In [2]:
import logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO)
logger = logging.getLogger(__name__)

#### read in api keys and access tokens

In [3]:
api_key = os.getenv('API_KEY')
api_secret = os.getenv('API_SECRET')

access_token = os.getenv('ACCESS_TOKEN')
access_token_secret = os.getenv('ACCESS_TOKEN_SECRET')

## Gather public figure tweets

In [4]:
# create kinnara gatherer
twitter_wrapper = TwitterApiWrapper(api_key=api_key, api_secret=api_secret,
                        access_token=access_token, access_token_secret=access_token_secret)

In [5]:
screen_names = [
    'BarackObama',
    'realDonaldTrump',
    'KimKardashian',
    'BillGates',
    'Oprah',
    'justinbieber',
    'Beyonce',
    'TheRock'
]

In [6]:
users = []
for screen_name in screen_names:
    users.append(twitter_wrapper.get_user(screen_name))

2018-08-24 22:17:46,695 - INFO - Starting new HTTPS connection (1): api.twitter.com
2018-08-24 22:17:46,868 - INFO - Starting new HTTPS connection (1): api.twitter.com


BarackObama
realDonaldTrump


2018-08-24 22:17:47,039 - INFO - Starting new HTTPS connection (1): api.twitter.com
2018-08-24 22:17:47,192 - INFO - Starting new HTTPS connection (1): api.twitter.com


KimKardashian
BillGates


2018-08-24 22:17:47,374 - INFO - Starting new HTTPS connection (1): api.twitter.com
2018-08-24 22:17:47,515 - INFO - Starting new HTTPS connection (1): api.twitter.com


Oprah
justinbieber


2018-08-24 22:17:47,666 - INFO - Starting new HTTPS connection (1): api.twitter.com
2018-08-24 22:17:47,811 - INFO - Starting new HTTPS connection (1): api.twitter.com


Beyonce
TheRock


In [8]:
# lets grab 400 tweets from each twitter users timeline
for user in users:
    user['tweets'] = twitter_wrapper.get_tweets(user['id_str'], max_tweets_returned=400)

2018-08-24 22:18:04,257 - INFO - getting tweets for 813286
2018-08-24 22:18:04,260 - INFO - Starting new HTTPS connection (1): api.twitter.com
2018-08-24 22:18:04,658 - INFO - getting tweets for 813286
2018-08-24 22:18:04,661 - INFO - Starting new HTTPS connection (1): api.twitter.com
2018-08-24 22:18:05,022 - INFO - getting tweets for 25073877
2018-08-24 22:18:05,024 - INFO - Starting new HTTPS connection (1): api.twitter.com
2018-08-24 22:18:05,525 - INFO - getting tweets for 25073877
2018-08-24 22:18:05,528 - INFO - Starting new HTTPS connection (1): api.twitter.com
2018-08-24 22:18:05,875 - INFO - getting tweets for 25365536
2018-08-24 22:18:05,878 - INFO - Starting new HTTPS connection (1): api.twitter.com
2018-08-24 22:18:06,322 - INFO - getting tweets for 25365536
2018-08-24 22:18:06,324 - INFO - Starting new HTTPS connection (1): api.twitter.com
2018-08-24 22:18:06,800 - INFO - getting tweets for 50393960
2018-08-24 22:18:06,803 - INFO - Starting new HTTPS connection (1): api.t

In [9]:
def clean_tweet(tweet_text):
    '''clean up tweet text'''
    return re.sub(r' ?https[^ ]*', r'', tweet_text)

In [10]:
screen_name_to_tweets = {}
for user in users:
    screen_name_to_tweets[user['screen_name']] = [clean_tweet(tweet.get('full_text', ''))
                                                  for tweet in user['tweets']
                                                  if re.match(r'^RT @', tweet['full_text']) is None]

In [12]:
# save it for later
f = open('/home/ubuntu/data/twitter/celebrity_tweet_data.csv', 'w')

for screen_name, tweets in screen_name_to_tweets.items():
    for tweet in tweets:
        f.write(f'{screen_name}, "{tweet}"\n')
f.close()

In [92]:
screen_names = []
tweets = []

for k, ts in screen_name_to_tweets.items():
    for tweet in ts:
        screen_names.append(k)
        tweets.append(tweet)

celebrity_df = pd.DataFrame.from_dict({'screen_names': screen_names, 'tweets': tweets})


In [93]:
# save it for later
celebrity_df.to_csv('/home/ubuntu/data/twitter/celebrity_tweets.csv', header=False, index=False, encoding='utf-8')

In [94]:
# also save one with 0s instead of screen names, will make sense later
zs = np.zeros(len(screen_names), dtype=int)
zs[-1] = 1
celebrity_df_ints = pd.DataFrame.from_dict({'screen_names': zs, 'tweets': tweets})

celebrity_df_ints.to_csv('/home/ubuntu/data/twitter/celebrity_tweets_ints.csv', header=False, index=False, encoding='utf-8')

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Model

This portion of the notebook borrows extensively from fast.ai's notebook for lesson 10 in their deep learning course part two.

In [13]:
import collections
import html

import numpy as np
import pandas as pd

from fastai.text import *

  from numpy.core.umath_tests import inner1d


Define the directories we'll be using

You'll need to download tweet sentiment data from kaggle - https://www.kaggle.com/c/twitter-sentiment-analysis2/data - and put it in the twitter/orignal directory

In [14]:
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

# path to our original tweet data from kaggle
PATH = '/home/ubuntu/data/twitter/original'

In [15]:
# path to our classification data
CLASSIFICATION_PATH = '/home/ubuntu/data/twitter/twitter_classification/'

# path to our language model
LANGUAGE_MODEL_PATH = '/home/ubuntu/data/twitter/twitter_language_model/'

start prepping our tweet data from kaggle

In [16]:
CLASSES = ['neg', 'pos']

In [None]:
df = pd.read_csv(PATH + '/train.csv', encoding='latin1')

# shuffle rows and throw out item id column
df = df.drop('ItemID', axis=1)
df = df.sample(frac=1)

# rename columns
df.columns = ['labels', 'text']

# split datafram into train and validation sets for later
split_index = int(df.shape[0] * .9)
train_df, test_df = np.split(df, [split_index], axis=0)

saving our current progress

In [None]:
# save to classification directory
train_df.to_csv(CLASSIFICATION_PATH + '/train.csv', header=False, index=False, encoding='utf-8')
test_df.to_csv(CLASSIFICATION_PATH + '/test.csv', header=False, index=False, encoding='utf-8')

f = open(CLASSIFICATION_PATH + '/classes.txt', 'w', encoding='utf-8')
f.writelines(f'{c}\n' for c in CLASSES)
f.close()

# save to language model directory
# we should be adding in the test.csv from the kaggle competition here because the language model doesn't care about labels
# but in the interest of time I'm opting to just use the training set for the language model
train_df.to_csv(LANGUAGE_MODEL_PATH + '/train.csv', header=False, index=False, encoding='utf-8')
test_df.to_csv(LANGUAGE_MODEL_PATH + '/test.csv', header=False, index=False, encoding='utf-8')

lets start cleaning some text

In [34]:
# chunksize for pandas so it doesn't run into any memory limits
chunksize=24000

In [None]:
df.shape

In [75]:
## functions pulled from the fast.ai notebook for text tokenization

re1 = re.compile(r'  +')

def fixup(x):
    '''some patterns identified by the fast.ai folks that spacy doesn't account for'''
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

def get_texts(df, n_lbls=1):
    labels = df.iloc[:,range(n_lbls)].values.astype(np.int64)
    texts = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
    for i in range(n_lbls+1, len(df.columns)): texts += f' {FLD} {i-n_lbls} ' + df[i].astype(str)
    texts = list(texts.apply(fixup).values)

    tok = Tokenizer().proc_all_mp(partition_by_cores(texts))
    return tok, list(labels)

def get_all(df, n_lbls):
    '''tokenize the text'''
    tok, labels = [], []
    for i, r in enumerate(df):
        print(i, n_lbls)
        tok_, labels_ = get_texts(r, n_lbls)
        tok += tok_;
        labels += labels_
    return tok, labels

In [None]:
# grab our dataframes from earlier
train_df = pd.read_csv(LANGUAGE_MODEL_PATH + '/train.csv', header=None, chunksize=chunksize)
val_df = pd.read_csv(LANGUAGE_MODEL_PATH + '/test.csv', header=None, chunksize=chunksize)

In [None]:
# tokenize the tweets
train_tokens, train_labels = get_all(train_df, 1)
val_tokens, val_labels = get_all(val_df, 1)

In [None]:
#make temporary directory
os.mkdir(LANGUAGE_MODEL_PATH + '/tmp')

In [None]:
# save our tokens
np.save(LANGUAGE_MODEL_PATH + '/tmp/tok_trn.npy', train_tokens)
np.save(LANGUAGE_MODEL_PATH + '/tmp/tok_val.npy', val_tokens)

In [None]:
# load back in
train_tokens = np.load(LANGUAGE_MODEL_PATH + '/tmp/tok_trn.npy')
val_tokens = np.load(LANGUAGE_MODEL_PATH + '/tmp/tok_val.npy')

In [None]:
# lets take a look at our most common tokens
freq = Counter(token for tokens in train_tokens for token in tokens)
freq.most_common(25)

looks about right.  quick note to keep in mind - the "t_up" token isn't in the text its self, it is a marker indicating the following token is all uppercase.

fast.ai recommends that you only keep the 60,000 or so most common tokens. Reason being low frequency tokens don't help you learn a lot about a language.

We don't have that many tokens for our tweets, but it makes me feel good to put in anyways ...

In [None]:
max_vocab = 60000
min_freq = 2

int_to_token = [o for o, c in freq.most_common(max_vocab) if c > min_freq]
int_to_token.insert(0, '_pad_')
int_to_token.insert(0, '_unk_')

In [None]:
token_to_int = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(int_to_token)})
len(int_to_token)

In [None]:
train_lm = np.array([[token_to_int[o] for o in p] for p in train_tokens])
val_lm = np.array([[token_to_int[o] for o in p] for p in val_tokens])

In [None]:
# saving our progress
np.save(LANGUAGE_MODEL_PATH + '/tmp/trn_ids.npy', train_lm)
np.save(LANGUAGE_MODEL_PATH + '/tmp/val_ids.npy', val_lm)

pickle.dump(int_to_token, open(LANGUAGE_MODEL_PATH + '/tmp/itos.pkl', 'wb'))

In [None]:
# loading back in
train_lm = np.load(LANGUAGE_MODEL_PATH + '/tmp/trn_ids.npy')
val_lm = np.load(LANGUAGE_MODEL_PATH + '/tmp/val_ids.npy')
int_to_token = pickle.load(open(LANGUAGE_MODEL_PATH + '/tmp/itos.pkl', 'rb'))

In [None]:
num_twitter_tokens = len(int_to_token)
num_twitter_tokens, len(train_lm)

### load in a pretrained language model trained on wikipedia text

run this line to download wikipedia model

In [None]:
# ! wget -nH -r -np -P {PATH} http://files.fast.ai/models/wt103/

In [None]:
# some stats from the wikepedia model
embedding_size, num_hidden, num_layers = 400,1150,3

In [None]:
PRE_PATH = '/home/ubuntu/data/twitter/original/models/wt103/'
PRE_LM_PATH = PRE_PATH + '/fwd_wt103.h5'

In [None]:
# grab the weights from the encoder
weights = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

The mean of the weights from layer 0 can be used to assign weights to tokens that exist in the wikipedia dataset but not in the twitter dataset

In [None]:
encoder_weights = to_np(weights['0.encoder.weight'])
# row_m = enc_wgts.mean(0)
encoder_mean = encoder_weights.mean(0)

In [None]:
wiki_int_to_token = pickle.load(open(PRE_PATH + '/itos_wt103.pkl', 'rb'))
wiki_token_to_int = collections.defaultdict(lambda: -1, {v:k for k, v in enumerate(wiki_int_to_token)})

We need to assign mean weights to tokens that exist in our twitter dataset that dont in the wikipedia dataset the pretrained model was trained on.

In [None]:
new_weights = np.zeros((num_twitter_tokens, embedding_size), dtype=np.float32)
for i, w in enumerate(int_to_token):
    r = wiki_token_to_int[w]
    new_weights[i] = encoder_weights[r] if r >= 0 else encoder_mean

We now need to put the new weights into the pretrained model

The weights between the encoder and decoder also need to be tied together

In [None]:
weights['0.encoder.weight'] = T(new_weights)
weights['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_weights))
weights['1.decoder.weight'] = T(np.copy(new_weights))

### Retraining the wikipedia language model

In [None]:
wd=1e-7 # weight decay
bptt=70 # ngram size.  i.e. the model sees ~70 tokens and then tries to predict the 71st
bs=52 # batch size
opt_fn = partial(optim.Adam, betas=(0.8, 0.99)) # optimazation function

Here we define a special fastai data loader, the `LanguageModelLoader`, to feed the training data into the model whilst training.

We can then use those to instanciate a `LanguageModelData` class that returns a fastai model we can train

In [None]:
train_dl = LanguageModelLoader(np.concatenate(train_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)

md = LanguageModelData(PATH, 1, vs, train_dl, val_dl, bs=bs, bptt=bptt)

In [None]:
# the dropouts for each layer.
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

The last embedding layer needs to be tuned first so the new weights we set for the pretrained model get tuned properly.

fastai allows you to freeze and unfreeze model layers.  So here we freeze everything but the weights in the last embedding layer

In [None]:
learner = md.get_model(
    opt_fn, embedding_size, num_hidden, num_layers, dropouti=drops[0], dropout=drops[1],
    wdrop=drops[2], dropoute=drops[3], dropouth=drops[4]
)

learner.metrics = [accuracy]

# freeze everything except last layer
learner.freeze_to(-1)

In [None]:
# load the weights
learner.model.load_state_dict(weights)

In [None]:
lr = 1e-3 # learning rate
lrs = lr

In [None]:
learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=2)

In [None]:
# save our progress
learner.save('lm_last_ft')

In [None]:
# load back in
learner.load('lm_last_ft')

In [None]:
# now with our new embedding weights trained up, we can unfreeze and train all layers
learner.unfreeze()

In [None]:
# to find our learning rate
learner.lr_find(start_lr=lrs/10, end_lr=lrs*50, linear=True)

In [None]:
learner.sched.plot()

In [None]:
# looks like 10-2 or 10-3 or so could be a good learning rate for us

In [None]:
learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=1)

In [None]:
# save our progress
learner.save('lm1')
learner.save_encoder('lm1_enc')

In [None]:
# taking a look at our loss
learner.sched.plot_loss()

## Tweet Sentiment Classifier

Now that we have our language model trained on tweets, we can start training our tweet sentiment classifier

To do this all we have to do is tack on a layer to our trained encoder.

In [126]:
train_df = pd.read_csv(CLASSIFICATION_PATH + '/train.csv', header=None, chunksize=chunksize)
val_df = pd.read_csv(CLASSIFICATION_PATH + '/test.csv', header=None, chunksize=chunksize)

In [127]:
# do the same cleaning we did for the language model
train_tokens, train_labels = get_all(train_df, 1)
val_tokens, val_labels = get_all(val_df, 1)

0 1
1 1
2 1
3 1
0 1


In [128]:
# make temp directory in classifier directory
os.mkdir(CLASSIFICATION_PATH + '/tmp')

# save tokens
np.save(CLASSIFICATION_PATH + '/tmp/tok_trn.npy', train_tokens)
np.save(CLASSIFICATION_PATH + '/tmp/tok_val.npy', val_tokens)

np.save(CLASSIFICATION_PATH + '/tmp/trn_labels.npy', train_labels)
np.save(CLASSIFICATION_PATH + '/tmp/val_labels.npy', val_labels)

FileExistsError: [Errno 17] File exists: '/home/ubuntu/data/twitter/twitter_classification//tmp'

In [129]:
# load back in 
train_tokens = np.load(CLASSIFICATION_PATH + '/tmp/tok_trn.npy')
val_tokens = np.load(CLASSIFICATION_PATH + '/tmp/tok_val.npy')

In [130]:
int_to_token = pickle.load(open(LANGUAGE_MODEL_PATH + '/tmp/itos.pkl', 'rb'))
token_to_int = collections.defaultdict(lambda: 0, {v:k for k, v in enumerate(int_to_token)})
len(int_to_token)

15833

In [131]:
train_classification = np.array([[token_to_int[o] for o in p] for p in train_tokens])
val_classification = np.array([[token_to_int[o] for o in p] for p in val_tokens])

In [132]:
np.save(CLASSIFICATION_PATH + '/tmp/trn_ids.npy', train_classification)
np.save(CLASSIFICATION_PATH + '/tmp/val_ids.npy', val_classification)

In [133]:
# load back in 
train_classification = np.load(CLASSIFICATION_PATH + '/tmp/trn_ids.npy')
val_classification = np.load(CLASSIFICATION_PATH + '/tmp/val_ids.npy')

train_labels = np.squeeze(np.load(CLASSIFICATION_PATH + '/tmp/trn_labels.npy'))
val_labels = np.squeeze(np.load(CLASSIFICATION_PATH + '/tmp/val_labels.npy'))

In [134]:
# params
bptt, embedding_size, num_hidden, num_layers = 70, 400, 1150, 3
num_tokens = len(int_to_token)
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
bs = 48

In [135]:
train_classification[:5], train_labels[:5]

(array([list([3, 4, 5, 2, 0, 906, 9, 17, 107, 235, 562, 8, 82, 27, 1568, 11, 551, 50, 2180, 3293]),
        list([3, 4, 5, 2, 2239, 15, 143, 6, 15, 66, 15, 10, 9, 15, 10, 15, 48, 15, 299, 8, 8, 8, 289, 276]),
        list([3, 4, 5, 2, 3834, 145, 6285, 33, 0, 152, 124, 3382]),
        list([3, 4, 5, 2, 0, 0, 0, 100, 11, 5132, 7, 1080, 31, 51, 0, 25, 0, 7, 151, 54, 129, 2166, 130, 23, 11, 15, 0, 7]),
        list([3, 4, 5, 2, 0, 6, 39, 71, 457, 12, 198, 58, 261, 20, 61, 6, 73])], dtype=object),
 array([0, 1, 0, 1, 1]))

In [136]:
min_label = train_labels.min()
train_labels -= min_label
val_labels -= min_label
c = int(train_labels.max()) + 1

In [137]:
train_ds = TextDataset(train_classification, train_labels)
val_ds = TextDataset(val_classification, val_labels)

# the sortish sampler helps by sorting things kinda sorta by their token length so padding isn't crazy
train_sampler = SortishSampler(train_classification, key=lambda x: len(train_classification[x]), bs=bs//2)
# doesn't matter so much for the validation set
val_sampler = SortSampler(val_classification, key=lambda x: len(val_classification[x]))

# get data loaders
train_dl = DataLoader(train_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=train_sampler)
val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_sampler)

# model data
md = ModelData(PATH, train_dl, val_dl)

In [138]:
# part 1
dps = np.array([0.4, 0.5, 0.05, 0.3, 0.1])

In [None]:
# part 2
dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.5

In [139]:
m = get_rnn_classifier(bptt, 20*70, c, num_tokens, emb_sz=embedding_size, n_hid=num_hidden, n_layers=num_layers,
                       pad_token=1, layers=[embedding_size*3, 50, c], drops=[dps[4], 0.1], dropouti=dps[0],
                       wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])

In [140]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [141]:
learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)
learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learn.clip=25.
learn.metrics = [accuracy]

In [142]:
lr=3e-3
lrm = 2.6
lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr]) # differential learning rates

In [None]:
# lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])

load our encoder from our tweet language model

In [None]:
wd = 1e-7
wd = 0
learn.load_encoder('lm1_enc')

In [None]:
# freeze all except last layer
learn.freeze_to(-1)

In [None]:
# to find learning rate
learn.lr_find(lrs/1000)
learn.sched.plot()

In [None]:
# little tough to tell here, but we'll go with what we set previously and what fastai used for their imdb dataset

In [None]:
learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))

In [None]:
# save our first classifier
learn.save('clas_0')

In [None]:
# load it back in
learn.load('clas_0')

In [None]:
# unfreeze one more layer
learn.freeze_to(-2)

In [None]:
learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))

In [None]:
# save our second classifier
learn.save('clas_1')

In [None]:
# load it back in
learn.load('clas_1')

In [None]:
# unfreeze all layers so we're training the whole network
learn.unfreeze()

In [None]:
learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(32,10))

In [None]:
# plot out our loss
learn.sched.plot_loss()

In [None]:
# save our final classifier
learn.save('clas_2')

# Celebrity tweet sentiment

In [143]:
# load it back in
learn.load('clas_2')

In [None]:
log_predictions = learn.predict()

In [None]:
log_predictions[:10]

In [None]:
preds = np.argmax(log_predictions, axis=1)  # from log probabilities to 0 or 1
probs = np.exp(log_predictions[:,1])        # pr(dog)

In [None]:
preds[:10]

In [None]:
probs[:10]

In [None]:
md.val_y[:10]

make data loader for predicting

In [None]:
md.val_ds.y[:10]

In [None]:
md.val_ds.x

In [None]:
x = md.val_ds.x[:5]
y = md.val_ds.y[:5]

In [None]:
test_ds = TextDataset(x, y)

# doesn't matter so much for the validation set
# val_sampler = SortSampler(val_classification, key=lambda x: len(val_classification[x]))

# val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_sampler)
test_dl = DataLoader(test_ds, bs, transpose=True, num_workers=1, pad_idx=1)

In [None]:
log_preds = learn.predict_dl(test_dl)

In [None]:
log_preds

In [None]:
np.argmax(log_preds, axis=1)

In [120]:
# load our celebrity tweets
celebrity_df = pd.read_csv('/home/ubuntu/data/twitter/celebrity_tweets.csv', header=None)
celebrity_df_ints = pd.read_csv('/home/ubuntu/data/twitter/celebrity_tweets_ints.csv', header=None)

In [121]:
celebrity_tokens, _ = get_texts(celebrity_df_ints, 0)

In [122]:
int_to_token = pickle.load(open(LANGUAGE_MODEL_PATH + '/tmp/itos.pkl', 'rb'))
token_to_int = collections.defaultdict(lambda: 0, {v:k for k, v in enumerate(int_to_token)})
len(int_to_token)

15833

In [123]:
celebrity_classification = np.array([[token_to_int[o] for o in p] for p in celebrity_tokens])

In [144]:
celebrity_ds = TextDataset(celebrity_classification, np.zeros(len(celebrity_classification), dtype=int))

celebrity_dl = DataLoader(celebrity_ds, bs, transpose=True, num_workers=1, pad_idx=1)

In [146]:
log_preds = learn.predict_dl(celebrity_dl)

In [147]:
log_preds.shape

(2358, 2)

In [148]:
preds = np.argmax(log_preds, axis=1)  # from log probabilities to 0 or 1
probs = np.exp(log_preds[:,1])        # pr(dog)

In [149]:
preds

array([1, 1, 1, ..., 1, 1, 1])

In [155]:
celebrity_df = celebrity_df.assign(sentiment=pd.Series(preds))

In [161]:
celebrity_to_tweets = {}
for index, row in celebrity_df.iterrows():
    if row[0] not in celebrity_to_tweets:
        celebrity_to_tweets[row[0]] = []
    else:
        celebrity_to_tweets[row[0]].append({
            'tweet': row[1],
            'sentiment': row['sentiment']
        })

In [163]:
for screen_name, tweets in celebrity_to_tweets.items():
    avg_sentiment = np.mean([t['sentiment'] for t in tweets])
    print(screen_name, avg_sentiment)

BarackObama 0.6448863636363636
realDonaldTrump 0.6454293628808865
KimKardashian 0.7915194346289752
BillGates 0.6658031088082902
Check your answer here: nan
Oprah 0.820580474934037
justinbieber 0.9065040650406504
Beyonce 1.0
TheRock 0.8858858858858859


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [165]:
celebrity_to_tweets['TheRock']

[{'tweet': 'Thank ya K! 🙏🏾❤️ Preciate you rockin w us. @BallersHBO',
  'sentiment': 1},
 {'tweet': 'Thank you 🙏🏾\nWe’re proudly @HBO’s #1 half hour show for years now.',
  'sentiment': 1},
 {'tweet': 'Thx for watchin’ tonight 👊🏾 @BallersHBO', 'sentiment': 1},
 {'tweet': 'Thank ya Benny Boy for rockin’ w us. @BallersHBO Tonight @10pm @HBO 👊🏾',
  'sentiment': 1},
 {'tweet': 'Midnight. Iron Paradise. \nBeen shooting nights all week (9pm-6am) on Jungle Cruise so my hours are all turned around come Saturday night. \nTwo choices: \nCup of tea to relax and watch…',
  'sentiment': 1},
 {'tweet': '* swipe left for my crazyrichasians connection.\nHuge congratulations to my brotha from another, @jonmchu for directing such a brilliant and defining film, #CrazyRichAsians. \nIt’s a…',
  'sentiment': 1},
 {'tweet': '@vulture @farahbrook Me and Farah know how wonderful Twitter is as an outlet to assuage our anxiety. Especially when telling someone to fuck off. It’s quite beautiful 😂',
  'sentiment': 1