In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

In [2]:
# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [3]:
import os
import sys
import json

if os.path.abspath('../..') not in sys.path:
    sys.path.append(os.path.abspath('../..'))

In [4]:
import tqdm
import datetime
import hashlib

import numpy as np
import pandas as pd

from dateutil.relativedelta import relativedelta

from src.dataset import load_dataset
from src.corpus import load_tweets

In [5]:
df = load_dataset()

In [6]:
df[['subject_id', 'participant']] \
    .drop_duplicates() \
    .reset_index(drop=False)['participant'] \
    .value_counts()

True     1489
False    1483
Name: participant, dtype: int64

In [7]:
tweets_collection = []

successful = 0
for item in tqdm.tqdm(df.itertuples(), total=df.shape[0], desc='Extracting Tweets'):
    if pd.isna(item.datetime):
        continue
    # event_time_month_start
    event_time = datetime.datetime.combine(item.datetime, datetime.time.min).replace(day=1)
    # part 1
    start_time = event_time + relativedelta(months=-11)
    current_year, current_month = start_time.year, start_time.month
    before_months = []
    while (current_year < event_time.year) or \
        ((current_year == event_time.year) and (current_month < event_time.month)):
        before_months.append({'twitter': item.twitter, 'year': current_year, 'month': current_month, 'lang': 'en'})
        if current_month == 12:
            current_month = 1
            current_year += 1
        else:
            current_month += 1
    # part 2
    end_time = event_time + relativedelta(months=+11)
    current_year, current_month = event_time.year, event_time.month
    after_months = []
    while (current_year < end_time.year) or \
        ((current_year == end_time.year) and (current_month < end_time.month)):
        # skips the event month
        if current_month == 12:
            current_month = 1
            current_year += 1
        else:
            current_month += 1
        after_months.append({'twitter': item.twitter, 'year': current_year, 'month': current_month, 'lang': 'en'})
    try:
        tc = []
        for t in load_tweets(filters=before_months, verbose=0):
            tc.append({'subject_id': item.subject_id, 'event_id': int(item.event_id), 'type': 'intra-subject', 'group': 'Before', 'tweet': t})
        for t in load_tweets(filters=after_months, verbose=0):
            tc.append({'subject_id': item.subject_id, 'event_id': int(item.event_id), 'type': 'intra-subject', 'group': 'After', 'tweet': t})
        successful += 1
        tweets_collection += tc
    except FileNotFoundError as e:
        pass

Extracting Tweets: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3659/3659 [04:20<00:00, 14.04it/s]


In [17]:
tweets_collection[0]['tweet']

{'created_at': '2020-05-13T10:03:16.000Z',
 'lang': 'en',
 'conversation_id': '1260461125252059138',
 'text': '@AspiringKeya @Pocket but fascinating',
 'public_metrics': {'retweet_count': 0,
  'reply_count': 0,
  'like_count': 0,
  'quote_count': 0},
 'possibly_sensitive': False,
 'entities': {'mentions': [{'start': 14, 'end': 21, 'username': 'Pocket'}]},
 'reply_settings': 'everyone',
 'id': '1260510917495869441',
 'in_reply_to_user_id': '775772363673702400',
 'author_id': '68354597',
 'source': 'Twitter Web App',
 'referenced_tweets': [{'type': 'replied_to', 'id': '1260461951794151425'}],
 'username': 'TomAckermanWx'}

In [11]:
tweets_df = pd.DataFrame(tweets_collection)

In [16]:
tweets_df[['subject_id', 'event_id']].drop_duplicates().shape

(887, 2)

In [8]:
len(tweets_collection), successful

(880682, 1285)

In [54]:
is_rt = []

for record in tqdm.tqdm(tweets_collection, desc='Extracting RTs'):
    tweet = record['tweet']
    is_rt.append(tweet['text'].upper().startswith('RT'))

is_rt = np.array(is_rt)

rt = dict(zip(*np.unique(is_rt, return_counts=True)))

print('% of new messages: {:0.2f}%'.format(rt[False] / (rt[True] + rt[False])))

Extracting RTs: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 880682/880682 [00:03<00:00, 266246.21it/s]


% of new messages: 0.69%


In [55]:
with open('../../data/interim/tweets_intra_subject_analysis.jsonl', 'w') as fp:
    for item in tqdm.tqdm(tweets_collection, desc='Writing to File'):
        json.dump(item, fp)
        fp.write('\n')

Writing to File: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 880682/880682 [01:20<00:00, 10899.99it/s]
