# Trialling batch sampling of tweets from user timelines

Package and function imports:

In [1]:
import sys
import os
import time
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd

import tweepy

In [2]:
sys.path.insert(1, '../')

In [3]:
from src.data import journalists as journos
from src.data.api_tweepy import connect_API
from src.data.api_tweet_tools import request_user_timeline
from src.data import data_cleanup as dc

## 1. Import and subsample list of usernames

In [4]:
df_in = pd.read_csv('../data/raw/cyber_journalist_friends_2.csv')
df_in.head()

Unnamed: 0,screen_name,friend
0,jennystrasburg,RobaHusseini
1,jennystrasburg,HashemOsseiran
2,jennystrasburg,liveanthony
3,jennystrasburg,EliseKapNM
4,jennystrasburg,adam_tooze


In [5]:
friends = list(df_in['friend'].unique())
sub_sample = friends

## 2. Init API

In [6]:
api_keys_fp = '../data/twitter_credentials.json'
tw_api = connect_API(api_keys_fp)

## 3. Loop over user sub-sample
Going to try using API code at scale, with cell magic timer to check durations.

In [7]:
N = len(sub_sample)
chunks = 20
N_sub = int(N/chunks)
with tqdm(total=N, desc='Users') as pbar:
    for j in range(chunks): # break up problem into 10 chunks
        tweet_list = [] # new list for each loop
        for i, user in enumerate(sub_sample[j*N_sub:(j+1)*N_sub]):
            results = request_user_timeline(tw_api, user, api_delay=0, kwargs={'tweet_mode':'extended', 'count':200})
            for tweet in results:
                tweet.pop('author')
            tweet_list.extend(results)
            pbar.update(1)
        # pickle current subset of results
        fp = '../data/raw/cyber_friend_tweets_subset_'+str(j)+'.pkl'
        with open(fp, 'wb') as f:
            pickle.dump(tweet_list, f)

Users: 100%|██████████████████████████████████████████████████████████████████▉| 18920/18926 [5:20:17<00:06,  1.02s/it]


Found that including the API delay, which ensures that API requests are uniformly distributed according to the rate limit, seems to more than double the time required.
However, the request rate is too quick without it, need a small delay to handle this gap

## 4. Turn data into dataframe and clean

First, need to load in pickled subsets one by one

In [4]:
def load_pickle_to_df(fp):
    with open(fp, 'rb') as f:
        data = pickle.load(f)
    user_data = []
    entities_data = []
    rt_data = []
    for tweet in data:
        user_data.append(vars(tweet.pop('user')))
        entities_data.append(tweet.pop('entities'))
        
    tmp_tweet_df = pd.DataFrame(data)
    rt_data = [vars(datum) if hasattr(datum, '__dict__') else {} for datum in list(tmp_tweet_df['retweeted_status'].values)]
    tmp_tweet_df.drop(['id_str', 'in_reply_to_status_id_str', 'in_reply_to_user_id_str', 'favorited', 'retweeted', 'retweeted_status'], 1, inplace=True) # drop columns that duplicate info
    tmp_tweet_df.rename(columns={'id':'tweet_id', 'created_at':'tweet_created_at', 'full_text':'text'}, inplace=True) # rename tweet id column
    
    tmp_user_df = pd.DataFrame(user_data)
    tmp_user_df.drop(tmp_user_df.columns.difference(['id','name', 'screen_name']), 1, inplace=True) # drop user info we don't want
    tmp_user_df.rename(columns={'id':'user_id'}, inplace=True) # rename user id col
    
    tmp_ent_df = pd.DataFrame(entities_data)
    tmp_ent_df['hashtags'] = tmp_ent_df['hashtags'].apply(lambda x : [hashtag['text'] for hashtag in x]) # turn hashtags into list of strings
    tmp_ent_df['user_mentions'] = tmp_ent_df['user_mentions'].apply(lambda x : [usr['screen_name'] for usr in x])
    tmp_ent_df.rename(columns={'user_mentions':'mentions'}, inplace=True)
    
    tmp_rt_df = pd.DataFrame(rt_data)
    tmp_rt_df['rt_user_id'] = tmp_rt_df['user'].apply(lambda x : x.id if hasattr(x, 'id') else None)
    tmp_rt_df['rt_screen_name'] = tmp_rt_df['user'].apply(lambda x : x.screen_name if hasattr(x, 'screen_name') else None)
    tmp_rt_df['rt_text'] = tmp_rt_df['full_text']
    tmp_rt_df.drop(tmp_rt_df.columns.difference(['id', 'rt_user_id', 'rt_screen_name', 'rt_text']), 1, inplace=True)
    tmp_rt_df.rename(columns={'id':'rt_id'}, inplace=True)
    
    tweet_df = pd.concat([tmp_user_df, tmp_tweet_df, tmp_ent_df, tmp_rt_df], axis=1, sort=False)
    
    return tweet_df

In [None]:
for i in range(20):
    subset_df = load_pickle_to_df('../data/raw/cyber_friend_tweets_subset_'+str(i)+'.pkl')
    midpoint = int(subset_df.shape[0]/2)
    subsetA = subset_df.iloc[:midpoint]
    subsetB = subset_df.iloc[midpoint:]
    subsetA.to_csv('../data/raw/cyber_friends_tweets_subset_'+str(2*i)+'.csv', index=False)
    subsetB.to_csv('../data/raw/cyber_friends_tweets_subset_'+str((2*i)+1)+'.csv', index=False)

In [18]:
test_df.memory_usage(deep=True).sum()/(1024**2)

294.69163703918457

In [7]:
df_cleaned = dc.standard_tweet_dataset_setup()
df_cleaned = dc.fill_standard_tweet_dataset_with_API(df_cleaned, test_df)

KeyError: 'retweet_text'

TypeError: vars() argument must have __dict__ attribute