# Trialling batch sampling of tweets from user timelines

Package and function imports:

In [1]:
import sys
import os
import time
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd

import tweepy

In [2]:
sys.path.insert(1, '../')

In [3]:
from src.data import journalists as journos
from src.data.api_tweepy import connect_API
from src.data.api_tweet_tools import request_user_timeline
from src.data import data_cleanup as dc

## 1. Import and subsample list of usernames

In [4]:
df_in = pd.read_csv('../data/raw/cyber_journalist_friends_2.csv')
df_in.head()

Unnamed: 0,screen_name,friend
0,jennystrasburg,RobaHusseini
1,jennystrasburg,HashemOsseiran
2,jennystrasburg,liveanthony
3,jennystrasburg,EliseKapNM
4,jennystrasburg,adam_tooze


In [5]:
friends = list(df_in['friend'].unique())
sub_sample = friends

## 2. Init API

In [6]:
api_keys_fp = '../data/twitter_credentials.json'
tw_api = connect_API(api_keys_fp)

## 3. Loop over user sub-sample
Going to try using API code at scale, with cell magic timer to check durations.

In [7]:
N = len(sub_sample)
chunks = 20
N_sub = int(N/chunks)
with tqdm(total=N, desc='Users') as pbar:
    for j in range(chunks): # break up problem into 10 chunks
        tweet_list = [] # new list for each loop
        for i, user in enumerate(sub_sample[j*N_sub:(j+1)*N_sub]):
            results = request_user_timeline(tw_api, user, api_delay=0, kwargs={'tweet_mode':'extended', 'count':200})
            for tweet in results:
                tweet.pop('author')
            tweet_list.extend(results)
            pbar.update(1)
        # pickle current subset of results
        fp = '../data/raw/cyber_friend_tweets_subset_'+str(j)+'.pkl'
        with open(fp, 'wb') as f:
            pickle.dump(tweet_list, f)

Users: 100%|██████████████████████████████████████████████████████████████████▉| 18920/18926 [5:20:17<00:06,  1.02s/it]


Found that including the API delay, which ensures that API requests are uniformly distributed according to the rate limit, seems to more than double the time required.
However, the request rate is too quick without it, need a small delay to handle this gap

## 4. Turn data into dataframe and clean

First, need to load in pickled subsets one by one

In [4]:
def load_pickle_to_df(fp):
    with open(fp, 'rb') as f:
        data = pickle.load(f)
    user_data = []
    entities_data = []
    rt_data = []
    for tweet in data:
        user_data.append(vars(tweet.pop('user')))
        entities_data.append(tweet.pop('entities'))
        
    tmp_tweet_df = pd.DataFrame(data)
    rt_data = [vars(datum) if hasattr(datum, '__dict__') else {} for datum in list(tmp_tweet_df['retweeted_status'].values)]
    tmp_tweet_df.drop(['id_str', 'in_reply_to_status_id_str', 'in_reply_to_user_id_str', 'favorited', 'retweeted', 'retweeted_status'], 1, inplace=True) # drop columns that duplicate info
    tmp_tweet_df.rename(columns={'id':'tweet_id', 'created_at':'tweet_created_at', 'full_text':'text'}, inplace=True) # rename tweet id column
    
    tmp_user_df = pd.DataFrame(user_data)
    tmp_user_df.drop(tmp_user_df.columns.difference(['id','name', 'screen_name']), 1, inplace=True) # drop user info we don't want
    tmp_user_df.rename(columns={'id':'user_id'}, inplace=True) # rename user id col
    
    tmp_ent_df = pd.DataFrame(entities_data)
    tmp_ent_df['hashtags'] = tmp_ent_df['hashtags'].apply(lambda x : [hashtag['text'] for hashtag in x]) # turn hashtags into list of strings
    tmp_ent_df['user_mentions'] = tmp_ent_df['user_mentions'].apply(lambda x : [usr['screen_name'] for usr in x])
    tmp_ent_df.rename(columns={'user_mentions':'mentions'}, inplace=True)
    
    tmp_rt_df = pd.DataFrame(rt_data)
    tmp_rt_df['rt_user_id'] = tmp_rt_df['user'].apply(lambda x : x.id if hasattr(x, 'id') else None)
    tmp_rt_df['rt_screen_name'] = tmp_rt_df['user'].apply(lambda x : x.screen_name if hasattr(x, 'screen_name') else None)
    tmp_rt_df['rt_text'] = tmp_rt_df['full_text']
    tmp_rt_df.drop(tmp_rt_df.columns.difference(['id', 'rt_user_id', 'rt_screen_name', 'rt_text']), 1, inplace=True)
    tmp_rt_df.rename(columns={'id':'rt_id'}, inplace=True)
    
    tweet_df = pd.concat([tmp_user_df, tmp_tweet_df, tmp_ent_df, tmp_rt_df], axis=1, sort=False)
    
    return tweet_df

In [None]:
for i in range(20):
    subset_df = load_pickle_to_df('../data/raw/cyber_friend_tweets_subset_'+str(i)+'.pkl')
    midpoint = int(subset_df.shape[0]/2)
    subsetA = subset_df.iloc[:midpoint]
    subsetB = subset_df.iloc[midpoint:]
    subsetA.to_csv('../data/raw/cyber_friends_tweets_subset_'+str(2*i)+'.csv', index=False)
    subsetB.to_csv('../data/raw/cyber_friends_tweets_subset_'+str((2*i)+1)+'.csv', index=False)

In [18]:
test_df.memory_usage(deep=True).sum()/(1024**2)

294.69163703918457

In [None]:
df_cleaned = dc.standard_tweet_dataset_setup()
df_cleaned = dc.fill_standard_tweet_dataset_with_API(df_cleaned, test_df)

TypeError: vars() argument must have __dict__ attribute

In [5]:
subset_df = pd.read_csv('../data/raw/cyber_friend_tweets/cyber_friends_tweets_subset_0.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
test = dc.init_cleaned_tweet_df()

In [21]:
trial = pd.concat([test, subset_df[subset_df.columns.intersection(test.columns)]])

In [22]:
trial.head()

Unnamed: 0,user_id,screen_name,name,tweet_id,text,tweet_created_at,conversation_id,in_reply_to_status_id,rt_id,rt_user_id,rt_screen_name,rt_text,in_reply_to_user_id,in_reply_to_screen_name,replies_count,retweets_count,likes_count,hashtags
0,363675945,RobaHusseini,Roba El Husseini ربى,1294203275043110912,"Beirut blast: who knew what, when? \nMy latest...",2020-08-14 09:24:40,,,,,,,,,,,,[]
1,363675945,RobaHusseini,Roba El Husseini ربى,1293934001514848256,RT @LayalAFP: #Breaking via @AFP #FBI to join...,2020-08-13 15:34:40,,,1.293919e+18,546784200.0,LayalAFP,#Breaking via @AFP #FBI to join #Beirut blast...,,,,,,"['Breaking', 'FBI', 'Beirut', 'probe']"
2,363675945,RobaHusseini,Roba El Husseini ربى,1293580602055299072,RT @AFP_Beirut: #BREAKING \nLebanon prosecutor...,2020-08-12 16:10:23,,,1.29358e+18,9.725363e+17,AFP_Beirut,#BREAKING \nLebanon prosecutor to question sev...,,,,,,"['BREAKING', 'BeirutBlast']"
3,363675945,RobaHusseini,Roba El Husseini ربى,1293160356240592896,"When the boy saw blood on his feet, ""he starte...",2020-08-11 12:20:29,,,,,,,,,,,,[]
4,363675945,RobaHusseini,Roba El Husseini ربى,1292555363393458179,RT @DrMaramAlsheikh: https://t.co/tO4WexWyui,2020-08-09 20:16:27,,,1.292555e+18,251765200.0,DrMaramAlsheikh,https://t.co/tO4WexWyui,,,,,,[]


In [24]:
lowercase_list = ['in_reply_to_screen_name', 'screen_name', 'rt_screen_name', 'hashtags']
for item in lowercase_list:
    trial[item] = trial[item].str.lower()
trial.head()

Unnamed: 0,user_id,screen_name,name,tweet_id,text,tweet_created_at,conversation_id,in_reply_to_status_id,rt_id,rt_user_id,rt_screen_name,rt_text,in_reply_to_user_id,in_reply_to_screen_name,replies_count,retweets_count,likes_count,hashtags
0,363675945,robahusseini,Roba El Husseini ربى,1294203275043110912,"Beirut blast: who knew what, when? \nMy latest...",2020-08-14 09:24:40,,,,,,,,,,,,[]
1,363675945,robahusseini,Roba El Husseini ربى,1293934001514848256,RT @LayalAFP: #Breaking via @AFP #FBI to join...,2020-08-13 15:34:40,,,1.293919e+18,546784200.0,layalafp,#Breaking via @AFP #FBI to join #Beirut blast...,,,,,,"['breaking', 'fbi', 'beirut', 'probe']"
2,363675945,robahusseini,Roba El Husseini ربى,1293580602055299072,RT @AFP_Beirut: #BREAKING \nLebanon prosecutor...,2020-08-12 16:10:23,,,1.29358e+18,9.725363e+17,afp_beirut,#BREAKING \nLebanon prosecutor to question sev...,,,,,,"['breaking', 'beirutblast']"
3,363675945,robahusseini,Roba El Husseini ربى,1293160356240592896,"When the boy saw blood on his feet, ""he starte...",2020-08-11 12:20:29,,,,,,,,,,,,[]
4,363675945,robahusseini,Roba El Husseini ربى,1292555363393458179,RT @DrMaramAlsheikh: https://t.co/tO4WexWyui,2020-08-09 20:16:27,,,1.292555e+18,251765200.0,drmaramalsheikh,https://t.co/tO4WexWyui,,,,,,[]


In [12]:
users = list(set(list(subset_df['screen_name'].values)))
print(len(users))

469


In [16]:
subset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90345 entries, 0 to 90344
Data columns (total 39 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   user_id                  90345 non-null  int64  
 1   name                     90345 non-null  object 
 2   screen_name              90345 non-null  object 
 3   tweet_created_at         90345 non-null  object 
 4   tweet_id                 90345 non-null  int64  
 5   text                     90345 non-null  object 
 6   truncated                90345 non-null  bool   
 7   display_text_range       90345 non-null  object 
 8   source                   90344 non-null  object 
 9   source_url               90344 non-null  object 
 10  in_reply_to_status_id    29629 non-null  float64
 11  in_reply_to_user_id      30060 non-null  float64
 12  in_reply_to_screen_name  30060 non-null  object 
 13  geo                      86 non-null     object 
 14  coordinates           

In [19]:
trial_df = subset_df[subset_df['screen_name']==users[0]]

In [24]:
trial_df['total_engagements'] = trial_df['retweet_count']+trial_df['favorite_count']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [26]:
def hIndex(citations):
    """
    :type citations: List[int]
    """
    if not citations:
        return 0
    citations.sort()
    for i in range(1,len(citations)+1)[::-1]:
        if citations[-i] >= i:
            return i
    return 0

In [28]:
user_indices = {}
for user in users:
    user_df = subset_df[subset_df['screen_name']==user]
    user_df['total_engagements'] = user_df['retweet_count']+user_df['favorite_count']
    user_indices[user] = hIndex(list(user_df['total_engagements'].values))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [30]:
sort_users = sorted(user_indices.items(), key=lambda x: x[1], reverse=True)

In [35]:
sort_users[:50]

[('cutedog_f', 195),
 ('BarbraStreisand', 195),
 ('KyloR3n', 193),
 ('sarahcpr', 191),
 ('StateDept', 188),
 ('GretaThunberg', 179),
 ('RoyalFamily', 179),
 ('pattonoswalt', 177),
 ('ncwhm', 176),
 ('EsperDoD', 174),
 ('rgoodlaw', 171),
 ('neal_katyal', 170),
 ('SenSasse', 168),
 ('Bandreescu_', 161),
 ('MalcolmNance', 160),
 ('Simone_Biles', 159),
 ('MKBHD', 153),
 ('michaelharriot', 152),
 ('PatinkinMandy', 151),
 ('ezraklein', 147),
 ('Sifill_LDF', 142),
 ('GeoffRBennett', 141),
 ('RandyRainbow', 137),
 ('Vegalteno', 135),
 ('Jkylebass', 134),
 ('AljazSkorjanec', 133),
 ('olgaNYC1211', 130),
 ('BoutrousTed', 127),
 ('NatSecLisa', 126),
 ('shomaristone', 126),
 ('MollyJongFast', 124),
 ('MNateShyamalan', 124),
 ('WHNSC', 122),
 ('MatthewACherry', 121),
 ('chunkbardey', 121),
 ('NCSCgov', 121),
 ('KimletGordon', 119),
 ('TwoPaddocks', 116),
 ('gw27', 115),
 ('PickardJE', 114),
 ('EricTopol', 112),
 ('IanMcKellen', 110),
 ('sparkleaddict', 109),
 ('classicdad', 108),
 ('AndyBeshearKY',

## Cleaning Tweets
Code blocks below are from Liz:

In [None]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text) #remove urls
    text=re.sub(r'\S+\.com\S+','',text) #remove urls
    text=re.sub(r'\@\w+','',text) #remove mentions
    text =re.sub(r'\#\w+','',text) #remove hashtags
    return text
df['clean_tweet'] = df['text'].apply(lambda x: clean_text(x))

In [None]:
def normalized_df_text(text): #normalizing, stop word removal, & lementizing
    normalized =[]
    for i in range(0,text.shape[0]):
        norm_temp = re.findall(r'[A-Za-z]+',text.iloc[i]) #taking all words (leaving punctuation out)
        norm_temp = [w.lower() for w in norm_temp] # putting words in lower case
        norm_temp = [w for w in norm_temp if not w in stop_words] # removing stopwords
        lemma = WordNetLemmatizer()
        norm_temp = [lemma.lemmatize(w, pos = "v") for w in norm_temp] #lemmatizing verbs
        norm_temp = [lemma.lemmatize(w, pos = "n") for w in norm_temp] #lemmatizing nouns
        norm_temp = [' '.join(norm_temp)]
        normalized.append(norm_temp)
    return normalized
normalized = normalized_df_text(df['clean_tweet'])
df['normalized_tweet'] = normalized

In [None]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text) #remove urls
    text=re.sub(r'\S+\.com\S+','',text) #remove urls
    text=re.sub(r'\@\w+','',text) #remove mentions
    text =re.sub(r'\#','',text) #remove hashtags
    text = re.findall(r'[A-Za-z]+',text)
    text = [' '.join(text)]
    return text
df['clean_tweet'] = df['text'].apply(lambda x: clean_text(x))

Now to wrangle these into a form that I can use