<h1>Ditchley S2DS project August 2020 - Pipeline A</h1>
    <h2>Team: Adam Hawken, Luca Lamoni, Elizabeth Nicholson, Robert Webster</h2>
        
This notebook (A_pipeline) will be dedicated to:
- A1: Set up working directories
- A2: Getting journalist twitter handles according to a keyword
- A3: Scrape user information and friend lists for each journalist
- A4: Scrape friends user information using Twitter API
- A5: Scrape journalists' tweets and mentions using Twint and/or the Twitter API
- A6: Journalists tweet data cleaning

<h3>Section A1: Set up working directories</h3>

In [None]:
# Import modules and set up working directory
import sys
import os
import time
import logging
import json
import csv
import threading
import queue
import asyncio 
import nest_asyncio
nest_asyncio.apply()
import twint
import pandas as pd


# Set up working directory
# The working directory should reflect the structure of the GitHub repository https://github.com/S2DSLondon/Aug20_Ditchley
# The repository root is defined once so that the import path and the data directory cannot drift apart.
# To run on another machine, set the DITCHLEY_REPO_ROOT environment variable before starting the kernel;
# when it is not set, the original location is used unchanged (trailing slash kept as in the original,
# since how build_data_dir joins paths is not visible from this notebook).
REPO_ROOT = os.environ.get('DITCHLEY_REPO_ROOT', 'C:/Users/Luca/Aug20_Ditchley/')
# Guard the insert so that re-running this cell does not keep adding the same entry to sys.path
if REPO_ROOT not in sys.path:
    sys.path.insert(1, REPO_ROOT)
from src.data import pipeline_setup
pipeline_setup.build_data_dir(REPO_ROOT)

<h3>Section A2: Getting journalist twitter handles according to a keyword</h3>
- The journalist scraping is performed at the web address https://www.journalism.co.uk/prof/?chunk=0&cmd=default

In [None]:
# Choose keyword and run the scraping function
from src.data import journalists as journos
# `keyword` is reused by the following cells to build the names of the output files
keyword = 'cybersecurity'
# Input: string / Output: list
journo_handles = journos.get_handles_by_keyword(keyword)
# Print how many handles were found, then display the list itself as the cell output
print(len(journo_handles))
journo_handles

<h3>Section A3: Scrape user information and friend lists for each journalist in the list</h3>
    <h4>A3.1: Scrape user information using the Twitter API</h4>

In [None]:
# Load Twitter API credentials and return a tweepy API instance
import json
import tweepy
from src.data import api_tweepy as api

# Input: path of json file with credentials / Output: tweepy.api.API
# The path is relative to the current working directory
# (presumably a folder one level below the repository root - TODO confirm)
tw_api = api.connect_API('../src/data/twitter_credentials.json')

In [None]:
# Scrape user information using the API
from src.data import api_user_tools as api_tools
from src.data import data_cleanup as dc

# Request the user information of every journalist handle
# Input: tweepy.api.API, list / Output: list
api_users = api_tools.batch_request_user_info(tw_api,journo_handles)
# Turn the returned list into a table
# Input: list / Output: DataFrame
df_api = dc.populate_user_df(api_users)
# Check: display the first rows only
df_api.head()

In [None]:
# Save the dataframe as csv; the file name starts with the keyword and the row index is not written
df_api.to_csv('../data/processed/'+keyword+'_user_profiles.csv', index = False)

<h4>A3.2: Scrape user friend list using Twint</h4>

In [None]:
# Load the Twint helper functions
from src.data import twint_tools as tt

# Keyword arguments handed to the scraping function:
#   n_retries = max number of scrape attempts
#   suppress  = hide critical Twint warnings
kwargs = dict(n_retries=5, suppress=False)

# Multi-threaded scrape. Positional inputs: the scraping function (_get_friends), the number of
# threads that work through the queue (change this according to the number of cores on your
# machine) and the list of handles. args = path prefix under which the individual csv files
# are saved, kwargs = see above.
tt.twint_in_queue(
    tt._get_friends,
    3,
    journo_handles,
    args=('../data/raw/' + keyword + '_',),
    kwargs=kwargs,
)

In [None]:
# Concatenate all the individual lists into one dataframe with journalist and its friends
# NOTE: despite its name, friends_csv holds the joined table rather than a file;
# a later cell reads its 'friend' column
friends_csv = tt.join_friends_csv(journo_handles,keyword)
# Save the dataframe as csv (row index not written)
friends_csv.to_csv('../data/processed/'+keyword+'_journalist_friends.csv', index=False)

<h3>Section A4: Scrape friends user information using Twitter API</h3>

In [None]:
# Load Twitter API credentials and return a tweepy API instance
import json
import tweepy
from src.data import api_tweepy as api

# Input: path of json file with credentials / Output: tweepy.api.API
tw_api = api.connect_API('../src/data/twitter_credentials.json')

# Scrape user information using the API
from src.data import api_user_tools as api_tools
from src.data import data_cleanup as dc

# friends_csv is the concatenation of every journalist's friend list, so an account followed by
# several journalists can appear more than once in the 'friend' column. Request each handle only
# once (first-seen order is kept) and skip missing values, so that rate-limited API calls are not
# spent on repeats and the profile table does not contain duplicated rows.
friend_handles = friends_csv['friend'].dropna().drop_duplicates().tolist()
# Input: tweepy.api.API, list / Output: list
api_users_friends = api_tools.batch_request_user_info(tw_api, friend_handles)
# Input: list / Output: DataFrame
df_api_user_friends = dc.populate_user_df(api_users_friends)
# Save the dataframe as csv (row index not written)
df_api_user_friends.to_csv('../data/processed/'+keyword+'_user_friends_profiles.csv', index = False)

<h3>Section A5: Loop over selected journalists handles and scrape their tweets and mentions using either Twint (5.1) or the Twitter API (5.2)</h3>
    <h4>Section A5.1: Scrape tweets using Twint and extract mentions</h4>

In [None]:
from src.data import twint_tools as tt

# Keyword arguments handed to the scraping function.
# date_range is presumably (start, end), with None leaving the end open - TODO confirm in twint_tools
kwargs = dict(
    date_range=('2020-08-01 00:00:00', None),
    n_retries=5,
    suppress=False,
)

# Multi-threaded scrape. Positional inputs: the scraping function (_search_tweets_by_user), the
# number of threads that work through the queue (change this according to the number of cores on
# your machine) and the list of handles. args = path prefix under which the individual csv files
# are saved, kwargs = see above.
tt.twint_in_queue(
    tt._search_tweets_by_user,
    3,
    journo_handles,
    args=('../data/raw/' + keyword + '_',),
    kwargs=kwargs,
)

# Join all the individual csv files into one dataframe
cyber_test = tt.join_tweet_csv(journo_handles, keyword)
# Check: display the first rows only
cyber_test.head()

In [None]:
# Save the joined Twint tweets as csv; the file name starts with the keyword and the row index is not written
cyber_test.to_csv('../data/processed/'+keyword+'_journalist_tweets_twint.csv', index=False)

In [None]:
from src.data import data_cleanup as dc
# From the Twint dataset, extract mentions based on tweet id into a separate table
# (it is written to csv in the next cell)
mentions_twint  = dc.mentions_to_df(cyber_test)
# Check: display the first rows only
mentions_twint.head()

In [None]:
# Save the Twint mentions as csv; the file name starts with the keyword and the row index is not written
mentions_twint.to_csv('../data/processed/' + keyword + '_mentions_twint.csv',index=False)

<h4>Section A5.2: Scrape tweets using Twitter API and extract mentions</h4>

In [None]:
import json
import tweepy
from src.data import api_tweepy as api
# Load Twitter API credentials and return a tweepy API instance
tw_api = api.connect_API('../src/data/twitter_credentials.json')

from src.data.api_tweet_tools import request_user_timeline, batch_request_user_timeline
# Request the timeline of every journalist handle. Nothing is assigned from this call; the next cell
# reads a csv back from '../data/processed/', so the output is presumably written there (TODO confirm).
# n_tweets=3200 presumably matches the Twitter API cap on retrievable timeline tweets (TODO confirm).
batch_request_user_timeline(tw_api, journo_handles, '../data/processed/',  n_tweets=3200)

In [None]:
from src.data import data_cleanup as dc
# Load the tweets scraped through the API back from disk
# NOTE(review): only 'user_timelines_subset_0.csv' is read. The file name suggests further subsets
# may exist, and they would be ignored here - confirm against batch_request_user_timeline
cyber_test_api = pd.read_csv('../data/processed/user_timelines_subset_0.csv')
# From the API dataset, extract mentions based on tweet id into a separate table
# (it is written to csv in the next cell)
mentions_api  = dc.mentions_to_df(cyber_test_api)
# Check: display the first rows only
mentions_api.head()

In [None]:
# Save the API mentions as csv; the file name starts with the keyword and the row index is not written
mentions_api.to_csv('../data/processed/' + keyword + '_mentions_api.csv',index=False)

<h3>Section A6: Tweet data cleaning</h3>
     <h4>A6.1: Cleaning Twint dataset</h4>

In [None]:
# Standardise the Twint output
from src.data import data_cleanup as dc

# Standardize Twint dataset for graph DB loading
standard_tweet_twint = dc.clean_twint_dataframe(cyber_test)
# Save the standardised dataframe as csv (row index not written)
standard_tweet_twint.to_csv('../data/processed/' + keyword + '_standard_tweets_twint.csv',index=False)

<h4>A6.2: Cleaning API dataset</h4>

In [None]:
# Standardise the API output
from src.data import data_cleanup as dc

# Standardize API dataset for graph DB loading
standard_tweet_api = dc.clean_API_dataframe(cyber_test_api)

In [None]:
# Save the standardised API dataframe as csv; the file name starts with the keyword and the row index is not written
standard_tweet_api.to_csv('../data/processed/' + keyword + '_standard_tweets_api.csv',index=False)