<h1>Ditchley S2DS project August 2020 - Code Pipeline</h1>
    <h2>Team: Adam Hawken, Luca Lamoni, Elizabeth Nicholson, Robert Webster</h2>

In [None]:
# Import modules and set up working directory
import sys
import os
import time
import logging
import json
import csv
import threading
import queue
import asyncio 
import nest_asyncio
nest_asyncio.apply()
import twint
import pandas as pd

# Make the project's `src` package importable without a machine-specific absolute path.
# The notebook addresses project files relative to its own folder ('../data/...',
# '../src/data/...'), so the parent of the working directory is taken as the project root.
# Set the DITCHLEY_PROJECT_ROOT environment variable to point somewhere else.
project_root = os.environ.get('DITCHLEY_PROJECT_ROOT', os.path.abspath('..'))
sys.path.insert(1, project_root)

<h3>Section 1: Getting journalist Twitter handles according to a keyword</h3>
    <h4>The journalist scraping is performed at the web address https://www.journalism.co.uk/prof/?chunk=0&cmd=default</h4>

In [None]:
from src.data import journalists as journos

# Topic used to look up journalists; later cells also reuse it as a file-name prefix
keyword = 'cybersecurity'
# Handles returned by the keyword lookup
journo_handles = journos.get_handles_by_keyword(keyword)
# Report how many handles came back
n_handles = len(journo_handles)
print(n_handles)

<h3>Section 2. Loop over the selected journalists' handles and scrape their tweets (2.1), mentions (2.2), list of friends (2.3) and user information (2.4) using Twint and the Twitter API</h3>
    <h4>Section 2.1: Scrape tweets using Twint</h4>

In [None]:
from src.data import twint_tools as tt

# Options passed to the queue helper through its `kwargs` parameter
kwargs = {
    'date_range': ('2020-08-01 00:00:00', None),
    'n_retries': 5,
    'suppress': False,
}
# Keyword-specific prefix under the raw data folder, passed as the only positional argument
raw_prefix = '../data/raw/' + keyword + '_'
# Hand the tweet-search function and the handles to the multi-threaded queue helper
tt.twint_in_queue(tt._search_tweets_by_user, 3, journo_handles, args=(raw_prefix,), kwargs=kwargs)

In [None]:
# Join the individual per-journalist tweet CSVs into one dataframe
# (presumably the files written by the scraping cell above — TODO confirm against join_tweet_csv)
cyber_test = tt.join_tweet_csv(journo_handles, keyword)
# Preview the first rows as a sanity check
cyber_test.head()

In [None]:
# Standardise the Twint output
from src.data import data_cleanup as dc

# Create the standardised template dataframe
test_twint = dc.standard_tweet_dataset_setup('test_twint')
# Fill the template with the joined Twint tweets.
# (A bare `test_twint` expression used to sit here; it displayed nothing because only the
# last expression of a cell is rendered, so it has been removed.)
standard_tweet_twint = dc.fill_standard_tweet_dataset_with_twint(test_twint, cyber_test)
# Preview the first rows as a sanity check
standard_tweet_twint.head()

In [None]:
# Write the standardised tweets to the processed-data folder, without the index column
standard_tweets_path = '../data/processed/' + keyword + '_standard_tweets_twint.csv'
standard_tweet_twint.to_csv(standard_tweets_path, index=False)

<h4>Section 2.2: Extract mentions from Twint dataset</h4>

In [None]:
# From the joined Twint dataset, extract the mentions (based on tweet id) into their own
# dataframe; the following cell saves it to a separate CSV
mentions_twint  = dc.twint_mentions_to_df(cyber_test)
# Preview the first rows as a sanity check
mentions_twint.head()

In [None]:
# Write the mentions table to the processed-data folder, without the index column
mentions_path = '../data/processed/' + keyword + '_mentions_twint.csv'
mentions_twint.to_csv(mentions_path, index=False)

<h4>Section 2.3: Scrape list of friends for each journalist using Twint</h4>

In [None]:
from src.data import twint_tools as tt

# Options passed to the queue helper through its `kwargs` parameter
# (no 'date_range' entry is set for the friends scrape)
kwargs = {
    'n_retries': 5,
    'suppress': False,
}
# Keyword-specific prefix under the raw data folder, passed as the only positional argument
friends_prefix = '../data/raw/' + keyword + '_'
# Hand the friends-scraping function and the handles to the multi-threaded queue helper
tt.twint_in_queue(tt._get_friends, 3, journo_handles, args=(friends_prefix,), kwargs=kwargs)

In [None]:
# Concatenate the individual friend lists into one dataframe of journalists and their friends
friends_csv = tt.join_friends_csv(journo_handles,keyword) # FIXME: known bug in join_friends_csv — the first friend name comes out as 'username'

In [None]:
# Write the journalist/friends table to the raw-data folder, without the index column
friends_path = '../data/raw/' + keyword + '_journalist_friends.csv'
friends_csv.to_csv(friends_path, index=False)

<h4>Section 2.4: Scrape journalist user information using Twitter API</h4>

In [None]:
import json
import tweepy
from src.data import api_tweepy as api

# JSON file holding the Twitter API credentials
credentials_path = '../src/data/twitter_credentials.json'
# Load the credentials and obtain the tweepy API instance
tw_api = api.connect_API(credentials_path)

In [None]:
# Scrape user information using the API
from src.data import api_user_tools as api_tools
from src.data import data_cleanup as dc
# Request the user information for the journalist handles through the API instance
api_users = api_tools.batch_request_user_info(tw_api,journo_handles)
# Turn the API result into a dataframe
df_api = dc.populate_user_df(api_users)
# Preview the first rows as a sanity check
df_api.head()

In [None]:
# Write the user profiles to the raw-data folder, without the index column
profiles_path = '../data/raw/' + keyword + '_user_profiles.csv'
df_api.to_csv(profiles_path, index=False)

<h3>Section 3. Loop over the selected journalists' handles and scrape their tweets (3.1), mentions (3.2), list of friends (3.3) and user information (3.4) using the Twitter API and Twint</h3>
    <h4>Section 3.1: Scrape tweets using Twitter API</h4>

In [None]:
import json
import tweepy
from src.data import api_tweepy as api

# JSON file holding the Twitter API credentials
credentials_path = '../src/data/twitter_credentials.json'
# Load the credentials and obtain the tweepy API instance
tw_api = api.connect_API(credentials_path)

<h4>Section 3.2: Extract mentions from API tweets</h4>


<h4>Section 3.3: Scrape list of friends for each journalist using Twint</h4>

In [None]:
from src.data import twint_tools as tt

# Options passed to the queue helper through its `kwargs` parameter
# (no 'date_range' entry is set for the friends scrape)
kwargs = {
    'n_retries': 5,
    'suppress': False,
}
# Keyword-specific prefix under the raw data folder, passed as the only positional argument
friends_prefix = '../data/raw/' + keyword + '_'
# Hand the friends-scraping function and the handles to the multi-threaded queue helper
tt.twint_in_queue(tt._get_friends, 3, journo_handles, args=(friends_prefix,), kwargs=kwargs)

In [None]:
# Concatenate the individual friend lists into one dataframe of journalists and their friends
friends_csv = tt.join_friends_csv(journo_handles,keyword) # FIXME: known bug in join_friends_csv — the first friend name comes out as 'username'

In [None]:
# Write the journalist/friends table to the raw-data folder, without the index column
friends_path = '../data/raw/' + keyword + '_journalist_friends.csv'
friends_csv.to_csv(friends_path, index=False)

<h4>Section 3.4: Scrape journalist user information using Twitter API</h4>

In [None]:
import json
import tweepy
from src.data import api_tweepy as api

# JSON file holding the Twitter API credentials
credentials_path = '../src/data/twitter_credentials.json'
# Load the credentials and obtain the tweepy API instance
tw_api = api.connect_API(credentials_path)

In [None]:
# Scrape user information using the API
from src.data import api_user_tools as api_tools
from src.data import data_cleanup as dc
# Request the user information for the journalist handles through the API instance
api_users = api_tools.batch_request_user_info(tw_api,journo_handles)
# Turn the API result into a dataframe
df_api = dc.populate_user_df(api_users)
# Preview the first rows as a sanity check
df_api.head()

In [None]:
# Write the user profiles to the raw-data folder, without the index column
profiles_path = '../data/raw/' + keyword + '_user_profiles.csv'
df_api.to_csv(profiles_path, index=False)

<h3>Section 4. Create graph database and import Twitter data into it</h3>
    <h4>Section 4.1: Download modules, set up libraries and load graph database</h4>

In [None]:
# Switch the kernel's working directory to the Neo4j installation folder.
# `!cd ...` ran in a throw-away subshell, so the directory change was lost as soon as the
# command returned; os.chdir changes it for the kernel itself, and later `!` commands inherit it.
# NOTE: relative paths used by earlier cells ('../data/...') no longer resolve after this cell.
os.chdir("C:/Users/Luca/.Neo4jDesktop/neo4jDatabases/database-2044c9dc-0a4d-4713-be0a-bcb0001ce4a4/installation-4.1.0")

In [None]:
!pip install pytest-cov
!pip install pytest-filter-subpackage
!pip install py2neo

In [None]:
# import standard libraries
import numpy as np
import pandas as pd

from py2neo import Graph
from py2neo.data import Node, Relationship

# Load / declare the database: connect to the local Neo4j instance over Bolt.
# The password is read from the NEO4J_PASSWORD environment variable so that it does not
# have to live in the notebook; the previous hard-coded value remains as a fallback so
# existing set-ups keep working.
graph = Graph("bolt://localhost:7687", user="neo4j",
              password=os.environ.get("NEO4J_PASSWORD", "tweetoftheday"))
# Begin a transaction; as the cell's last expression, its result is shown as the cell output
graph.begin()

<h4>Section 4.2: Load data for the graph database</h4>

In [None]:
# Start with an empty graph.
# WARNING: do not run this cell if the database already holds data you want to keep.
graph.delete_all()

In [None]:
# First we load our tweet information. To do this, the file containing the tweets is moved
# into the import/ directory, from where the LOAD CSV query below reads it via file:///.
# NOTE(review): the relative target 'import/' only resolves if the working directory is the
# Neo4j installation folder, and this file name differs from the one saved in Section 2.1
# (keyword + '_standard_tweets_twint.csv') — confirm which file is meant.
!mv ~/Downloads/standardised_cyber_tweets.csv import/

<h4>Section 4.3. Create nodes representing tweets and users</h4>

In [None]:
# Below, we use three Cypher clauses. The first (LOAD CSV) reads the file row by row. The second (CREATE) makes a node for each tweet. The third (MERGE) makes a node for each person.
# Note that "CREATE" and "MERGE" are different. "CREATE" always makes a new node, even if an identical node already exists, so running it twice produces duplicates.
# "MERGE" first looks for a node matching the whole pattern (the label and every listed property) and creates one only if no match is found; it does not update an existing node unless ON MATCH SET is used.

In [None]:
# Load tweets and Twitter user information from the CSV in Neo4j's import directory.
# For every CSV row the query creates one Tweet node and merges one Person node.
# Fix: retweets_count previously read the misspelled column `row.reteets_count`, which
# left the property unset on every Tweet node.
# NOTE(review): LOAD CSV presumably yields every field as a string, so the *_count
# properties are stored as strings unless converted (e.g. with toInteger) — confirm.
# NOTE(review): MERGE matches on all listed Person properties at once; a row with a missing
# value or a changed count may fail or create a second Person for the same user_id — confirm.
query_string = '''
   LOAD CSV WITH HEADERS FROM "file:///standardised_cyber_tweets.csv" AS row
   
   CREATE (t:Tweet {tweet_id: row.tweet_id, conversation_id: row.conversation_id, user_id: row.user_id, 
   reply_to: row.reply_to, tweet_created_at_date: row.tweet_created_at_date, 
   tweet_created_at_time: row.tweet_created_at_time, text: row.text, replies_count: row.replies_count, 
   retweets_count: row.retweets_count, favourite_count: row.favourite_count, likes_count: row.likes_count,
   hashtags: row.hashtags, topics: row.topics})
   
   MERGE (p:Person {user_id: row.user_id, screen_name: row.screen_name, name: row.name, 
   user_description: row.user_description, user_friends_n: row.user_friends_n, user_followers_n: row.user_followers_n, 
   prof_created_at: row.prof_created_at, favourites_count: row.favourites_count, verified: row.verified, 
   statuses_count: row.statuses_count});
   '''
# Run the Cypher query against the connected graph
graph.run(query_string)