# Tweet Turing Test: Detecting Disinformation on Twitter  

|          | Group #2 - Disinformation Detectors                     |
|---------:|---------------------------------------------------------|
| Members  | John Johnson, Katy Matulay, Justin Minnion, Jared Rubin |
| Notebook | `01_merge.ipynb`                                        |
| Purpose  | Merge together acquired data into a common format.      |

Our acquired data is stored in the following directory structure:  
- `/` (root project directory)
    - `/data`
        - `/data/raw`
            - `/data/raw/tweets-govt_entities`
            - `/data/raw/tweets-individuals`
            - `/data/raw/tweets-news_orgs`
            - `/data/raw/tweets-random`
        - `/data/processed`
            - `/data/processed/...`

> (TODO - write more explaining notebook)

# 1. Setup

In [2]:
# imports from Python standard library
import json
import logging
import os

# imports requiring installation
#   connection to Google Cloud Storage
from google.cloud import storage            # pip install google-cloud-storage
from google.oauth2 import service_account   # pip install google-auth

#  data science packages
import demoji                               # pip install demoji
import numpy as np                          # pip install numpy
import pandas as pd                         # pip install pandas

In [3]:
# imports from tweet_turing.py
from tweet_turing import get_json_files, merge_json_files, get_csv_files, merge_csv_files, \
    get_gcp_storage_client, get_gcp_bucket, list_gcp_objects, get_gcp_object_as_json, \
    get_gcp_object_as_text, set_gcp_object_from_json, merge_gcp_json_files, merge_gcp_csv_files, \
    set_gcp_object_from_df_as_parq

# imports from tweet_turing_paths.py
from tweet_turing_paths import local_data_paths, local_snapshot_paths, gcp_data_paths, \
    gcp_snapshot_paths, gcp_project_name, gcp_bucket_name, gcp_key_file

## Local or Cloud?
Decide here whether to run notebook with local data or GCP bucket data
 - if the working directory of this notebook has a "../data/" folder with data loaded (e.g. working on local computer or have data files loaded to a cloud VM) then use the "local files" option and comment out the "gcp bucket files" option
 - if this notebook is being run from a GCP VM (preferably in the `us-central1` location) then use the "gcp bucket files" option and comment out the "local files" option

In [4]:
# option: local files
#local_or_cloud: str = "local"   # comment/uncomment this line or next

# option: gcp bucket files
local_or_cloud: str = "cloud"   # comment/uncomment this line or previous

# don't comment/uncomment for remainder of cell
# reject anything other than the two supported modes up front
#   (subsequent cells will not repeat this validity check)
if local_or_cloud not in ("local", "cloud"):
    raise ValueError("Variable 'local_or_cloud' can only take on one of two values, 'local' or 'cloud'.")

# pick the path dictionaries matching the selected mode
data_paths = local_data_paths if local_or_cloud == "local" else gcp_data_paths
snapshot_paths = local_snapshot_paths if local_or_cloud == "local" else gcp_snapshot_paths

In [5]:
# GCP handles are only required when local_or_cloud=="cloud";
#   running this cell in "local" mode is harmless and leaves both set to None
gcp_storage_client: storage.Client = None
gcp_bucket: storage.Bucket = None

if local_or_cloud == "cloud":
    # build the storage client first, then use it to look up the project bucket
    gcp_storage_client = get_gcp_storage_client(
        project_name=gcp_project_name,
        key_file=gcp_key_file,
    )
    gcp_bucket = get_gcp_bucket(
        storage_client=gcp_storage_client,
        bucket_name=gcp_bucket_name,
    )

# 2. Troll Tweet CSV Files

In [6]:
# Merge all troll tweet CSV files into one dataframe and save it as a parquet snapshot.
# note this cell requires package `pyarrow` to be installed in environment
csv_filename: str = "csv_snapshot.parquet.snappy"
csv_path: str = f"{snapshot_paths['csv_snapshot']}{csv_filename}"

if (local_or_cloud == "local"):
    # load list of filenames/paths
    csv_file_list = get_csv_files(data_paths["troll"])

    # merge
    csv_df: pd.DataFrame = merge_csv_files(csv_file_list)

    # save result to a new file
    csv_df.to_parquet(csv_path, engine='pyarrow', index=False)

elif (local_or_cloud == "cloud"):
    # load list of object names under the troll-data prefix in the bucket
    csv_file_list = list_gcp_objects(storage_client=gcp_storage_client, bucket_name=gcp_bucket_name, obj_prefix=data_paths["troll"])

    # merge
    csv_df: pd.DataFrame = merge_gcp_csv_files(gcp_bucket, csv_file_list)

    # save result to a new object in the bucket
    csv_blob: storage.Blob = gcp_bucket.blob(csv_path)

    # pandas does not close a file handle supplied by the caller, and the blob
    #   writer only finalizes its upload when closed, so use a context manager
    #   to guarantee the handle is closed (even if `to_parquet` raises)
    with csv_blob.open("wb") as csv_blob_file:
        csv_df.to_parquet(csv_blob_file, engine='pyarrow', index=False)

# 3. Twitter API JSON Files

> TODO - explain difference between `verified_user` / `verified_random`

In [7]:
# load JSON files to memory - nonrandom verified users
#   (one data path per group; results from every group are pooled into one list)
json_groups_nonrandom = ['govt_entities', 'individuals', 'news_orgs']
json_data_nonrandom = []

for this_group in json_groups_nonrandom:
    if local_or_cloud == "cloud":
        # list the group's objects in the bucket, then merge them
        this_obj_list = list_gcp_objects(
            storage_client=gcp_storage_client,
            bucket_name=gcp_bucket_name,
            obj_prefix=data_paths[this_group],
        )
        this_json_data = merge_gcp_json_files(gcp_bucket, this_obj_list)
    elif local_or_cloud == "local":
        # list the group's files on disk, then merge them
        this_file_list = get_json_files(data_paths[this_group])
        this_json_data = merge_json_files(this_file_list)

    json_data_nonrandom += this_json_data

# every tweet loaded from these groups gets the same data_source label
for tweet in json_data_nonrandom:
    tweet['data_source'] = "verified_user"

In [8]:
# load JSON files to memory - random verified users
#   (kept as a list of groups so the loop mirrors the nonrandom cell)
json_groups_random = ['random']
json_data_random = []

for this_group in json_groups_random:
    if local_or_cloud == "cloud":
        # list the group's objects in the bucket, then merge them
        this_obj_list = list_gcp_objects(
            storage_client=gcp_storage_client,
            bucket_name=gcp_bucket_name,
            obj_prefix=data_paths[this_group],
        )
        this_json_data = merge_gcp_json_files(gcp_bucket, this_obj_list)
    elif local_or_cloud == "local":
        # list the group's files on disk, then merge them
        this_file_list = get_json_files(data_paths[this_group])
        this_json_data = merge_json_files(this_file_list)

    json_data_random += this_json_data

# every tweet loaded from this group gets the same data_source label
for tweet in json_data_random:
    tweet['data_source'] = "verified_random"

In [9]:
# both lists now carry their `data_source` labels, so combine them
#   (nonrandom tweets first, then random) into one new list
json_data = json_data_nonrandom + json_data_random

# flatten the records into a pandas dataframe
#   (intermediate step to get to parquet easily)
json_df = pd.json_normalize(json_data)

In [10]:
# note this cell requires package `pyarrow` to be installed in environment
# write the merged JSON dataframe out as a parquet snapshot
json_filename: str = "json_snapshot.parquet.snappy"
json_path: str = f"{snapshot_paths['json_snapshot']}{json_filename}"

if local_or_cloud == "cloud":
    # snapshot goes to the bucket as a new object
    set_gcp_object_from_df_as_parq(
        bucket=gcp_bucket,
        object_name=json_path,
        df=json_df,
    )
elif local_or_cloud == "local":
    # snapshot goes to a local file
    json_df.to_parquet(json_path, engine="pyarrow", index=False)