# Tweet Turing Test: Detecting Disinformation on Twitter  

|          | Group #2 - Disinformation Detectors                     |
|---------:|---------------------------------------------------------|
| Members  | John Johnson, Katy Matulay, Justin Minnion, Jared Rubin |
| Notebook | `01_merge.ipynb`                                        |
| Purpose  | Merge together acquired data into a common format.      |

Our acquired data is stored in the following directory structure:  
- `/` (root project directory)
    - `/data`
        - `/data/raw`
            - `/data/raw/tweets-govt_entities`
            - `/data/raw/tweets-individuals`
            - `/data/raw/tweets-news_orgs`
            - `/data/raw/tweets-random`
        - `/data/processed`
            - `/data/processed/...`

> (TODO - write more explaining notebook)

# 1. Setup

In [1]:
# imports from Python standard library
import json
import logging
import os

# imports requiring installation
#   connection to Google Cloud Storage
from google.cloud import storage            # pip install google-cloud-storage
from google.oauth2 import service_account   # pip install google-auth

#  data science packages
import demoji                               # pip install demoji
import numpy as np                          # pip install numpy
import pandas as pd                         # pip install pandas

In [2]:
# imports from tweet_turing.py
from tweet_turing import get_json_files, merge_json_files, get_csv_files, merge_csv_files, \
    get_gcp_storage_client, get_gcp_bucket, list_gcp_objects, get_gcp_object_as_json, \
    get_gcp_object_as_text, merge_gcp_json_files

In [3]:
# setup location of data files
#   every directory / prefix below ends with "/" because the snapshot cells
#   build full paths by plain string concatenation with a filename

#   files when loading local data (relative to the notebook's working directory)
local_data_paths: dict[str, str] = {
    "govt_entities": "../data/raw/tweets-govt_entities/",
    "individuals": "../data/raw/tweets-individuals/",
    "news_orgs": "../data/raw/tweets-news_orgs/",
    "random": "../data/raw/tweets-random/",
    "troll": "../data/raw/tweets-troll/",   # trailing "/" added to match the other entries
}

#   where the merged snapshot files are written when working locally
local_snapshot_paths: dict[str, str] = {
    "json_snapshot": "../data/snapshot/",
    "csv_snapshot": "../data/snapshot/",
}

#   files when loading data from GCP bucket (used as object-name prefixes)
gcp_data_paths: dict[str, str] = {
    "govt_entities": "raw/govt/",
    "individuals": "raw/individuals/",
    "news_orgs": "raw/news/",
    "random": "raw/random/",
    "troll": "raw/troll/",
}

#   where the merged snapshot files belong inside the GCP bucket
gcp_snapshot_paths: dict[str, str] = {
    "json_snapshot": "snapshot/",
    "csv_snapshot": "snapshot/",
}

#   GCP connection settings, passed to the get_gcp_* helpers
gcp_project_name: str = "ds-capstone-jmmr"
gcp_bucket_name: str = "disinfo-detector-tweet-turing-test"
gcp_key_file: str = "../key/service_acct_key.json"

## Local or Cloud?
Decide here whether to run notebook with local data or GCP bucket data
 - if the working directory of this notebook has a "../data/" folder with data loaded (e.g. working on local computer or have data files loaded to a cloud VM) then use the "local files" option and comment out the "gcp bucket files" option
 - if this notebook is being run from a GCP VM (preferably in the `us-central1` location) then use the "gcp bucket files" option and comment out the "local files" option

In [4]:
# Choose the data source for this run: keep exactly one of the two
# assignments below active and leave the other one commented out.
local_or_cloud: str = "local"     # option: local files
#local_or_cloud: str = "cloud"    # option: gcp bucket files

# Nothing below this point in the cell needs to be edited.
# Reject unsupported values up front; subsequent cells only branch on
# "local" / "cloud" and do not repeat this validation.
if local_or_cloud not in ("local", "cloud"):
    raise ValueError("Variable 'local_or_cloud' can only take on one of two values, 'local' or 'cloud'.")

# Bind the generic names used by the rest of the notebook to the chosen set.
data_paths: dict[str, str] = local_data_paths if local_or_cloud == "local" else gcp_data_paths
snapshot_paths: dict[str, str] = local_snapshot_paths if local_or_cloud == "local" else gcp_snapshot_paths

In [None]:
# The GCP handles are only created when local_or_cloud == "cloud";
# in local mode both names are defined but stay None.
gcp_storage_client: storage.Client = None
gcp_bucket: storage.Bucket = None

if local_or_cloud == "cloud":
    # client first, since the bucket lookup is given the client
    gcp_storage_client = get_gcp_storage_client(
        project_name=gcp_project_name,
        key_file=gcp_key_file,
    )
    gcp_bucket = get_gcp_bucket(
        storage_client=gcp_storage_client,
        bucket_name=gcp_bucket_name,
    )

# 2. Troll Tweet CSV Files

In [5]:
# Merge the troll tweet CSV files into a single DataFrame and save it as one
# snapshot CSV file.
csv_filename: str = "csv_snapshot.csv"

if (local_or_cloud == "local"):
    # make sure the snapshot directory exists *before* the (slow) merge, so a
    #   missing directory cannot make the final write fail
    os.makedirs(snapshot_paths['csv_snapshot'], exist_ok=True)

    # load list of filenames/paths
    csv_file_list: list[str] = get_csv_files(data_paths["troll"])

    # merge
    csv_df: pd.DataFrame = merge_csv_files(csv_file_list)

    # save result to a new file
    #   resource warning: file size is ~1.08 GB
    csv_path: str = f"{snapshot_paths['csv_snapshot']}{csv_filename}"
    csv_df.to_csv(csv_path, encoding='utf-8')

elif (local_or_cloud == "cloud"):
    pass    # TODO

  new_df: pd.DataFrame = pd.read_csv(file, encoding='utf-8')
  new_df: pd.DataFrame = pd.read_csv(file, encoding='utf-8')
  new_df: pd.DataFrame = pd.read_csv(file, encoding='utf-8')


# 3. Twitter API JSON Files

> TODO - explain difference between `verified_user` / `verified_random`

In [6]:
# Gather the tweets of the nonrandom verified-user groups into one list and
# tag each tweet with data_source = "verified_user".
json_groups_nonrandom: list[str] = ['govt_entities', 'individuals', 'news_orgs']
json_data_nonrandom: list[dict] = []

for group_name in json_groups_nonrandom:
    if local_or_cloud == "cloud":
        # list the bucket objects under this group's prefix, then merge them
        group_objects: list[str] = list_gcp_objects(
            storage_client=gcp_storage_client,
            bucket_name=gcp_bucket_name,
            obj_prefix=data_paths[group_name],
        )
        group_tweets: list[dict] = merge_gcp_json_files(gcp_bucket, group_objects)
    elif local_or_cloud == "local":
        # find the JSON files in this group's directory, then merge them
        group_files: list[str] = get_json_files(data_paths[group_name])
        group_tweets: list[dict] = merge_json_files(group_files)

    json_data_nonrandom += group_tweets

# every tweet collected above came from one of the nonrandom groups
for record in json_data_nonrandom:
    record['data_source'] = "verified_user"

In [26]:
# Gather the tweets of the random verified-user group(s) into one list and
# tag each tweet with data_source = "verified_random".
json_groups_random: list[str] = ['random']
json_data_random: list[dict] = []

for group_name in json_groups_random:
    if local_or_cloud == "cloud":
        # list the bucket objects under this group's prefix, then merge them
        group_objects: list[str] = list_gcp_objects(
            storage_client=gcp_storage_client,
            bucket_name=gcp_bucket_name,
            obj_prefix=data_paths[group_name],
        )
        group_tweets: list[dict] = merge_gcp_json_files(gcp_bucket, group_objects)
    elif local_or_cloud == "local":
        # find the JSON files in this group's directory, then merge them
        group_files: list[str] = get_json_files(data_paths[group_name])
        group_tweets: list[dict] = merge_json_files(group_files)

    json_data_random += group_tweets

# every tweet collected above came from a random group
for record in json_data_random:
    record['data_source'] = "verified_random"

In [27]:
# Both lists now carry their `data_source` label, so combine them into a
# single new list: nonrandom tweets first, random tweets after.
json_data = [*json_data_nonrandom, *json_data_random]

In [28]:
# save the merged, labelled JSON data to a single snapshot file
#   resource warning: file size is ~3.67 GB
json_filename: str = "json_snapshot.json"

if (local_or_cloud == "local"):
    # create the snapshot directory if it is missing; open(..., "w") does not
    #   create parent directories and would otherwise raise FileNotFoundError
    os.makedirs(snapshot_paths['json_snapshot'], exist_ok=True)

    json_path: str = f"{snapshot_paths['json_snapshot']}{json_filename}"

    with open(file=json_path, mode="w", encoding='utf-8') as json_fh:
        json.dump(json_data, json_fh)
elif (local_or_cloud == "cloud"):
    pass    # TODO