<a href="https://colab.research.google.com/github/fromakim/2021Election_Analysis/blob/main/data_prepreocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Dev Environment

In [1]:
!pip install flatten_json



In [2]:
from google.colab import drive

import itertools
import json
import requests
from urllib import parse
from datetime import datetime, timedelta

In [3]:
import numpy as np
import pandas as pd

In [4]:
from flatten_json import flatten

In [5]:
# Mount Google Drive at /content/gdrive.
# NOTE(review): the load and save cells use the relative prefix ./gdrive/MyDrive/...,
# which points at this mount only while the working directory is /content — confirm.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# 2. Import Data

In [6]:
# Accumulator for the parsed JSON payloads; starts empty and is filled by the
# file-loading loop that follows.
data = list()

In [7]:
# Probe data00.json .. data99.json and append every payload that parses to `data`.
# A missing file is skipped silently (presumably not every index exists); any other
# failure, such as malformed JSON or an unreadable file, is reported and skipped
# so the loop still never raises.
# NOTE: the path is relative and therefore resolves against the current working directory.
# NOTE: this appends to the existing `data` list; re-run the `data = []` cell before
# re-running this one, otherwise every file is loaded a second time.
for i in range(100):
    json_path = f'./gdrive/MyDrive/Colab Notebooks/election_sample/data{i:02}.json'
    try:
        # `with` closes the file handle even when json.load raises.
        with open(json_path, encoding = 'utf-8') as f:
            data.append(json.load(f))
    except FileNotFoundError:
        # Expected for indices that have no file.
        continue
    except Exception as e:
        # Keep the best-effort behaviour, but make the failure visible.
        print(f'Skipping {json_path}: {e!r}')

# 3. Organize Users, Media, and Tweets

In [8]:
# One flattened row per entry of includes.users, gathered across every loaded payload.
# The root keys 'public_metrics' and 'entities' are excluded from the flattened rows.
users = pd.concat(
    [
        pd.DataFrame(
            [
                flatten(user, root_keys_to_ignore = {'public_metrics', 'entities'})
                for user in payload['includes']['users']
            ]
        )
        for payload in data
    ]
).reset_index(drop = True)

In [9]:
# One flattened row per entry of includes.media, gathered across every loaded payload.
# A payload whose 'includes' has no 'media' key contributes an empty frame.
medias = pd.concat(
    [
        pd.DataFrame()
        if 'media' not in payload['includes']
        else pd.DataFrame([flatten(media) for media in payload['includes']['media']])
        for payload in data
    ]
).reset_index(drop = True)

In [10]:
# One flattened row per entry of includes.tweets, gathered across every loaded payload.
# The root keys listed in the set are excluded from the flattened rows.
tweets = pd.concat(
    [
        pd.DataFrame(
            [
                flatten(
                    tweet,
                    root_keys_to_ignore = {
                        'referenced_tweets',
                        'public_metrics',
                        'in_reply_to_user_id',
                        'attachments',
                        'entities',
                        'context_annotations',
                    },
                )
                for tweet in payload['includes']['tweets']
            ]
        )
        for payload in data
    ]
).reset_index(drop = True)

# 4. Organize Overall Relations

In [11]:
# Normalise includes.users of every payload into dotted columns, stack the results,
# and keep only the user id plus the entity columns that are unpacked further down.
user_relations = (
    pd.concat([pd.json_normalize(payload['includes']['users']) for payload in data])
    .reset_index(drop = True)
    [
        [
            'id',
            'entities.url.urls',
            'entities.description.urls',
            'entities.description.mentions',
            'entities.description.hashtags',
        ]
    ]
)

In [12]:
# Normalise includes.tweets of every payload into dotted columns, stack the results,
# and keep only the three relation columns.
tweet_relations = (
    pd.concat([pd.json_normalize(payload['includes']['tweets']) for payload in data])
    [['referenced_tweets', 'entities.urls', 'in_reply_to_user_id']]
    .reset_index(drop = True)
)

In [13]:
# Display the columns kept in user_relations.
# Reference for the user object fields:
# https://developer.twitter.com/en/docs/twitter-api/users/lookup/api-reference/get-users
user_relations.columns

Index(['id', 'entities.url.urls', 'entities.description.urls',
       'entities.description.mentions', 'entities.description.hashtags'],
      dtype='object')

In [14]:
# Display the columns kept in tweet_relations.
# Reference for the tweet object fields:
# https://developer.twitter.com/en/docs/twitter-api/tweets/lookup/api-reference/get-tweets
tweet_relations.columns

Index(['referenced_tweets', 'entities.urls', 'in_reply_to_user_id'], dtype='object')

# 5. Organize User Relations

In [15]:
# URLs from the user's `entities.url.urls` list, one row per list element.
# Rows without the field are dropped, each list element gets its own row, every
# element is expanded with pd.Series into columns next to the user's `id`
# (presumably the elements are dicts — confirm), and the raw column is removed.
relation_url_url = (
    user_relations[['id', 'entities.url.urls']]
    .dropna()
    .explode('entities.url.urls')
    .pipe(
        lambda exploded: pd.concat(
            [exploded, exploded['entities.url.urls'].apply(pd.Series)],
            axis = 1,
        )
    )
    .drop(columns = 'entities.url.urls')
    .reset_index(drop = True)
)

In [16]:
# URLs from the user's `entities.description.urls` list, one row per list element.
# Rows without the field are dropped, each list element gets its own row, every
# element is expanded with pd.Series into columns next to the user's `id`
# (presumably the elements are dicts — confirm), and the raw column is removed.
relation_url_description = (
    user_relations[['id', 'entities.description.urls']]
    .dropna()
    .explode('entities.description.urls')
    .pipe(
        lambda exploded: pd.concat(
            [exploded, exploded['entities.description.urls'].apply(pd.Series)],
            axis = 1,
        )
    )
    .drop(columns = 'entities.description.urls')
    .reset_index(drop = True)
)

In [17]:
# Stack the two URL tables (profile URL first, then description URLs) under a
# fresh 0..n-1 index.
# NOTE(review): none of the save cells visible in this notebook writes relation_url
# to disk — confirm whether that is intentional.
relation_url = pd.concat(
    [relation_url_url, relation_url_description],
    ignore_index = True,
)

In [18]:
# Mentions from the user's `entities.description.mentions` list, one row per list element.
# Rows without the field are dropped, each list element gets its own row, every
# element is expanded with pd.Series into columns next to the user's `id`
# (presumably the elements are dicts — confirm), and the raw column is removed.
relation_mention = (
    user_relations[['id', 'entities.description.mentions']]
    .dropna()
    .explode('entities.description.mentions')
    .pipe(
        lambda exploded: pd.concat(
            [exploded, exploded['entities.description.mentions'].apply(pd.Series)],
            axis = 1,
        )
    )
    .drop(columns = 'entities.description.mentions')
    .reset_index(drop = True)
)

In [19]:
# Hashtags from the user's `entities.description.hashtags` list, one row per list element.
# Rows without the field are dropped, each list element gets its own row, every
# element is expanded with pd.Series into columns next to the user's `id`
# (presumably the elements are dicts — confirm), and the raw column is removed.
relation_hashtag = (
    user_relations[['id', 'entities.description.hashtags']]
    .dropna()
    .explode('entities.description.hashtags')
    .pipe(
        lambda exploded: pd.concat(
            [exploded, exploded['entities.description.hashtags'].apply(pd.Series)],
            axis = 1,
        )
    )
    .drop(columns = 'entities.description.hashtags')
    .reset_index(drop = True)
)

# 6. Save Data Files

In [20]:
# Write the three entity tables to Drive, in the order users, medias, tweets.
# NOTE(review): the files are written by to_csv, so they contain CSV even though the
# paths end in .json. The names are left untouched because readers elsewhere may
# rely on them — confirm before renaming.
for _frame, _target in (
    (users, './gdrive/MyDrive/Colab Notebooks/election_sample/user.json'),
    (medias, './gdrive/MyDrive/Colab Notebooks/election_sample/media.json'),
    (tweets, './gdrive/MyDrive/Colab Notebooks/election_sample/tweet.json'),
):
    _frame.to_csv(_target)

In [21]:
# Write the mention table to Drive.
# NOTE(review): to_csv produces CSV content although the path ends in .json — confirm
# that downstream readers expect CSV here.
relation_mention.to_csv('./gdrive/MyDrive/Colab Notebooks/election_sample/mention.json')

In [22]:
# Write the hashtag table to Drive.
# NOTE(review): to_csv produces CSV content although the path ends in .json — confirm
# that downstream readers expect CSV here.
relation_hashtag.to_csv('./gdrive/MyDrive/Colab Notebooks/election_sample/hashtag.json')