In [127]:
import json
import pandas as pd
from twarc.expansions import flatten

def json_to_pandas(file: object) -> pd.DataFrame:
    """
    Converts a json file output by the Twarc search command line method into a pandas dataframe.

    The file is read line by line: every non-blank line is parsed with
    ``json.loads``, passed through twarc's ``flatten`` and turned into its own
    dataframe; the per-line dataframes are then concatenated.

    :param file: Filepath of file outputted by Twarc
    :return: pd.Dataframe A combined dataframe of all tweets with a fresh
        0..n-1 index. An empty dataframe is returned when the file holds no
        non-blank lines.
    """
    def flatten_json(json_object):
        # Read as UTF-8 explicitly rather than relying on the platform default encoding.
        with open(json_object, "r", encoding="utf-8") as infile:
            # Blank lines (e.g. a trailing newline) are skipped; json.loads rejects them.
            return [flatten(json.loads(line)) for line in infile if line.strip()]

    frames = [pd.DataFrame.from_dict(page) for page in flatten_json(file)]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()
    return pd.concat(frames).reset_index(drop=True)

In [None]:
def dict_to_cols(df, col, prefix=""):
    """
    Expand a column of dictionaries into separate, prefixed columns.

    :param df: Dataframe holding the nested column; it is not modified
    :param col: Name of the column whose values are dictionaries
    :param prefix: String prepended to every generated column name
    :return: A new dataframe without ``col`` and with the expanded columns
        appended on the right, keeping the index of ``df``
    """
    # Entries that are not dictionaries (e.g. NaN for a missing value) become
    # empty records, so they yield a row of missing values.
    records = [value if isinstance(value, dict) else {} for value in df[col]]
    expanded = pd.json_normalize(records).add_prefix(prefix)
    # json_normalize numbers the rows 0..n-1; reuse the caller's index so the
    # column-wise concat pairs rows by position instead of by label.
    expanded.index = df.index
    return pd.concat([df.drop(columns=[col]), expanded], axis=1)

In [152]:
def transform_data(path_):
    """
    Load a Twarc output file and expand its nested columns into flat ones.

    The ``context_annotations`` column is dropped, ``referenced_tweets`` is
    reduced to its first entry, and every column listed in ``nested_cols`` is
    replaced by prefixed columns built from its dictionaries. Columns that are
    absent from the loaded file are skipped instead of raising ``KeyError``.

    :param path_: Filepath of a file outputted by Twarc
    :return: The flattened dataframe
    """
    nested_cols = [
        {'col': 'attachments', 'prefix': 'attachment_'},
        {'col': 'entities', 'prefix': 'entities_'},
        {'col': 'public_metrics', 'prefix': 'public_metrics_'},
        {'col': 'author', 'prefix': 'author_'},
        {'col': '__twarc', 'prefix': '__twarc_'},
        {'col': 'in_reply_to_user', 'prefix': 'in_reply_to_user_'},
        {'col': 'referenced_tweets', 'prefix': 'referenced_tweets_'}
    ]

    def first_or_self(value):
        # A list is reduced to its first element (None when empty); any other
        # value is passed through unchanged.
        if isinstance(value, list):
            return value[0] if value else None
        return value

    df = json_to_pandas(path_)
    if 'referenced_tweets' in df.columns:
        # Only the first referenced tweet is kept; further entries are discarded.
        df['referenced_tweets'] = df['referenced_tweets'].apply(first_or_self)
    df = df.drop(columns=['context_annotations'], errors='ignore')
    for col in nested_cols:
        # Skip nested columns that this particular file does not contain.
        if col['col'] in df.columns:
            df = dict_to_cols(df, col['col'], col['prefix'])
    return df

In [157]:
import os
TWEETS_DIR = 'data/tweets_m'

dfs = []
for file in os.listdir(TWEETS_DIR):
    # Build the path once so the file that is printed is the file that is loaded
    # (previously the files were listed in tweets_m but read from ./data/).
    tweet_path = os.path.join(TWEETS_DIR, file)
    print(tweet_path)
    dfs.append(transform_data(tweet_path))

data/tweets_m/stream.jsonl
data/tweets_m/green.json
data/tweets_m/steam2.jsonl
data/tweets_m/sustainable.json
data/tweets_m/stream1.jsonl
data/tweets_m/climate.json


In [159]:
# Stack the per-file dataframes for display. Each one carries its own 0-based
# index, so index labels repeat across files (pass ignore_index=True to
# pd.concat if a unique index is needed).
pd.concat(dfs)

Unnamed: 0,author_id,conversation_id,created_at,geo,id,in_reply_to_user_id,lang,possibly_sensitive,reply_settings,source,...,referenced_tweets_geo.full_name,referenced_tweets_geo.geo.type,referenced_tweets_geo.geo.bbox,referenced_tweets_geo.id,referenced_tweets_geo.name,referenced_tweets_geo.place_type,referenced_tweets_attachments.poll_ids,referenced_tweets_withheld.copyright,referenced_tweets_withheld.country_codes,referenced_tweets_author.withheld.country_codes
0,453228928,1493696019955167235,2022-02-16T13:48:29.000Z,{},1493945378919653379,1398357037545820167,en,False,everyone,Twitter for Android,...,,,,,,,,,,
1,133814603,1493945376453402627,2022-02-16T13:48:29.000Z,{},1493945381956329478,133814603,en,False,everyone,Twitter Web App,...,,,,,,,,,,
2,1235695675175821312,1493945383529025544,2022-02-16T13:48:30.000Z,{},1493945383529025544,,en,False,everyone,SocialRabbit Plugin,...,,,,,,,,,,
3,21150492,1493918236835995652,2022-02-16T13:48:30.000Z,{},1493945386012233730,21150492,en,False,everyone,Twitter Web App,...,,,,,,,,,,
4,1416815310217359365,1493490191767830531,2022-02-16T13:48:32.000Z,{},1493945391833915394,1404944124906770432,en,False,everyone,Twitter for Android,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8261,152459817,1493962976273047558,2022-02-16T14:58:24.000Z,"{'place_id': '2635a5d0aa51ce86', 'country': 'U...",1493962976273047558,,en,False,everyone,Twitter for iPhone,...,,,,,,,,,,
8262,1275156115051950081,1493962978802163714,2022-02-16T14:58:25.000Z,{},1493962978802163714,,en,False,everyone,Twitter for Android,...,,,,,,,,,,
8263,6534322,1493962973076992000,2022-02-16T14:58:24.000Z,{},1493962977308983302,6534322,en,False,everyone,Twitter Web App,...,,,,,,,,,,
8264,906977492027809792,1493962977866924037,2022-02-16T14:58:25.000Z,{},1493962977866924037,,en,False,everyone,Twitter for Android,...,,,,,,,,,,
