# Setting up the code

The following cell defines the functions used in this notebook.  You could put them in a file, or use them right here.

In [6]:
import pandas as pd
import json, glob, re, os, time


# Helper functions for converting the collected JSON files to CSV
def convert_directory(input_dir,output_dir):
    """Convert every ``*.json`` file directly inside ``input_dir`` to CSV.

    Each matching file is handed to ``parse_chunked_file`` together with
    ``output_dir``.  Progress and the elapsed CPU time are printed.

    Parameters
    ----------
    input_dir : str
        Directory searched (non-recursively) for files ending in ``.json``.
    output_dir : str
        Directory passed on to the parser; created here if it is missing.

    Returns
    -------
    None
    """
    # Make sure the output directory will work.  exist_ok=True already covers
    # the "directory is already there" case, so an OSError here is a genuine
    # problem and is reported instead of being silently swallowed.
    try:
        os.makedirs(output_dir,exist_ok=True)
    except OSError as err:
        print("Could not create {}: {}".format(output_dir,err))

    pattern = "{}/*.json".format(input_dir)
    files_to_process = glob.glob(pattern)
    print("{} files to process...".format(len(files_to_process)))
    t = time.process_time()  # CPU time of this process, not wall-clock time
    for f in files_to_process:
        # Previously this called `parse_file`, a name that is not defined
        # among these helpers (NameError on the first file).
        parse_chunked_file(f,output_dir)
    print("Done: {}".format(time.process_time() - t))
    
    
def parse_chunked_file(infile,output_dir):
    """Convert a file holding several back-to-back JSON objects into one CSV.

    The file is split on the literal text ``{"data":``.  From every piece the
    text up to and including its last ``]`` is parsed with pandas, and the
    resulting frames are concatenated and written to
    ``<output_dir>/<name>.csv``, where ``<name>`` is the input file name up
    to its first ``.json``.

    Parameters
    ----------
    infile : str
        Path of the JSON file to read.
    output_dir : str
        Existing directory that receives the CSV file.

    Returns
    -------
    None
        Nothing is written when the file contains no parsable chunk.
    """
    from io import StringIO  # local import; wraps each chunk for pd.read_json

    # Just get the original name (no extension)
    name = os.path.basename(infile).split(".json")[0]
    # os.path.join keeps absolute output directories intact; the previous
    # "./{}/{}.csv" template turned "/abs/out" into the relative "./abs/out".
    output = os.path.join(output_dir,"{}.csv".format(name))
    print("Processing {} -> {}".format(infile,output))
    with open(infile,"r") as f:
        data=f.read()

    # I noticed that the individual files have multiple json objects,
    # which causes a json parser to break.  So, I'm doing a little manual
    # cleaning before trying to parse the json
    frames = []
    for piece in data.split('{"data":'):
        # Lift out the list element following the "data" key in the top
        # level dictionary: everything up to and including the last "]".
        # rfind returns -1 when there is no "]", which makes `end` 0 and the
        # chunk empty.
        end = piece.rfind("]")+1
        chunk = piece[:end].strip()
        if not chunk:
            # Text before the first marker, or a piece without any list.
            continue

        # Use pandas to read the json.  The chunk is wrapped in StringIO
        # because newer pandas deprecates passing literal JSON strings.
        frames.append(pd.read_json(StringIO(chunk)))

    if not frames:
        # pd.concat raises on an empty list; report the file and move on so
        # one empty file does not abort a whole directory conversion.
        print("No data found in {}".format(infile))
        return

    # Use pandas to write the csv out
    pd.concat(frames).to_csv(output)


# This is a separate function for converting just one file that is a single well-formed JSON object
def parse_proper_file(infile,output_dir):
    """Convert one well-formed JSON file to CSV and return the DataFrame.

    The file must hold a single JSON object with a ``"data"`` key.  The value
    under that key is loaded into a DataFrame.  When a ``tweet_created_at``
    column is present it is moved to the front and the rows are sorted by it.
    The frame is written to ``<output_dir>/<name>.csv``, where ``<name>`` is
    the input file name up to its first ``.json``.

    Parameters
    ----------
    infile : str
        Path of the JSON file to read.
    output_dir : str
        Directory that receives the CSV file; created if it is missing.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to disk.

    Raises
    ------
    KeyError
        If the JSON object has no ``"data"`` key.
    """
    from io import StringIO  # local import; wraps the JSON text for pd.read_json

    # Just get the original name (no extension)
    name = os.path.basename(infile).split(".json")[0]
    # os.path.join keeps absolute output directories intact; the previous
    # "./{}/{}.csv" template turned "/abs/out" into the relative "./abs/out".
    output = os.path.join(output_dir,"{}.csv".format(name))
    # exist_ok=True already covers an existing directory, so an OSError here
    # is a genuine problem and is reported instead of being silently ignored.
    try:
        os.makedirs(output_dir,exist_ok=True)
    except OSError as err:
        print("Could not create {}: {}".format(output_dir,err))
    with open(infile,"r") as f:
        data=json.load(f)
    print(data.keys())
    # Round-trip through JSON text so pandas applies its own read_json type
    # inference; StringIO avoids the deprecated literal-string input.
    df = pd.read_json(StringIO(json.dumps(data['data'])))
    if "tweet_created_at" in df.columns:
        # Put the timestamp first and order the rows chronologically.  The
        # guard keeps empty or differently-shaped data from raising ValueError
        # in list.remove.
        colnames = df.columns.to_list()
        colnames.remove("tweet_created_at")
        colnames.insert(0,"tweet_created_at")
        df = df.reindex(columns = colnames)
        df = df.sort_values("tweet_created_at")
    df.to_csv(output)
    return df
    
    



# Usage example

Put all the JSON files you want to convert in the first directory (the input).  The function will take care of creating the second directory (the output) if it doesn't already exist.

In [3]:
# Convert every *.json file found in ../../data, using ./output as the output directory.
convert_directory("../../data","./output")

4 files to process...
Processing ../../data/2234952261.json -> ././output/2234952261.csv
Processing ../../data/2249123298.json -> ././output/2249123298.csv
Processing ../../data/2238431106.json -> ././output/2238431106.csv
Processing ../../data/2246392007.json -> ././output/2246392007.csv
Done: 1.0567970000000002


In [4]:
# Display the pandas version in use.
pd.__version__

'1.0.0'

In [5]:
# List what is currently in the input directory.
os.listdir("../../data")

['2234952261.json',
 '2249123298.json',
 '2238431106.json',
 '1280942408.json',
 '2246392007.json']

In [202]:
# Convert a single well-formed JSON file; the returned DataFrame is kept in `x`
# and shown as the cell output.
x = parse_proper_file("./tweetjson006.json","./output")
x

dict_keys(['data', 'user', 'ref_tweet', 'errors'])


Unnamed: 0,tweet_created_at,tweet_id,tweet_text,tweet_author_id,tweet_conversation_id,tweet_entities,tweet_public_metrics,tweet_referenced_tweets_id,tweet_referenced_tweets_type,tweet_in_reply_to_user_id
42864,2019-04-05 19:26:26+00:00,1114247944058867713,This is my first tweet. Don't use twitter much...,1011008340367953920,1114247944058867713,"{'hashtags': [{'start': 90, 'end': 97, 'tag': ...","{'retweet_count': 0, 'reply_count': 1, 'like_c...",,,
42863,2019-04-26 16:09:51+00:00,1121808620428120066,Second tweet all time - you should buy my book...,1011008340367953920,1121808620428120066,"{'urls': [{'start': 48, 'end': 71, 'url': 'htt...","{'retweet_count': 2, 'reply_count': 0, 'like_c...",,,
42862,2019-05-24 23:27:05+00:00,1132065512429506561,really enjoyed this convo with @GlennLoury - h...,1011008340367953920,1132065512429506561,"{'mentions': [{'start': 31, 'end': 42, 'userna...","{'retweet_count': 1, 'reply_count': 1, 'like_c...",,,
42861,2019-05-24 23:29:14+00:00,1132066052488278016,Kudos to @MrAndyNgo for this share of my work ...,1011008340367953920,1132066052488278016,"{'mentions': [{'start': 9, 'end': 19, 'usernam...","{'retweet_count': 1, 'reply_count': 0, 'like_c...",1099797854149795840,quoted,
42860,2019-05-24 23:32:49+00:00,1132066953315672064,"The atheists LOVE ME, SON!!! Good discussion o...",1011008340367953920,1132066953315672064,"{'mentions': [{'start': 101, 'end': 115, 'user...","{'retweet_count': 1, 'reply_count': 0, 'like_c...",,,
...,...,...,...,...,...,...,...,...,...,...
4,2022-02-16 20:55:03+00:00,1494052729576120321,"Snipe hunt, like I always do on the seventy-le...",1011008340367953920,1494052729576120321,"{'urls': [{'start': 62, 'end': 85, 'url': 'htt...","{'retweet_count': 1, 'reply_count': 2, 'like_c...",1494015142463479811,quoted,
3,2022-02-16 21:30:20+00:00,1494061609286881281,Only one caveat: you can sub any heterodox gro...,1011008340367953920,1494061609286881281,"{'urls': [{'start': 190, 'end': 213, 'url': 'h...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",1494056710515867650,quoted,
2,2022-02-16 22:33:29+00:00,1494077501336866818,Never do an enemy a small injury. https://t.co...,1011008340367953920,1494077501336866818,"{'urls': [{'start': 34, 'end': 57, 'url': 'htt...","{'retweet_count': 2, 'reply_count': 1, 'like_c...",1494019836502056961,quoted,
1,2022-02-16 22:40:23+00:00,1494079237912940550,"""If you fight me, I will turn your nation into...",1011008340367953920,1494079237912940550,,"{'retweet_count': 3, 'reply_count': 9, 'like_c...",,,


# Newer file format

Since May 22, files have been collected as well-formed JSON documents.  These are easiest to parse with pandas, although some unpacking will need to happen at some point.

In [22]:
import pandas as pd
# Load one newer-format file straight into a DataFrame.  With its default
# compression="infer", read_json decompresses based on the ".gz" extension.
df = pd.read_json("../../../shared_data/47339454_CNN.json.gz")

In [23]:
# Show which columns the loaded frame has.
df.columns

Index(['author_id', 'text', 'created_at', 'entities', 'public_metrics',
       'conversation_id', 'id', 'referenced_tweets', 'in_reply_to_user_id'],
      dtype='object')

In [24]:
# Write a four-column subset to CSV; index=False leaves the row index out of the file.
df[['id','text','created_at','author_id']].to_csv("../../../shared_data/47339454_CNN.csv",index=False)

In [26]:
# Rebinds `df`, replacing the frame that was loaded from the JSON file.
# NOTE(review): this reads 47339454_CNN.csv.gz, while the export cell writes
# 47339454_CNN.csv. Confirm where the gzipped copy comes from; on a fresh
# run it may not exist.
df = pd.read_csv("../../../shared_data/47339454_CNN.csv.gz")


In [27]:
# Display the frame reloaded from CSV.
df

Unnamed: 0,id,text,created_at,author_id
0,1516446792346484736,"On page 135 of 467 of Wundersmith, by Jessica ...",2022-04-19 16:01:04+00:00,47339454.0
1,1516230968440410112,RT @LamaSuryaDas: Be Buddha now. Practice bein...,2022-04-19 01:43:27+00:00,47339454.0
2,1516007170306940928,3 of 5 stars to Aprende Línguas com o Peter by...,2022-04-18 10:54:10+00:00,47339454.0
3,1515998492136726528,4 of 5 stars to Morte na Aldeia by Caroline Gr...,2022-04-18 10:19:41+00:00,47339454.0
4,1515671811219992576,RT @TaylorRMarshall: Jesus Christ became a bab...,2022-04-17 12:41:34+00:00,47339454.0
...,...,...,...,...
11993,2205424680,"Klustout, I congratulate you on your work, whi...",2009-06-17 11:42:34+00:00,47339454.0
11994,2205338701,Just noticed Robin Sharma's sentence from last...,2009-06-17 11:31:13+00:00,47339454.0
11995,2204827239,I am getting inspired by Robin Sharma's senten...,2009-06-17 10:14:08+00:00,47339454.0
11996,2193774003,I am so new to this twittery world that I stil...,2009-06-16 15:54:48+00:00,47339454.0
