In [1]:
import pandas as pd 
import numpy as np 
import os
import json
import re
from tqdm import tqdm
from pprint import pprint
import geopandas
import pyproj

In [2]:
# Turn on tqdm's pandas integration, using "Progress" as the bar description.
# NOTE(review): no progress_* call is visible in this notebook, so this setup
# may be unused — confirm before removing.
tqdm.pandas(desc="Progress")

In [3]:
# Expected input layout, relative to the working directory the notebook runs in:
#   data/parler_posts_json/  <- extracted contents of parler_posts_json.zip
#   data/metadata/           <- the metadata JSON files scanned by data_iterator
cwd = os.getcwd()
posts_dir = f'{cwd}/data/parler_posts_json/'
meta_dir = f'{cwd}/data/metadata/'

In [4]:
# Metadata keys that data_iterator copies from each JSON record into its output.
frames = ['GPSPosition']

In [5]:
def data_iterator(path, failed=None):
    """Yield one dict per metadata JSON file in ``path`` that has a wanted field.

    Every ``*.json`` file is loaded and its first top-level element is taken as
    the record (the files are expected to hold a list whose first item is a
    mapping of metadata tags). A record is yielded only when it contains at
    least one key listed in the module-level ``frames``.

    Args:
        path: Directory to scan. Files are opened from this directory (the
            previous version always opened them from ``meta_dir`` regardless
            of this argument).
        failed: Optional list. Names of files that could not be read or parsed
            are appended to it so the caller can inspect them afterwards.

    Yields:
        dict: ``'file'`` (the file name), each matching ``frames`` key, and
        ``'CreateDate'`` / ``'ModifyDate'`` when present and not the all-zero
        placeholder ``'0000:00:00 00:00:00'``.
    """
    for record in tqdm(os.listdir(path)):
        if not record.endswith('.json'):
            continue
        output = {'file': record}
        try:
            with open(os.path.join(path, record)) as f:
                data = json.load(f)[0]
            for frame in frames:
                if frame in data:
                    output[frame] = data[frame]
            # Dates are only worth collecting for records that will be yielded.
            if len(output) > 1:
                for date in ('CreateDate', 'ModifyDate'):
                    if date in data and data[date] != '0000:00:00 00:00:00':
                        output[date] = data[date]
        except Exception:
            # Best effort: report the file and move on. Catching Exception
            # (not a bare except) leaves KeyboardInterrupt and GeneratorExit
            # alone, so the scan can be interrupted and the generator closed.
            print(f'** File {record} could not be handled for some dumb reason')
            if failed is not None:
                failed.append(record)
            continue
        # Yield outside the try block so exceptions raised at the yield point
        # by the consumer are never mistaken for a bad file.
        if len(output) > 1:
            yield output
# Materialise every record yielded for meta_dir, then build the frame in one go.
metadata = pd.DataFrame(list(data_iterator(meta_dir)))
metadata

100%|██████████| 1032524/1032524 [05:41<00:00, 3021.13it/s]


Unnamed: 0,file,GPSPosition,CreateDate,ModifyDate
0,meta-27PknKIOwHt6.json,"34 deg 4' 20.64"" N, 118 deg 24' 9.36"" W",2020:10:31 23:02:29,2020:10:31 23:03:28
1,meta-m3Wq53jjPnpw.json,"34 deg 2' 50.28"" N, 118 deg 15' 35.64"" W",2020:12:21 23:22:36,2020:12:21 23:22:42
2,meta-kpKT3stt5LXq.json,"26 deg 20' 2.40"" N, 80 deg 17' 41.64"" W",2020:12:04 20:49:20,2020:12:04 20:49:35
3,meta-qgU5924Q8jQh.json,"33 deg 58' 57.72"" N, 118 deg 12' 33.84"" W",2020:11:04 07:29:46,2020:11:04 07:29:46
4,meta-XfeRIAoVLtei.json,"22 deg 21' 16.56"" S, 42 deg 20' 8.52"" W",2020:07:02 12:50:02,2020:07:02 12:50:24
...,...,...,...,...
68441,meta-JMfLmefCzvMW.json,"35 deg 12' 59.40"" S, 138 deg 32' 18.96"" E",2020:06:30 07:03:29,2020:06:30 07:04:01
68442,meta-33oluYHSIrDn.json,"33 deg 48' 50.04"" N, 117 deg 32' 43.08"" W",2020:07:17 09:44:45,2020:07:17 09:45:19
68443,meta-pMdvwJuktYPj.json,"38 deg 53' 26.52"" N, 77 deg 0' 20.88"" W",2020:12:12 19:53:48,2020:12:12 19:55:30
68444,meta-50A0Fl2Fcg89.json,"49 deg 10' 26.76"" N, 122 deg 40' 47.64"" W",2020:11:25 21:05:29,2020:11:25 21:06:29


In [6]:
# Pick the best timestamp per file: CreateDate, falling back to ModifyDate when
# CreateDate is missing or cannot be parsed. (Author's note: many other datetime
# fields were checked and none gave better data than these two.)
# Values that do not match the 'YYYY:MM:DD HH:MM:SS' layout are coerced to NaT.
clean_createdate = pd.to_datetime(metadata['CreateDate'], errors='coerce', format='%Y:%m:%d %H:%M:%S')
clean_modifydate = pd.to_datetime(metadata['ModifyDate'], errors='coerce', format='%Y:%m:%d %H:%M:%S')
# Build `dates` as its own frame rather than writing into a column selection of
# `metadata`, and use the value returned by fillna instead of an in-place fill
# through a column accessor, so the fallback never depends on whether that
# accessor happens to be a view or a copy.
dates = pd.DataFrame({
    'CreateDate': clean_createdate.fillna(clean_modifydate),
    'ModifyDate': clean_modifydate,
})
metadata['datetime'] = dates['CreateDate']

# Coordinates get converted to decimal latitude/longitude floats further down.
# GPSPosition is the source field: per the author's notes, out of all geotagged
# fields it always had the right data in the right format.
# Only rows that actually carry a GPSPosition are kept for the conversion.
geocoords = metadata[metadata.GPSPosition.notna()]
# Matches positions written like: 34 deg 4' 20.64" N, 118 deg 24' 9.36" W
# Groups: latitude degrees, minutes, seconds, hemisphere letter, then the same
# four for longitude.
long_geo = re.compile(r'(\d{1,3}) deg (\d{1,2})\' (\d{1,2}\.\d{0,2})\" (\w), (\d{1,3}) deg (\d{1,2})\' (\d{1,2}\.\d{0,2})\" (\w)')

def long_to_decimal(coord):
    """Convert a degrees/minutes/seconds position string to decimal degrees.

    Args:
        coord: Text in the layout accepted by ``long_geo``, for example
            ``34 deg 4' 20.64" N, 118 deg 24' 9.36" W``.

    Returns:
        A ``(lat, lon)`` tuple of floats, negative for S and W, or ``None``
        when the text does not match ``long_geo`` or uses a hemisphere letter
        other than N/S for latitude or E/W for longitude.
    """
    m = long_geo.match(coord)
    if not m:
        return None
    latD, latM, latS, latC, lonD, lonM, lonS, lonC = m.groups()

    latSign = {'N': 1, 'S': -1}.get(latC)
    lonSign = {'E': 1, 'W': -1}.get(lonC)
    if latSign is None:
        print(f'{latC} is the thing that should not be!')
    if lonSign is None:
        print(f'{lonC} is the thing that should not be!')
    if latSign is None or lonSign is None:
        # Previously this fell through to the return with an unbound sign
        # variable and crashed; treat it like any other unparseable position.
        return None

    lat = latSign * (int(latD) + int(latM) / 60 + float(latS) / 3600)
    lon = lonSign * (int(lonD) + int(lonM) / 60 + float(lonS) / 3600)
    return lat, lon

# Parse every GPSPosition string. long_to_decimal gives None for text it cannot
# convert, so keep only the rows that produced a (lat, lon) pair.
new_coords = geocoords.GPSPosition.map(long_to_decimal)
new_coords = new_coords[new_coords.notna()]
# Naming the two columns and fixing the dtype keeps the frame two float columns
# wide even when nothing parsed; rows absent from it end up as NaN in
# `metadata` through index alignment.
metadata[['lat', 'lon']] = pd.DataFrame(
    new_coords.tolist(),
    index=new_coords.index,
    columns=['lat', 'lon'],
    dtype=float,
)

# `file` comes back with the generic object dtype, so store a copy of it in a
# `filename` column that uses pandas' dedicated string dtype.
metadata['filename'] = metadata['file'].astype(dtype='string')
metadata.dtypes



file                   object
GPSPosition            object
CreateDate             object
ModifyDate             object
datetime       datetime64[ns]
lat                   float64
lon                   float64
filename               string
dtype: object

In [7]:
# Keep only the columns needed downstream and discard rows missing any of them.
# dropna() returns a new frame, so nothing is mutated in place on a column
# selection taken from `metadata`.
focus = metadata[['filename', 'datetime', 'lat', 'lon']].dropna()
focus

Unnamed: 0,filename,datetime,lat,lon
0,meta-27PknKIOwHt6.json,2020-10-31 23:02:29,34.0724,-118.4026
1,meta-m3Wq53jjPnpw.json,2020-12-21 23:22:36,34.0473,-118.2599
2,meta-kpKT3stt5LXq.json,2020-12-04 20:49:20,26.3340,-80.2949
3,meta-qgU5924Q8jQh.json,2020-11-04 07:29:46,33.9827,-118.2094
4,meta-XfeRIAoVLtei.json,2020-07-02 12:50:02,-22.3546,-42.3357
...,...,...,...,...
68441,meta-JMfLmefCzvMW.json,2020-06-30 07:03:29,-35.2165,138.5386
68442,meta-33oluYHSIrDn.json,2020-07-17 09:44:45,33.8139,-117.5453
68443,meta-pMdvwJuktYPj.json,2020-12-12 19:53:48,38.8907,-77.0058
68444,meta-50A0Fl2Fcg89.json,2020-11-25 21:05:29,49.1741,-122.6799


In [8]:
# Renumber the rows from zero; the previous row labels are kept as an `index`
# column, as the displayed output shows.
focus = focus.reset_index(drop=False)
focus

Unnamed: 0,index,filename,datetime,lat,lon
0,0,meta-27PknKIOwHt6.json,2020-10-31 23:02:29,34.0724,-118.4026
1,1,meta-m3Wq53jjPnpw.json,2020-12-21 23:22:36,34.0473,-118.2599
2,2,meta-kpKT3stt5LXq.json,2020-12-04 20:49:20,26.3340,-80.2949
3,3,meta-qgU5924Q8jQh.json,2020-11-04 07:29:46,33.9827,-118.2094
4,4,meta-XfeRIAoVLtei.json,2020-07-02 12:50:02,-22.3546,-42.3357
...,...,...,...,...,...
68281,68441,meta-JMfLmefCzvMW.json,2020-06-30 07:03:29,-35.2165,138.5386
68282,68442,meta-33oluYHSIrDn.json,2020-07-17 09:44:45,33.8139,-117.5453
68283,68443,meta-pMdvwJuktYPj.json,2020-12-12 19:53:48,38.8907,-77.0058
68284,68444,meta-50A0Fl2Fcg89.json,2020-11-25 21:05:29,49.1741,-122.6799


In [9]:
# Write the cleaned table to metadata.feather in the current working directory.
focus.to_feather(f'{os.getcwd()}/metadata.feather')