In [1]:
import pandas as pd 
import numpy as np 
import os
import json
import re
from tqdm import tqdm
from pprint import pprint
import geopandas
import pyproj

In [2]:
# Register tqdm's pandas integration; "Progress" is the description passed for its progress bars.
# NOTE(review): tqdm documents this as enabling progress_apply-style methods on pandas
# objects; none of those methods are called in the cells visible here.
tqdm.pandas(desc="Progress")

In [3]:
# Directory layout expected under the current working directory:
#   data/parler_posts_json/  - the extracted contents of parler_posts_json.zip
#   data/metadata/           - the directory handed to data_iterator
# Both paths keep their trailing slash because file names are appended to them directly.
cwd = os.getcwd()
posts_dir = f'{cwd}/data/parler_posts_json/'
meta_dir = f'{cwd}/data/metadata/'

In [4]:
# Metadata fields that data_iterator copies from each JSON record when they are present.
frames = ['GPSPosition', 'FileTypeExtension']

In [13]:
def data_iterator(path):
    """Yield one summary dict per metadata JSON file found in ``path``.

    For every directory entry ending in ``.json`` the file is loaded, the first
    element of the decoded JSON is taken as the record, and every field named in
    the module-level ``frames`` list that exists in the record is copied over.
    When ``FileTypeExtension`` is present, ``mediafile`` is derived from the
    file name by dropping its first five and last five characters (the
    ``meta-`` prefix and ``.json`` suffix of names such as
    ``meta-ISVeh218verI.json``) and appending the extension.

    Records that contributed at least one field besides ``file`` also receive
    ``CreateDate`` / ``ModifyDate`` unless the value is the all-zero
    placeholder, and only those records are yielded.

    Parameters
    ----------
    path : str
        Directory containing the metadata JSON files.

    Yields
    ------
    dict
        Always has ``file``; may have ``GPSPosition``, ``FileTypeExtension``,
        ``mediafile``, ``CreateDate`` and ``ModifyDate``.

    Notes
    -----
    A file that cannot be read or parsed is reported with ``print`` and
    skipped; it never aborts the iteration.
    """
    for record in tqdm(os.listdir(path)):
        if not record.endswith('.json'):
            continue
        output = {'file': record}
        try:
            # Read from the directory that was listed (the original always read
            # from the global meta_dir, ignoring the ``path`` argument).
            with open(os.path.join(path, record)) as f:
                data = json.load(f)[0]
            for frame in frames:
                if frame in data:
                    output[frame] = data[frame]
                    if frame == 'FileTypeExtension':
                        output['mediafile'] = output['file'][5:-5] + '.' + data[frame]
            if len(output) > 1:
                for date in ('CreateDate', 'ModifyDate'):
                    if date in data and data[date] != '0000:00:00 00:00:00':
                        output[date] = data[date]
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt and
            # GeneratorExit propagate normally.
            print(f'** File {record} could not be handled for some dumb reason')
            continue
        # The yield sits outside the try block so that closing the generator
        # early is never mistaken for a broken file.
        if len(output) > 1:
            yield output
# Drain the generator into a DataFrame: one row per dict yielded by data_iterator.
metadata = pd.DataFrame(data=data_iterator(path=meta_dir))
metadata

100%|██████████| 1032524/1032524 [00:53<00:00, 19356.89it/s]


Unnamed: 0,file,FileTypeExtension,mediafile,CreateDate,ModifyDate,GPSPosition
0,meta-ISVeh218verI.json,mp4,ISVeh218verI.mp4,,,
1,meta-BeLykBcFbpEW.json,mp4,BeLykBcFbpEW.mp4,,,
2,meta-qQqSthmGdpSn.json,mp4,qQqSthmGdpSn.mp4,2020:07:08 15:20:03,2020:07:08 15:20:03,
3,meta-Aswvxy3XDKg8.json,mp4,Aswvxy3XDKg8.mp4,2020:07:21 03:48:04,2020:07:21 03:48:05,
4,meta-VCnhpqvdrlT2.json,mp4,VCnhpqvdrlT2.mp4,,,
...,...,...,...,...,...,...
1032518,meta-Z9XbGQZ0yfN1.json,mov,Z9XbGQZ0yfN1.mov,2020:07:22 07:58:37,2020:07:22 07:58:37,
1032519,meta-00PAgCovyAN5.json,mp4,00PAgCovyAN5.mp4,,,
1032520,meta-6e5v9mh7xmUW.json,mp4,6e5v9mh7xmUW.mp4,2020:12:14 02:48:08,2020:12:14 02:48:08,
1032521,meta-Bjo29hOVVjxP.json,mp4,Bjo29hOVVjxP.mp4,,,


In [14]:
# Pick the best timestamp for each file: CreateDate when it parses, otherwise ModifyDate.
# A lot of other datetime fields were checked and none gave better data than these two.
date_format = '%Y:%m:%d %H:%M:%S'
# errors='coerce' turns anything that does not match the format into NaT.
clean_createdate = pd.to_datetime(metadata['CreateDate'], errors='coerce', format=date_format)
clean_modifydate = pd.to_datetime(metadata['ModifyDate'], errors='coerce', format=date_format)

# Build `dates` as its own frame rather than assigning into a slice of `metadata`
# and calling fillna(inplace=True) on an attribute-accessed column: that pattern
# raises chained-assignment warnings and does not update `dates` when pandas
# Copy-on-Write is active.
dates = pd.DataFrame({
    'CreateDate': clean_createdate.fillna(clean_modifydate),
    'ModifyDate': clean_modifydate,
})
metadata['datetime'] = dates['CreateDate']

# Keep only the rows that carry a GPSPosition value; these are the ones converted to
# decimal lat/long floats below. Out of all the geotagged data fields, GPSPosition
# always had the right data in the right format.
geocoords = metadata.loc[metadata['GPSPosition'].notna()]
# Pattern for a "<deg> deg <min>' <sec>" <hemisphere>" pair, latitude first and
# longitude second, separated by ", ". Seconds must contain a decimal point with
# at most two decimals; each hemisphere is captured as a single word character.
long_geo = re.compile(r'(\d{1,3}) deg (\d{1,2})\' (\d{1,2}\.\d{0,2})\" (\w), (\d{1,3}) deg (\d{1,2})\' (\d{1,2}\.\d{0,2})\" (\w)')


def long_to_decimal(coord):
    """Convert a degrees/minutes/seconds position string to decimal degrees.

    Parameters
    ----------
    coord : str
        Text such as ``34 deg 4' 20.64" N, 118 deg 24' 9.36" W``. It is matched
        against ``long_geo`` from the start of the string.

    Returns
    -------
    tuple of (float, float) or None
        ``(latitude, longitude)`` with south and west as negative values.
        ``None`` when ``coord`` is not a string, does not match the pattern, or
        carries a hemisphere letter other than N/S (latitude) or E/W (longitude).
    """
    if not isinstance(coord, str):
        return None
    m = long_geo.match(coord)
    if not m:
        return None
    latD, latM, latS, latC, lonD, lonM, lonS, lonC = m.groups()
    latSign = {'N': 1, 'S': -1}.get(latC)
    lonSign = {'E': 1, 'W': -1}.get(lonC)
    if latSign is None:
        print(f'{latC} is the thing that should not be!')
    if lonSign is None:
        print(f'{lonC} is the thing that should not be!')
    if latSign is None or lonSign is None:
        # Previously this path fell through to the return statement with an
        # unbound sign variable and crashed with UnboundLocalError.
        return None
    lat = latSign * (int(latD) + int(latM) / 60 + float(latS) / 3600)
    lon = lonSign * (int(lonD) + int(lonM) / 60 + float(lonS) / 3600)
    return lat, lon

# Parse every GPSPosition string. long_to_decimal returns None for text it cannot
# parse; those entries are dropped so they cannot break the DataFrame construction.
new_coords = geocoords.GPSPosition.map(long_to_decimal).dropna()
# The assignment aligns on the index, so rows without usable coordinates get NaN.
# Naming the columns keeps the frame two columns wide even when nothing was parsed.
metadata[['lat', 'lon']] = pd.DataFrame(new_coords.tolist(), index=new_coords.index, columns=['lat', 'lon'])

# 'file' and 'mediafile' are held as generic object columns; store copies under
# pandas' dedicated string dtype.
metadata['filename'] = metadata['file'].astype('string')
metadata['medianame'] = metadata['mediafile'].astype('string')
metadata.dtypes




file                         object
FileTypeExtension            object
mediafile                    object
CreateDate                   object
ModifyDate                   object
GPSPosition                  object
datetime             datetime64[ns]
lat                         float64
lon                         float64
filename                     string
medianame                    string
dtype: object

In [15]:
# Keep the five columns of interest and only the rows where every one of them is present.
# dropna() is chained so the result is a fresh frame, instead of mutating a
# column subset of `metadata` in place (which raises SettingWithCopyWarning).
focus = metadata[['filename', 'medianame', 'datetime', 'lat', 'lon']].dropna()
focus

Unnamed: 0,filename,medianame,datetime,lat,lon
12,meta-27PknKIOwHt6.json,27PknKIOwHt6.mov,2020-10-31 23:02:29,34.0724,-118.4026
48,meta-m3Wq53jjPnpw.json,m3Wq53jjPnpw.mov,2020-12-21 23:22:36,34.0473,-118.2599
69,meta-kpKT3stt5LXq.json,kpKT3stt5LXq.mov,2020-12-04 20:49:20,26.3340,-80.2949
100,meta-qgU5924Q8jQh.json,qgU5924Q8jQh.mov,2020-11-04 07:29:46,33.9827,-118.2094
103,meta-XfeRIAoVLtei.json,XfeRIAoVLtei.mov,2020-07-02 12:50:02,-22.3546,-42.3357
...,...,...,...,...,...
1032448,meta-JMfLmefCzvMW.json,JMfLmefCzvMW.mov,2020-06-30 07:03:29,-35.2165,138.5386
1032456,meta-33oluYHSIrDn.json,33oluYHSIrDn.mov,2020-07-17 09:44:45,33.8139,-117.5453
1032501,meta-pMdvwJuktYPj.json,pMdvwJuktYPj.mov,2020-12-12 19:53:48,38.8907,-77.0058
1032510,meta-50A0Fl2Fcg89.json,50A0Fl2Fcg89.mov,2020-11-25 21:05:29,49.1741,-122.6799


In [16]:
# Move the original row labels into an 'index' column and renumber the rows from 0
# (presumably so the frame has a default index before it is written with to_feather).
# Guarded so that re-running this cell does not add a 'level_0' column or raise.
if 'index' not in focus.columns:
    focus = focus.reset_index()
focus

Unnamed: 0,index,filename,medianame,datetime,lat,lon
0,12,meta-27PknKIOwHt6.json,27PknKIOwHt6.mov,2020-10-31 23:02:29,34.0724,-118.4026
1,48,meta-m3Wq53jjPnpw.json,m3Wq53jjPnpw.mov,2020-12-21 23:22:36,34.0473,-118.2599
2,69,meta-kpKT3stt5LXq.json,kpKT3stt5LXq.mov,2020-12-04 20:49:20,26.3340,-80.2949
3,100,meta-qgU5924Q8jQh.json,qgU5924Q8jQh.mov,2020-11-04 07:29:46,33.9827,-118.2094
4,103,meta-XfeRIAoVLtei.json,XfeRIAoVLtei.mov,2020-07-02 12:50:02,-22.3546,-42.3357
...,...,...,...,...,...,...
68281,1032448,meta-JMfLmefCzvMW.json,JMfLmefCzvMW.mov,2020-06-30 07:03:29,-35.2165,138.5386
68282,1032456,meta-33oluYHSIrDn.json,33oluYHSIrDn.mov,2020-07-17 09:44:45,33.8139,-117.5453
68283,1032501,meta-pMdvwJuktYPj.json,pMdvwJuktYPj.mov,2020-12-12 19:53:48,38.8907,-77.0058
68284,1032510,meta-50A0Fl2Fcg89.json,50A0Fl2Fcg89.mov,2020-11-25 21:05:29,49.1741,-122.6799


In [17]:
# Write the cleaned table to metadata.feather in the current working directory.
feather_path = os.getcwd() + '/metadata.feather'
focus.to_feather(feather_path)