In [2]:
import datetime
import json

import arrow
from bs4 import BeautifulSoup
import pandas as pd
import requests

import photo as pto
import podcast as pc
import twitter as twit

# Functions to manage the archive file



# Let pandas render up to 30 columns before truncating a frame's display.
pd.set_option('display.max_columns', 30)

def get_archive():
    '''Load archive.json, or build a new blank archive structure.

    Returns:
        dict: the decoded contents of ``archive.json`` when the file can be
        opened and parsed. Otherwise a fresh archive whose ``meta`` holds
        empty ``ids``/``audio``/``tw_ids`` lists and ``last_updated`` set to
        ``'New File'``, alongside an empty ``data`` list.
    '''

    try:
        with open('archive.json', 'r') as file:
            archive = json.load(file)

    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers
        # undecodable content (json.JSONDecodeError is a ValueError).
        # Anything else, such as KeyboardInterrupt, is deliberately left to
        # propagate instead of being silently turned into a blank archive.
        archive = {'meta': {'ids': [],
                            'audio': [],
                            'tw_ids': [],
                            'last_updated': 'New File',
                            },
                   'data': [],
                   }
        print('No archived file, building new archive.json')

    else:
        print('Archive loaded.')

    return archive


def save_archive(archive):
    '''Stamp the archive with the current time and write it to archive.json.

    Args:
        archive (dict): archive structure; ``archive['meta']['last_updated']``
            is overwritten in place with ``str(arrow.now())``.

    Returns:
        bool: always ``True`` once the file has been written.

    Raises:
        TypeError: if the archive holds a value ``json.dumps`` cannot encode.
            The existing ``archive.json`` is left untouched in that case.
    '''

    archive['meta']['last_updated'] = str(arrow.now())

    # Serialize before opening the file: mode 'w' truncates immediately, so
    # a failure inside json.dumps would otherwise leave an empty archive.json.
    payload = json.dumps(archive, indent=4)

    with open('archive.json', 'w') as file:
        file.write(payload)

    return True


def get_tweet_archive(record='tweets'):
    '''Utility function to load a tweet archive from ``<record>.json``.

    Args:
        record (str): base name of the JSON file, without the ``.json``
            extension. Defaults to ``'tweets'``.

    Returns:
        The decoded JSON content, or ``None`` when the file cannot be opened
        or parsed (a message naming the file is printed in that case).
    '''

    filename = record + '.json'

    try:
        with open(filename, 'r') as file:
            archive = json.load(file)

    except (OSError, ValueError):
        # Missing/unreadable file (OSError) or invalid JSON (ValueError).
        # Return None explicitly: falling through to `return archive` here
        # would raise UnboundLocalError because the name was never assigned.
        print('No archived tweet file: ', filename,
              ' build a new file with build_twitter_archive function')
        return None

    print('tweets_loaded.')

    return archive


def backfill_dates(archive):
    '''Fill in each walk's ``date`` from its ``timestamp``.

    The date is whatever precedes the first space in the timestamp (the
    whole timestamp when it contains no space). Date is used to match audio
    files to photos.

    Every entry of ``archive['data']`` is updated in place and the same
    archive object is returned.
    '''

    for entry in archive['data']:
        day, _, _ = entry['timestamp'].partition(' ')
        entry['date'] = day

    return archive


def sort_archive_data(archive):
    '''Sort the data portion of the archive by date.

    Args:
        archive (dict): archive whose ``data`` entry is a list of record
            dicts, each expected to carry a ``date`` key.

    Returns:
        dict: the same archive object, with ``archive['data']`` replaced by
        the records ordered by ``date``. An archive with no records is
        returned unchanged.

    Note:
        Records are round-tripped through a DataFrame, so a key missing from
        one record comes back filled with NaN rather than absent.
    '''

    records = archive['data']

    # A brand-new archive has no records; sort_values('date') on the
    # resulting empty frame would raise KeyError.
    if not records:
        return archive

    # to_dict('records') ignores the index, so no reset_index is needed.
    # Skipping it also avoids an error when a record has an 'index' key.
    frame = pd.DataFrame(records).sort_values('date')
    archive['data'] = frame.to_dict('records')

    return archive


In [12]:
archive = get_archive()

# Keys carried by the last record in the archive.
full_keys = list(archive['data'][-1])

# Output of full_keys, hardcoded into a set.
set_of_keys = {
    'link', 'condition', 'tag', 'temp', 'sun', 'raw_title',
    'photo_id', 'thumb', 'camera', 'timestamp', 'date', 'user',
    'fullname', 'url', 'text', 'replies', 'retweets', 'likes',
    'tw_title', 'photo_link', 'wind', 'walk', 'tw_id', 'tw_timestamp',
}

Archive loaded.


In [4]:
# Show the top-level keys of the loaded archive.
archive.keys()

dict_keys(['meta', 'data'])

In [7]:
# Show the archive's last_updated stamp.
archive['meta']['last_updated']

'2019-04-20T10:30:44.312229+00:00'

In [22]:
# Keep every record that carries fewer keys than the full key set.
partial_records = [
    entry for entry in archive['data']
    if len(entry.keys()) < len(set_of_keys)
]

In [23]:

# Count the records collected in partial_records.
len(partial_records)

1105

In [27]:
# Dates of the partial records, in record order.
partial_dates = [entry['date'] for entry in partial_records]

# Number of distinct dates among them.
len(set(partial_dates))

1100

In [39]:
import pandas as pd
# Let pandas render up to 30 columns before truncating a frame's display
# (repeats the setting already applied in the first cell).
pd.set_option('display.max_columns', 30)

# NOTE(review): this repeats the sort_archive_data definition from the first
# cell; being later in the notebook, this is the one that ends up bound.
def sort_archive_data(archive):
    '''Sort the data portion of the archive by date.

    Args:
        archive (dict): archive whose ``data`` entry is a list of record
            dicts, each expected to carry a ``date`` key.

    Returns:
        dict: the same archive object, with ``archive['data']`` replaced by
        the records ordered by ``date``. An archive with no records is
        returned unchanged.

    Note:
        Records are round-tripped through a DataFrame, so a key missing from
        one record comes back filled with NaN rather than absent.
    '''

    records = archive['data']

    # A brand-new archive has no records; sort_values('date') on the
    # resulting empty frame would raise KeyError.
    if not records:
        return archive

    # to_dict('records') ignores the index, so no reset_index is needed.
    # Skipping it also avoids an error when a record has an 'index' key.
    frame = pd.DataFrame(records).sort_values('date')
    archive['data'] = frame.to_dict('records')

    return archive

In [40]:
# Build the frame in this cell so it runs on a fresh kernel; no visible cell
# defines `df` at notebook scope (it only exists inside sort_archive_data).
# NOTE(review): assumes the frame was meant to hold archive['data'] -- confirm.
df = pd.DataFrame(archive['data'])
df = df.sort_values('date').reset_index(drop=True)

# NOTE(review): sorted_data is not written back to archive['data'] here.
sorted_data = df.to_dict('records')

In [55]:
# Inspect the record at position 50 of the archive data.
archive['data'][50]

{'camera': 'BlackBerry 9000',
 'condition': 'light breeze',
 'date': '2011-06-16',
 'fullname': 'Nathan Lowell',
 'likes': 1.0,
 'link': 'https://c1.staticflickr.com/3/2744/5838701745_1b899c9ecc_o.jpg',
 'photo_id': '5838701745',
 'photo_link': 'http://twitpic.com/5caxbx',
 'raw_title': 'All sizes | #tommw 62F ptly cloudy, light breeze | Flickr - Photo Sharing!',
 'replies': 0.0,
 'retweets': 0.0,
 'sun': 'ptly cloudy',
 'tag': '#tommw',
 'temp': '62',
 'text': '#tommw 62F ptly cloudy, light breeze http://twitpic.com/5caxbx',
 'thumb': 'images/5838701745.jpg',
 'timestamp': '2011-06-16 06:10:35',
 'tw_id': '81333156376096768',
 'tw_timestamp': '2011-06-16 12:11:50',
 'tw_title': '#tommw 62F ptly cloudy, light breeze http://twitpic.com/5caxbx',
 'url': '/nlowell/status/81333156376096768',
 'user': 'nlowell',
 'walk': 1.0,
 'wind': 'light breeze'}

In [61]:
# Write the archive back to archive.json.
save_archive(archive)

True