In [180]:
# Paths of the exported JSON files used by this notebook. The loader functions
# prepend '../facebook-data/' to these values, so they are relative to that folder.
# Naming convention: <FOLDER>__<FILE>.
ABOUT__FACE_RECOGNITION = "about_you/face_recognition.json"
ABOUT__FRIEND_PEER_GROUP = "about_you/friend_peer_group.json"
ABOUT__ADDRESS_BOOK = "about_you/your_address_books.json"

ADS__INTERESTS = "ads/ads_interests.json"
ADS__UPLOADED_CONTACT_LIST = "ads/advertisers_who_uploaded_a_contact_list_with_your_information.json"
ADS__INTERACTED_WITH = "ads/advertisers_you've_interacted_with.json"

APPS_AND_WEBSITES = 'apps_and_websites/apps_and_websites.json'

COMMENTS = 'comments/comments.json'

EVENTS__EVENT_INVITATIONS = 'events/event_invitations.json'
EVENTS__YOUR_EVENT_RESPONSES = 'events/your_event_responses.json'
EVENTS__YOUR_EVENTS = 'events/your_events.json'

FOLLOWS__FOLLOWED_PAGES = 'following_and_followers/followed_pages.json'
FOLLOWS__FOLLOWING = 'following_and_followers/following.json'
FOLLOWS__UNFOLLOWED_PAGES = 'following_and_followers/unfollowed_pages.json'

FRIENDS__FRIENDS = 'friends/friends.json'
FRIENDS__REJECTED_REQUESTS = 'friends/rejected_friend_requests.json'
FRIENDS__REMOVED = 'friends/removed_friends.json'
FRIENDS__SENT_REQUESTS = 'friends/sent_friend_requests.json'

# NOTE(review): no GROUPS__PENDING_POSTS constant is defined here even though
# `_keys_for_file` has an entry for it; add its path if that file is needed.
GROUPS__MEMBERSHIP_ACTIVITY = 'groups/your_group_membership_activity.json'
GROUPS__ADMIN = 'groups/your_groups.json'
GROUPS__POSTS_AND_COMMENTS = 'groups/your_posts_and_comments_in_groups.json'

LIKES_AND_REACTIONS__PAGES = 'likes_and_reactions/pages.json'
LIKES_AND_REACTIONS__POSTS_AND_COMMENTS = 'likes_and_reactions/posts_and_comments.json'

# NOTE: Location data is not covered yet; it could be useful.

MARKETPLACE__ITEMS_BOUGHT = 'marketplace/items_bought.json'
MARKETPLACE__ITEMS_SOLD = 'marketplace/items_sold.json'

# TODO: Figure out how to handle messages. For now only one sample
# conversation file is referenced.
MESSAGES__SAMPLE = 'messages/inbox/33eee_ozf3zkkl9a/message_1.json'

# NOTE: The `other_activity` folder is not covered.
# It contains pokes and other miscellanea.
# TODO: Pages data is not covered either. Do we need it?

PAYMENT_HISTORY = 'payment_history/payment_history.json'

# TODO: Photos and videos are not covered. Unclear whether anyone needs them.

POSTS__OTHERS_PEOPLES_POSTS_TO_YOUR_TIMELINE = "posts/other_people's_posts_to_your_timeline.json"
POSTS__YOUR_POSTS = 'posts/your_posts.json'

# NOTE: This information is dense and is likely best viewed as raw JSON.
PROFILE__PROFILE_INFO = 'profile_information/profile_information.json'
PROFILE__UPDATE_HISTORY = 'profile_information/profile_update_history.json'

SAVED_ITEMS_AND_COLLECTIONS = 'saved_items_and_collections/saved_items_and_collections.json'

SEARCH_HISTORY = 'search_history/your_search_history.json'

# NOTE: The security/login files are not covered yet.

In [232]:
import pandas
from pandas.io.json import json_normalize
import json
from faker import Faker
import datetime
import time
import warnings

In [70]:
# Shared Faker instance used by every `_fake_*` helper in this notebook.
fake = Faker()
# Choose your own seed value.
# Seeding makes the generated fake values come out the same on every run.
# NOTE(review): newer Faker releases may expect the class-level `Faker.seed(...)`
# instead of this instance call - confirm against the installed version.
fake.seed(25)
# Maps each original value to the fake value that replaced it
# (read and filled by `_fake_table_lookup`).
fakes_table = {}

In [308]:
def anonymized_json(json_data):
    """Print ``json_data`` to stdout and return it unchanged.

    NOTE(review): despite the name, no anonymization is performed here;
    this looks like a placeholder - confirm before relying on it.
    """
    print(json_data)
    return json_data

def _fake_name(input_name=None):
    """Return a fake person name standing in for ``input_name``.

    A fresh name is always drawn from ``fake``; ``_fake_table_lookup`` then
    decides whether that fresh name or an already-stored replacement for
    ``input_name`` is returned.
    """
    return _fake_table_lookup(input_name, fake.name())

def _fake_email(input_email=None):
    """Return a fake e-mail address standing in for ``input_email``.

    The address is taken from the ``'mail'`` field of a freshly generated
    simple profile and then routed through ``_fake_table_lookup``.
    """
    profile = fake.simple_profile()
    return _fake_table_lookup(input_email, profile['mail'])

def _fake_title(input_title=None):
    """Return a fake title-cased string standing in for ``input_title``.

    The length budget is 4/3 of the original length (at least 5), or 15 when
    no (or an empty) original is given. The last character of the generated
    text is dropped before title-casing.
    """
    if input_title:
        # NOTE(review): true division makes this budget a float; Faker documents
        # `max_nb_chars` as an int - confirm a float is acceptable.
        max_len = max(5, (len(input_title) * 4)/3)
    else:
        max_len = 15
    sentence = fake.text(max_nb_chars=max_len)
    title = sentence[:-1].title()
    return _fake_table_lookup(input_title, title)

def _fake_text(input_text=None):
    """Return fake free text standing in for ``input_text``.

    The length budget is 4/3 of the original length (at least 5), or 24 when
    no (or an empty) original is given.
    """
    if input_text:
        # NOTE(review): true division makes this budget a float; Faker documents
        # `max_nb_chars` as an int - confirm a float is acceptable.
        max_len = max(5, (len(input_text) * 4)/3)
    else:
        max_len = 24
    return _fake_table_lookup(input_text, fake.text(max_nb_chars=max_len))

def _fake_timestamp(input_datetime, end_datetime=None, start_datetime=None):
    if input_datetime == 0:
        return 0
    timestamp = int(time.mktime(fake.past_date().timetuple()))
    return _fake_table_lookup(input_datetime, timestamp)

def _fake_phone(input_phone):
    """Return a fake phone number standing in for ``input_phone``."""
    return _fake_table_lookup(input_phone, fake.phone_number())

def _fake_url(input_url):
    """Return a fake URL standing in for ``input_url``."""
    return _fake_table_lookup(input_url, fake.url())

def _fake_latitude(input_lat):
    """Return a fake latitude standing in for ``input_lat``."""
    return _fake_table_lookup(input_lat, fake.latitude())

def _fake_longitude(input_lon):
    """Return a fake longitude standing in for ``input_lon``."""
    return _fake_table_lookup(input_lon, fake.longitude())

def _fake_bool():
    """Return a boolean drawn from ``fake``; the result is not cached."""
    flag = fake.boolean()
    return flag

def _fake_table_lookup(key, value):
    if key is None:
        return value
    elif key not in fakes_table:
        fakes_table[key] = value
    else:
        value = fakes_table[key]
    return value

In [314]:
def write_cleaned_file(filename):
    """Load one export file, apply its cleaning rules, and return the data.

    Args:
        filename: path relative to '../facebook-data/'; also the key under
            which the file's rules are stored in '../datastructures.json'.

    Returns:
        The parsed JSON after ``_apply_rules_to_json`` has processed it in
        place. Nothing is written to disk.

    Raises:
        KeyError: if '../datastructures.json' has no entry for ``filename``.
    """
    source_path = '../facebook-data/' + filename
    with open(source_path) as source_file:
        data = json.load(source_file)
    with open('../datastructures.json') as rules_file:
        all_rules = json.load(rules_file)
    cleaning_rules = all_rules[filename]
    _apply_rules_to_json(cleaning_rules, data)
    return data
    
def _apply_rules_to_json(rules, data={}):
    for key in data.keys():
        if key not in rules:
            warning = "Missing key: {})".format(key)
            print(warning)
#             warnings.warn(warning)
            return -1
            continue
        r_val = rules[key]
        d_val = data[key]
        x = 0
        if r_val is None:
            return        
        if isinstance(d_val, dict):
            if _apply_rules_to_json(r_val, d_val) == -1:
                print(key)
                return
        elif isinstance(d_val, list):
            for i in d_val:
                if isinstance(i, dict):                    
                    if _apply_rules_to_json(r_val[0], i) == -1:
                        print(key)
                        return
                else:
                    _apply_rule_to_value(r_val[0], None)
        else:
            data[key] = _apply_rule_to_value(r_val, d_val)

            
def _apply_rule_to_value(rule, value):
        if rule == "NAME":
            return _fake_name(value)
        elif rule == "TIMESTAMP":
            return _fake_timestamp(value)
        elif rule == "EMAIL":
            return _fake_email(value)
        elif rule == "PHONE":
            return _fake_phone(value)
        elif rule == "TITLE":
            return _fake_title(value)
        elif rule == "TEXT":
            return _fake_text(value)
        elif rule == "URL":
            return _fake_url(value)
        elif rule == "LAT":
            return _fake_latitude(value)
        elif rule == "LON":
            return _fake_longitude(value)
        elif rule == "BOOL":
            return _fake_bool()
        # Allow overriding for specific strings
        elif "CUSTOM:" in rule:
            return rule.replace('CUSTOM:','')
        # For documentation sake, write in which ones we manually tweak
        elif rule == "MANUAL":
            return
        else:
            warnings.warn("Unable to handle rule '{}'".format(rule))
            
# Run the cleaning rules over the reactions export and show the result as the cell output.
data = write_cleaned_file(LIKES_AND_REACTIONS__POSTS_AND_COMMENTS)
data

{'reactions': [{'data': [{'reaction': {'actor': 'Billy Mcpherson',
      'reaction': None}}],
   'timestamp': 1557115200,
   'title': "You liked/loved/wowed Such and such's comment."},
  {'data': [{'reaction': {'actor': 'Billy Mcpherson', 'reaction': None}}],
   'timestamp': 1556942400,
   'title': "You liked/loved/wowed Such and such's comment."},
  {'data': [{'reaction': {'actor': 'Billy Mcpherson', 'reaction': None}}],
   'timestamp': 1557288000,
   'title': "You liked/loved/wowed Such and such's comment."},
  {'data': [{'reaction': {'actor': 'Billy Mcpherson', 'reaction': None}}],
   'timestamp': 1556769600,
   'title': "You liked/loved/wowed Such and such's comment."},
  {'data': [{'reaction': {'actor': 'Billy Mcpherson', 'reaction': None}}],
   'timestamp': 1558324800,
   'title': "You liked/loved/wowed Such and such's comment."},
  {'data': [{'reaction': {'actor': 'Billy Mcpherson', 'reaction': None}}],
   'timestamp': 1556683200,
   'title': "You liked/loved/wowed Such and such

In [250]:
def json_with_ignored_keys(filename):
    """Load an export file and descend past its wrapper keys.

    The file is read from '../facebook-data/' + ``filename``. The "ignored"
    keys that ``_keys_for_file`` reports for it are then followed one after
    another, so the value nested under them is what gets returned.
    """
    with open('../facebook-data/' + filename) as handle:
        payload = json.load(handle)
    wrapper_keys, _flatten_keys = _keys_for_file(filename)
    for wrapper in wrapper_keys:
        payload = payload[wrapper]
    return payload

def json_as_dataframe(filename):
    """Load an export file as a flattened DataFrame.

    The payload comes from ``json_with_ignored_keys``. Every element of it is
    passed through ``_collapse_keys`` with the file's flatten keys (this
    unwraps single-element lists such as ``data`` in place), and the result
    is handed to ``json_normalize``.

    Args:
        filename: path relative to '../facebook-data/'.

    Returns:
        Whatever ``json_normalize`` produces for the payload.
    """
    data = json_with_ignored_keys(filename)
    (_, flatten_keys) = _keys_for_file(filename)
    # We want to flatten the `data` value a lot of these have.
    # _collapse_keys edits each row in place, so its return value is not needed.
    for row in data:
        _collapse_keys(row, flatten_keys)

    return json_normalize(data)


def _collapse_keys(row, collapse_keys):
    for ck in collapse_keys:
        if ck in row and isinstance(row[ck], list) and len(row[ck]) == 1:
            row[ck] = _collapse_keys(row[ck][0], collapse_keys)
    return row

def _keys_for_file(filename):
    """Return the wrapper keys and flatten keys to use for an export file.

    Args:
        filename: one of the module-level path constants, or any string
            equal to one of them.

    Returns:
        A tuple ``(ig_keys, flatten_keys)``. ``ig_keys`` is the chain of keys
        to descend through to reach the useful payload; ``flatten_keys`` names
        the single-element-list fields to unwrap in each row. Both are empty
        lists for a file with no entry in the table.
    """
    # Built on every call so callers always receive fresh lists.
    # Lookup is by string equality, not object identity.
    keys_by_file = {
        ABOUT__ADDRESS_BOOK: (['address_book', 'address_book'], []),

        ADS__INTERACTED_WITH: (['history'], []),

        APPS_AND_WEBSITES: (['installed_apps'], []),

        COMMENTS: (['comments'], ['data']),

        # NOTE: this can be changed to `events_declined` or `events_interested`
        EVENTS__YOUR_EVENT_RESPONSES: (['event_responses', 'events_joined'], []),
        EVENTS__YOUR_EVENTS: (['your_events'], []),

        FOLLOWS__FOLLOWED_PAGES: (['pages_followed'], ['data']),
        FOLLOWS__FOLLOWING: (['following'], []),
        FOLLOWS__UNFOLLOWED_PAGES: (['pages_unfollowed'], ['data']),

        FRIENDS__FRIENDS: (['friends'], []),
        FRIENDS__REJECTED_REQUESTS: (['rejected_requests'], []),
        FRIENDS__REMOVED: (['deleted_friends'], []),
        FRIENDS__SENT_REQUESTS: (['sent_requests'], []),

        GROUPS__MEMBERSHIP_ACTIVITY: (['groups_joined'], ['attachments', 'data']),
        GROUPS__ADMIN: (['groups_admined'], []),
        GROUPS__POSTS_AND_COMMENTS: (['group_posts', 'activity_log_data'], ['data']),

        LIKES_AND_REACTIONS__PAGES: (['page_likes'], []),
        LIKES_AND_REACTIONS__POSTS_AND_COMMENTS: (['reactions'], ['data']),

        MARKETPLACE__ITEMS_BOUGHT: (['items_buying'], []),
        MARKETPLACE__ITEMS_SOLD: (['items_selling'], []),

        MESSAGES__SAMPLE: (['messages'], []),

        PAYMENT_HISTORY: (['payments', 'payments'], []),

        POSTS__OTHERS_PEOPLES_POSTS_TO_YOUR_TIMELINE: (['wall_posts_sent_to_you'], ['data']),
        POSTS__YOUR_POSTS: (['status_updates'], ['attachments', 'data']),

        SAVED_ITEMS_AND_COLLECTIONS: (['saves_and_collections'], ['attachments', 'data']),

        SEARCH_HISTORY: (['searches'], ['data']),
    }

    # GROUPS__PENDING_POSTS is not defined alongside the other path constants
    # in the visible part of this notebook. Look it up defensively so a missing
    # constant does not raise NameError for every file.
    pending_posts_file = globals().get('GROUPS__PENDING_POSTS')
    if pending_posts_file is not None:
        keys_by_file[pending_posts_file] = (['pending_posts'], [])

    return keys_by_file.get(filename, ([], []))

In [201]:
# Load the apps-and-websites export as a DataFrame and display one randomly sampled row.
a = json_as_dataframe(APPS_AND_WEBSITES)
a.sample(n=1)

Unnamed: 0,added_timestamp,name
17,1477976626,Bumble
