# Importing Libraries

In [190]:
import json
import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from geopy.distance import vincenty
from matplotlib.collections import PatchCollection
from IPython.display import Image
warnings.filterwarnings('ignore')

# Importing the JSON file

In [191]:
with open('location_history.json', 'r') as fh:
    data = json.loads(fh.read())
data = data['locations']

# Data-Preprocessing

In [192]:
def get_normalized_activities(activities):
    data = list()
    for activity in activities:
        temp = list()
        for act in activity['activity']:
            temp.append(act)
        total = sum([value['confidence'] for value in temp])
        temp = [{'type':value['type'],'confidence':value['confidence']/total} for value in temp]
        data += temp
    return data

def get_date(time):
    day = time.day
    hour = time.hour

    day_of_week = time.weekday()
    day_of_week_index = [0,1,2,3,4,5,6]
    day_of_week_name = ['Mon','Tues','Wed','Thurs','Fri','Sat','Sun']
    index = day_of_week_index.index(day_of_week)
    day_of_week = day_of_week_name[index]

    year = time.year

    month = time.month
    month_index = [1,2,3,4,5,6,7,8,9,10,11,12]
    month_name = ['Jan','Feb','March','April','May','June','July','Aug','Sep','Oct','Nov','Dec']
    index = month_index.index(month)
    month = month_name[index]
    
    return {'day':day, 'hour':hour,'day_of_week':day_of_week, 'month':month, 'year':year}


def remove_wrong_data(data):
    degrees_to_radians = np.pi/180.0
    data_new = list()
    for index in range(len(data)):
        longitude = data[index]['longitudeE7']/float(1e7)
        if longitude > 4.768718 and longitude < 4.968718:
            data_new.append(data[index])
    return data_new

def get_data(data):
    data = remove_wrong_data(data)
    data = sorted(data,key = lambda x: x['timestampMs'])
    for index in range(len(data)):
        data[index]['latitudeE7']  = data[index]['latitudeE7']/float(1e7)
        data[index]['longitudeE7'] = data[index]['longitudeE7']/float(1e7)
        data[index]['timestampMs'] = float(data[index]['timestampMs'])/1000
        data[index]['datetime'] = datetime.datetime.fromtimestamp(data[index]['timestampMs'])
        dates = get_date(data[index]['datetime'])
        data[index]['day'] = dates['day']
        data[index]['day_of_week'] = dates['day_of_week']
        data[index]['month'] = dates['month']
        data[index]['year'] = dates['year']
        data[index]['hour'] = dates['hour']
        if 'activity' in data[index].keys():
            data[index]['activity'] = get_normalized_activities(data[index]['activity'])
        if index != 0:
            lat1 = data[index-1]['latitudeE7']
            lat2 = data[index]['latitudeE7']
            long1 = data[index-1]['longitudeE7']
            long2 = data[index]['longitudeE7']
            coords_1 = (lat1,long2)
            coords_2 = (lat2,long2)
            data[index]['distance'] = vincenty(coords_1,coords_2).km
    del data[0]
    return data

def get_new_data(data):
    data_new = list()
    for dictionary in data:
        if 'activity' in dictionary.keys() and dictionary['accuracy'] > 0 and dictionary['accuracy'] < 1000:
            for activity in dictionary['activity']:
                data_new.append({
                    'latitude':dictionary['latitudeE7'],
                    'longitude':dictionary['longitudeE7'],
                    'accuracy':dictionary['accuracy'],
                    'type':activity['type'],
                    'confidence':activity['confidence'],
                    'day':dictionary['day'],
                    'hour':dictionary['hour'],
                    'day_of_week':dictionary['day_of_week'],
                    'month':dictionary['month'],
                    'year':dictionary['year'],
                    'distance':dictionary['distance'],
                    'normalized_distance':dictionary['distance'] * activity['confidence']
                })
    return data_new

In [193]:
a = get_data(data)
a = get_new_data(a)
a = pd.DataFrame(a)
a = a[a.distance.notnull()]
#a['city'] = a.apply(lambda x: get_city(x['latitude'],x['longitude']),axis = 1)

In [194]:
a.head()

Unnamed: 0,accuracy,confidence,day,day_of_week,distance,hour,latitude,longitude,month,normalized_distance,type,year
0,13,0.301205,28,Tues,0.000667,13,45.777913,4.878723,Aug,0.000201,ON_FOOT,2018
1,13,0.301205,28,Tues,0.000667,13,45.777913,4.878723,Aug,0.000201,WALKING,2018
2,13,0.114458,28,Tues,0.000667,13,45.777913,4.878723,Aug,7.6e-05,STILL,2018
3,13,0.072289,28,Tues,0.000667,13,45.777913,4.878723,Aug,4.8e-05,UNKNOWN,2018
4,13,0.042169,28,Tues,0.000667,13,45.777913,4.878723,Aug,2.8e-05,IN_VEHICLE,2018


In [195]:
def get_city(latitude_real,longitude_real):
    latitude = latitude_real
    longitude = longitude_real
    if  longitude > 4.768718 and longitude < 4.968718:
        if latitude > 45.664158 and latitude < 45.823741 :
            return 'Lyon'
            print(1)
    else:
        return 'Other'
        print(2)

      
a['city'] = a.apply(lambda x: get_city(x['latitude'],x['longitude']),axis = 1)

Let's pickle the file so that we can start directly without re-running the pre-processing steps every time we run the Jupyter Notebook

In [197]:
import pickle
pickle_out = open("data.sav","wb")
pickle.dump(a, pickle_out)
pickle_out.close()

In [198]:
import pickle
pickle_in = open("data.sav","rb")
data = pickle.load(pickle_in)

In [199]:
data.describe()

Unnamed: 0,accuracy,confidence,day,distance,hour,latitude,longitude,normalized_distance,year
count,66501.0,66501.0,66501.0,66501.0,66501.0,66501.0,66501.0,66501.0,66501.0
mean,264.767808,0.205816,15.36231,0.099821,14.255485,45.777781,4.821809,0.017704,2018.0
std,348.481185,0.309744,9.334952,0.281081,5.084127,0.007636,0.049315,0.094945,0.0
min,3.0,0.0,1.0,0.0,0.0,45.737819,4.768719,0.0,2018.0
25%,13.0,0.03125,7.0,0.0,11.0,45.777612,4.770698,0.0,2018.0
50%,26.0,0.067114,14.0,0.000634,15.0,45.779833,4.80466,2.9e-05,2018.0
75%,600.0,0.1875,24.0,0.036334,18.0,45.782489,4.878539,0.002627,2018.0
max,991.0,1.0,31.0,3.94527,23.0,45.81168,4.912948,2.951749,2018.0


In [187]:
data.to_csv("test.csv", sep='\t')