Monday=0, Sunday=6

In [66]:
from collections import Counter

import pandas as pd
import numpy as np

import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import plotly.express as px

from sentiance.location import Location, decode_geohash

In [67]:
Location(50.951220, 4.889917, 34).lat_accuracy()

(152.6178855895996, 0.0006866455078125)

In [68]:
NBITS = 30

In [69]:
filepath = "../data/Copy of person.3.csv"
df = pd.read_csv(filepath,sep=";").rename(columns={
    "start_time(YYYYMMddHHmmZ)":"start_time",
    "duration(ms)": "duration"})

In [70]:
df['start_time'] = pd.to_datetime(df['start_time'], format='%Y%m%d%H%M%z')

In [71]:
df["date"] = df['start_time'].apply(lambda x: x.date())

In [72]:
df["dayofweek"] = df['start_time'].apply(lambda x: x.dayofweek)

In [73]:
df["hourofday"] = df['start_time'].apply(lambda x: x.hour)

In [74]:
df["hour_duration"] = df['duration'].apply(lambda x: x/(3600000))

In [75]:
def compute_geohash(row, nbits=NBITS):
    loc = Location(row["latitude"], row["longitude"], nbits)
    #print(loc.lat_accuracy()[0], loc.long_accuracy()[0])
    return loc.encode()
    
df["geohash"] = df.apply(compute_geohash, axis=1)

In [76]:
#min_dt, max_dt = df["start_time"].min().to_pydatetime(), df["start_time"].max().to_pydatetime()
weekday_counter = Counter()
unique_days = df["date"].unique()
n_unique_days = len(unique_days)
for dt in unique_days:
    weekday_counter[dt.weekday()] += 1

In [77]:
#df["day_frequency"] = 0
#df["mean_duration"] = 0

MIN_DAY_DENSITY = 0.5
MIN_DURATION = 1
MAX_DIST = 500

hash_df = []
for geohash, group in df.groupby("geohash"):
    record = {
        "geohash": geohash,
        "day_frequency": len(group["date"].unique())/n_unique_days,
        "mean_duration": group["hour_duration"].mean(),
        "mean_starthour": group["hourofday"].mean(),
        "mean_lat": group["latitude"].mean(),
        "mean_lng": group["longitude"].mean()
    }
    
    week_hist = Counter()
    for date, g in group.groupby("date"):
        try:
            weekday = date.weekday()
        except AttributeError:
            print(group)
        week_hist[weekday] += 1/weekday_counter[weekday]
          
    record["number_of_days"] = len([day for day, density in week_hist.items() if density>MIN_DAY_DENSITY])
    
    hash_df.append(record)

hash_df = pd.DataFrame(hash_df)

In [89]:
mean_frequency = np.mean(hash_df["day_frequency"].unique())
print(mean_frequency)
filtered_df = hash_df[(hash_df["mean_duration"]>=MIN_DURATION) & (hash_df["day_frequency"]>mean_frequency)]

0.23030303030303026


In [90]:
filtered_df

Unnamed: 0,geohash,day_frequency,mean_duration,mean_starthour,mean_lat,mean_lng,number_of_days
33,873631719,0.872727,5.652873,11.795699,50.95122,4.889917,7
65,873633003,0.363636,2.888783,11.733333,51.206871,4.388041,2
68,873633007,0.327273,5.551806,9.95,51.216007,4.393897,0
77,873633364,0.472727,1.939019,14.948718,51.231156,4.403964,3


In [96]:
distances = []
for i in range(filtered_df.shape[0]):
    rec1 = filtered_df.iloc[i,:].to_dict()
    loc1 = Location(rec1["mean_lat"], rec1["mean_lng"], NBITS)
    for j in range(i+1,filtered_df.shape[0]):
        rec2 = filtered_df.iloc[j,:].to_dict()
        loc2 = Location(rec2["mean_lat"], rec2["mean_lng"], NBITS)
        dist = loc1.distance(loc2)
        distances.append({
            "src": rec1["geohash"],
            "dst": rec2["geohash"],
            "dist": dist
        })

In [97]:
distances = pd.DataFrame(distances)
distances

Unnamed: 0,src,dst,dist
0,873631719.0,873633003.0,45.136361
1,873631719.0,873633007.0,45.467837
2,873631719.0,873633364.0,46.051493
3,873633003.0,873633007.0,1.094721
4,873633003.0,873633364.0,2.919173
5,873633007.0,873633364.0,1.824505


In [98]:
dist1 = distances[["dst","dist"]].rename(columns={"dst":"src"})
dist2 = distances[["src","dist"]]
distances = pd.concat((dist1,dist2), axis=0)

In [99]:
distances

Unnamed: 0,src,dist
0,873633003.0,45.136361
1,873633007.0,45.467837
2,873633364.0,46.051493
3,873633007.0,1.094721
4,873633364.0,2.919173
5,873633364.0,1.824505
0,873631719.0,45.136361
1,873631719.0,45.467837
2,873631719.0,46.051493
3,873633003.0,1.094721


In [100]:
distances = distances.groupby("src").agg(np.mean)

In [101]:
distances

Unnamed: 0_level_0,dist
src,Unnamed: 1_level_1
873631719.0,45.551897
873633003.0,16.383418
873633007.0,16.129021
873633364.0,16.931724


In [84]:
geohash_toremove = set(distances[distances["dist"]>=MAX_DIST].index)

In [63]:
filtered_df=filtered_df[~filtered_df["geohash"].isin(geohash_toremove)]

In [64]:
filtered_df[filtered_df["number_of_days"]>=6][["mean_lat","mean_lng"]]

Unnamed: 0,mean_lat,mean_lng
33,50.95122,4.889917


In [65]:
filtered_df[filtered_df["number_of_days"]<6][["mean_lat","mean_lng"]]

Unnamed: 0,mean_lat,mean_lng
65,51.206871,4.388041
68,51.216007,4.393897
77,51.231156,4.403964
