In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from scipy.ndimage import gaussian_filter
%matplotlib inline
init_notebook_mode(connected=True)

# Load Data

In [2]:
directory = "C:\\Users\\alexj\\Documents\\Research\\twitter\\parcel_ass\\oc\\"
names = ["event_id", "user_id", "location_id", "lon", "lat", "epoch_time", "day_id", "seconds_since_monday"]
#events = pd.read_csv(directory + "ass_events.csv", header = None, names = names)
data = pd.read_csv(directory + "ass_events_no_filter.csv", header = None, names = names)

In [3]:
data.describe()

Unnamed: 0,event_id,user_id,location_id,lon,lat,epoch_time,day_id,seconds_since_monday
count,655917.0,655917.0,655917.0,655917.0,655917.0,655917.0,655917.0,655917.0
mean,6319007.0,792156200.0,241633.977727,-117.878281,33.740011,1441100000.0,114.426144,324717.87948
std,3652273.0,1001908000.0,191189.068282,0.09718,0.104993,5944586.0,68.802527,174493.396071
min,80.0,294.0,9.0,-118.11556,33.38826,1431313000.0,1.0,2.0
25%,3156263.0,54133320.0,61905.0,-117.92294,33.66589,1436230000.0,58.0,162146.0
50%,6301512.0,287673900.0,211459.0,-117.91191,33.7788,1440782000.0,111.0,330765.0
75%,9483557.0,1219463000.0,409421.0,-117.83749,33.81101,1446231000.0,174.0,481287.0
max,12725550.0,4724669000.0,661044.0,-117.44693,33.94595,1452279000.0,244.0,604798.0


In [4]:
# round lat, long pairs to fall into discrete bins
data["lat_r"] = data.lat.round(3)
data["lon_r"] = data.lon.round(3)

In [5]:
# calculate counts for heatmap
freqs = data.groupby(['lat_r', 'lon_r']).count().reset_index()[["lat_r", "lon_r", "event_id"]]
freqs.columns = ["lat_r", "lon_r", "freq"]

In [6]:
# heatmap table
p_freqs = freqs.pivot_table(columns="lon_r", index="lat_r", values="freq", fill_value=0)

In [7]:
# remap user id for easier handling
user_dict = {}
index = 0
for ident in data.user_id.unique():
    if ident not in user_dict:
        user_dict[ident] = index
        index += 1

data["new_user_id"] = data.user_id.apply(lambda x: user_dict[x])

# reset day id to start at 0
data["day_id"] = data["day_id"] - data['day_id'].min()

In [8]:
# calculate week and month ids
data["week"] = data.day_id // 7
data['month'] = data.day_id // 30

# determine if you want to plot based on weeks or months
time_used = "week" # "month"

In [9]:
# calculate the number of events within each time period per user
counts2 = data.groupby(["new_user_id", time_used]).count()["event_id"]

In [10]:
# filter based on minimum amount of events per time period (k)
k = 35 #100
view = counts2[counts2 > k].reset_index()

In [11]:
# join together to have a row for each user for a single week joined with the prior and next week
lagged_view = pd.concat([view, view.shift(1), view.shift(2)], axis = 1)
lagged_view.columns = list("next_" + view.columns) + list(view.columns) + list("last_" + view.columns)

In [12]:
# filter rows so that they only contain pairs where there is a sequential week present
right_users = (lagged_view["new_user_id"] == lagged_view["next_new_user_id"]) | (lagged_view["new_user_id"] == lagged_view["last_new_user_id"])
sequential = ((lagged_view["next_"+time_used] - lagged_view[time_used]) == 1) | ((lagged_view[time_used] - lagged_view["last_"+time_used]) == 1)
filtered = lagged_view[right_users & sequential][["new_user_id", time_used, "event_id"]].reset_index(drop=True)
filtered

Unnamed: 0,new_user_id,week,event_id
0,1.0,0.0,204.0
1,1.0,1.0,205.0
2,1.0,2.0,211.0
3,1.0,3.0,250.0
4,1.0,4.0,243.0
5,1.0,5.0,304.0
6,1.0,6.0,146.0
7,1.0,7.0,237.0
8,1.0,8.0,293.0
9,1.0,9.0,297.0


In [13]:
# gather ids and valid weeks for the users that have survived the filtering
user_time_dict = {-1 : []}
for i, x in filtered.iterrows():
    user_id = x["new_user_id"]
    time = x[time_used]
    if user_id not in user_time_dict:
        user_time_dict[user_id] = []
    user_time_dict[user_id].append(time)

In [14]:
# reduce the original data to be only valid events
reduced_view = pd.merge(left=filtered, right=data, on=["new_user_id", time_used], how="left")

In [15]:
# number of weeks available for analysis
sum([len(v) for k,v in user_time_dict.items()])

830

In [16]:
def get_user(id, time):
    return reduced_view[(reduced_view["new_user_id"] == id) & (reduced_view[time_used] == time)].reset_index(drop=True)

def make_user_scatter_plot(id, time):
    user = get_user(id, time)
    name = "{}-{}{}-{}".format(id, time_used[0], int(time), len(user))
    return go.Scatter(
        x = user.lon,
        y = user.lat,
        mode = 'markers',
        name = name,
        visible = "legendonly"
    )

def plot_figure(user_id):
    heat = go.Heatmap(
        z = gaussian_filter(p_freqs.values, sigma = 0.5).tolist(),
        x = p_freqs.columns,
        y = p_freqs.index,
        colorscale = [
            [0, 'rgb(0, 0, 0)'],        #0
            [1./10000, 'rgb(0, 0, 0)'],
            [1./10000, 'rgb(100, 100, 100)'], #10
            [1./1000, 'rgb(130, 130, 130)'],  #100
            [1./100, 'rgb(170, 170, 170)'],   #1000
            [1./10, 'rgb(220, 220, 220)'],       #10000
            [1., 'rgb(255, 255, 255)'],             #100000

        ],
        colorbar = {
            'tick0': 0,
            'tickmode': 'array',
            'tickvals': [0, 1000, 10000, 100000]
        }
    )

    traces = [heat]
    traces += [make_user_scatter_plot(user_id, time) for time in user_time_dict[user_id]]

    layout = go.Layout(
        legend=dict(x=-.35, y=1)
    )
    fig = go.Figure(data=traces, layout=layout)

    iplot(fig, show_link=False)

In [17]:
#### heatmap only
# -1
#
#### examples
# w: 1358 - concentrated
# w: 907  - very concentrated
# w: 1121 - scattered
# w: 1296 - very concentrated
# w: 13   - clustered w/ outliers
# m: 1409 - identical overlays
# m: 1587 - some clustering
# m: 1725 - extreme clustering
plot_figure(1358)