In [1]:
import numpy as np
import pandas as pd
import os
import sys
import pickle as pkl
import matplotlib.pyplot as plt
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from scipy.ndimage import gaussian_filter
from scipy.spatial import KDTree

sys.setrecursionlimit(10000)
%matplotlib inline
init_notebook_mode(connected=True)

pd.options.mode.chained_assignment = None # turn off SettingWithCopyWarning

# Load Parcel Data

In [2]:
names = ["event_id", "user_id", "location_id", "lon", "lat", "epoch_time", "day_id", "seconds_since_monday"]

# USER SPECIFIC IMPORT
# directory = "C:\\Users\\alexj\\Documents\\Research\\twitter\\parcel_ass\\oc\\"  # alex
# directory = os.path.join("~", "dev", "CSAFE", "spatial-assocr", "data", "parcel_ass", "oc")  # chris
# events = pd.read_csv(directory + "ass_events.csv", header = None, names = names)
# data = pd.read_csv(os.path.join(directory, "ass_events_no_filter.csv"), header = None, names = names)

# GENERIC IMPORT (using files in the repo)
directory = os.path.join("..", "data")  # generic
data = pd.read_csv(os.path.join(directory, "parcel_ass-oc-ass_events_no_filter.csv"), header = None, names = names)
data.sort_values(['user_id', 'epoch_time'], inplace=True)
data.describe()

Unnamed: 0,event_id,user_id,location_id,lon,lat,epoch_time,day_id,seconds_since_monday
count,655917.0,655917.0,655917.0,655917.0,655917.0,655917.0,655917.0,655917.0
mean,6319007.0,792156200.0,241633.977727,-117.878281,33.740011,1441100000.0,114.426144,324717.87948
std,3652273.0,1001908000.0,191189.068282,0.09718,0.104993,5944586.0,68.802527,174493.396071
min,80.0,294.0,9.0,-118.11556,33.38826,1431313000.0,1.0,2.0
25%,3156263.0,54133320.0,61905.0,-117.92294,33.66589,1436230000.0,58.0,162146.0
50%,6301512.0,287673900.0,211459.0,-117.91191,33.7788,1440782000.0,111.0,330765.0
75%,9483557.0,1219463000.0,409421.0,-117.83749,33.81101,1446231000.0,174.0,481287.0
max,12725550.0,4724669000.0,661044.0,-117.44693,33.94595,1452279000.0,244.0,604798.0


# Visits  

Moshe created visits by _replac[ing] tweets
occurring with the same hour and within 50 meters of each
other with a single effective tweet._ 

Note that the `epoch_time` column contains timestamps (not sure the timezone) with a milisecond resolution. We will use that column to create visits in a similar fashion that also incorporates the location of the tweet.

In [3]:
d = 0.05  # 50 m
t = 60**2  # number of s in one hr

def spherical_dist(pos1, r=6371):
    """
    return distance matrix in km
    
    https://stackoverflow.com/questions/19413259/efficient-way-to-calculate-distance-matrix-given-latitude-and-longitude-data-in
    """
    pos2 = np.array(pos1)
    pos1 = np.array(pos1)[:, None]
    pos1 = np.deg2rad(pos1)
    pos2 = np.deg2rad(pos2)
    cos_lat1 = np.cos(pos1[..., 0])
    cos_lat2 = np.cos(pos2[..., 0])
    cos_lat_d = np.cos(pos1[..., 0] - pos2[..., 0])
    cos_lon_d = np.cos(pos1[..., 1] - pos2[..., 1])
    return r * np.arccos(cos_lat_d - cos_lat1 * cos_lat2 * (1 - cos_lon_d))

def temporal_dist(times):
    """matrix of time differences"""
    times = np.array(times)
    return np.abs(np.subtract.outer(times, times))

def invert_dict(d): 
    """reverse a dictionary with list/set values"""
    inverse = dict() 
    for key in d: 
        for item in d[key]:
            inverse[item] = key
    return inverse

def set_visits(df):
    """
    :param df: pd.DataFrame for one user with <lat, lon, epoch_time> columns
    :return: same pd.DataFrame with a visit column containing integer visit id
    """
    # find groups of points that meet both the dist and time criteria for a visit
    distance_match = spherical_dist(df[['lat', 'lon']]) <= d
    time_match = temporal_dist(df['epoch_time']) <= t
    matches = distance_match & time_match
    # groups = [[] for __ in range(len(matches))]
    # for row_i, row in enumerate(matches):
    #     v = set([i for i, is_true in enumerate(row) if is_true])
    #     groups[row_i] = v
    groups = {}
    i = 0
    for row_i, row in enumerate(matches):
        v = list([i for i, is_true in enumerate(row) if is_true])
        if v not in groups.values():
            groups[i] = v
            i += 1
    df['visit'] = invert_dict(groups).values()
    return df

### Apply to the entire data set & save result

In [6]:
pop = data.groupby('user_id').apply(set_visits)
pop_deduped = data.drop_duplicates(['user_id', 'location_id', 'visit'])

pop.to_csv(os.path.join(directory, "parcel_ass-oc-visits-ass_events_no_filter.csv"), index=False)
pop_deduped.to_csv(os.path.join(directory, "parcel_ass-oc-visits-ass_events_deduped.csv"), index=False)


In [7]:
pop.shape

(655917, 9)

In [8]:
pop_deduped.shape

(545697, 9)

## Format & filter for analysis

In [48]:
def sequential_filter_data(df, time_span='month', n_k=30):
    """
    Filter to users with at least n_k events in 2 sequential time_spans
    
    df: pd.DataFrame containing {user_id, day_id } columns 
    time_span: str in {'week', 'month', 'bimonth'}
    n_k: number of events a user must have in each time_span
    """
    t_vals = {
        'week': 7,
        'month': 30,
        'bimonth': 60,
    }
    t = t_vals.get(time_span)
    if t is None:
        return print("Enter a valid time_span... {'week', 'month', 'bimonth'}")

    # reset day id to start at 0
    df["day_id"] = df["day_id"] - df["day_id"].min()
    
    # calculate time_used ids
    df[time_span] = data.day_id // t
    
    # calc nevents in each time_span for each user & filter by min threshold
    counts = df.groupby(["user_id", time_span]).count()["event_id"]
    view = counts[counts >= n_k].reset_index()
    
    # join together to have a row for each user for a single time_span joined 
    # with the prior and next time_span
    lagged_view = pd.concat(
        [view, view.shift(1), view.shift(2)], 
        axis = 1
    )
    lagged_view.columns = list("next_" + view.columns) + \
        list(view.columns) + list("last_" + view.columns)
    
    # filter rows so that they only contain pairs where there is a sequential time_span present
    right_users = (lagged_view["user_id"] == lagged_view["next_user_id"]) | \
        (lagged_view["user_id"] == lagged_view["last_user_id"])
    sequential = ((lagged_view["next_"+time_span] - lagged_view[time_span]) == 1) | \
        ((lagged_view[time_span] - lagged_view["last_"+time_span]) == 1)
    filtered = lagged_view[right_users & sequential][["user_id", time_span, "event_id"]].reset_index(drop=True)
    
    # gather ids and valid weeks for the users that have survived the filtering
    user_time_dict = {-1 : []}
    for i, x in filtered.iterrows():
        user_id = x["user_id"]
        time = x[time_span]
        if user_id not in user_time_dict:
            user_time_dict[user_id] = []
        user_time_dict[user_id].append(time)
        
    # reduce the original data to be only valid events
    out = pd.merge(left=filtered, right=df, on=["user_id", time_span], how="left")    
    
    return out, user_time_dict


def create_mpp(df: pd.DataFrame, user_dict: dict):
    """create a dataframe contianing the data to be analyzed"""
    # get the user_ids that met the criteria in the first 2 months
    keep = []
    for uid, months in user_dict.items():
        if all(x in months for x in [0., 1.]):
            keep.append(uid)

    # create the mpp data frame
    mpp = df.loc[(df['user_id'].isin(keep)) & (df['month'].isin({0., 1.})),].reset_index()
    mpp['m'] = np.where(mpp['month'] == 0., 'a', 'b')
    
    # remap user id for easier handling
    user_dict = {}
    index = 1
    for ident in mpp.user_id.unique():
        if ident not in user_dict:
            user_dict[ident] = index
            index += 1

    mpp["new_user_id"] = mpp.user_id.apply(lambda x: user_dict[x])
    
    return mpp[['user_id', 'new_user_id', 'm', 'lon', 'lat', 'location_id']].rename(index=str, columns={"user_id": "old_uid", "new_user_id": "uid"}), keep


In [49]:
time_span = 'month'
n_visits = 20

filtered_visits, filtered_users = sequential_filter_data(
    pop_deduped, time_span=time_span, n_k=n_visits
)
print("FILTERED VISITS")
print("Users: ", filtered_visits['user_id'].nunique())
print("Visits:", len(filtered_visits))

FILTERED VISITS
Users:  544
Visits: 123781


In [50]:
mpp, keep = create_mpp(filtered_visits, filtered_users)
mpp.to_csv(os.path.join(directory, "mpp_visits_month0a_month1b_n20.csv"), index=False)
print("MPP DATA (at least {} visits in each of the first two {}s)".format(n_visits, time_span))
print("Users: ", mpp['uid'].nunique())
print("Visits:", len(mpp))


MPP DATA (at least 20 visits in each of the first two months)
Users:  223
Visits: 28052


In [51]:
n_users = mpp['uid'].nunique()
print("Number of events")
print("----------------")
print("Month 1: ", sum(mpp.m == 'a'), "({} per user)".format(round(sum(mpp.m == 'a')/n_users, 1)))
print("Month 2: ", sum(mpp.m == 'b'), "({} per user)".format(round(sum(mpp.m == 'b')/n_users, 1)))


Number of events
----------------
Month 1:  14654 (65.7 per user)
Month 2:  13398 (60.1 per user)


In [52]:
mpp.head()

Unnamed: 0,old_uid,uid,m,lon,lat,location_id
0,2554741.0,1,a,-117.83335,33.74851,546927.0
1,2554741.0,1,a,-117.67779,33.47102,517182.0
2,2554741.0,1,a,-117.83513,33.76453,543461.0
3,2554741.0,1,a,-117.70361,33.46636,130291.0
4,2554741.0,1,a,-117.64253,33.44296,500205.0


In [53]:
mpp.describe()

Unnamed: 0,old_uid,uid,lon,lat,location_id
count,28052.0,28052.0,28052.0,28052.0,28052.0
mean,1179048000.0,128.918473,-117.866609,33.729012,295309.355483
std,1112455000.0,65.778755,0.104304,0.103459,182011.081528
min,2554741.0,1.0,-118.11096,33.40559,906.0
25%,137553400.0,72.0,-117.92895,33.64697,111024.0
50%,748959900.0,136.0,-117.89021,33.72636,296304.0
75%,2377573000.0,191.0,-117.79469,33.81205,435556.0
max,3167795000.0,223.0,-117.56081,33.94497,661032.0


# CMP Computation

In [55]:
def get_sample_space(df, user_id):
    # user's location set is all unique parcels visited regardless of month
    loc_set = df.loc[mpp['uid'] == user_id]['location_id'].unique()
    
    # sample space is any other users' data in either month
    samp_sp = df.loc[(df.uid != user_id)]

    # loop over users in sample space counting number of overlapped parcels
    matches = []
    for u in samp_sp.uid.unique():
        tmp = samp_sp.loc[samp_sp.uid == u]
        shared_locs = np.intersect1d(
            loc_set,
            tmp['location_id'].unique()
        )
        if len(shared_locs) > 0:
            matches.append(
                {
                    'uid': u,
                    'n_matches': len(shared_locs),
                    'n_events': len(tmp)
                }
            )
    matches = pd.DataFrame(matches)

    # compute the weight for each matching user's points
    tot_matches = sum(matches['n_matches'])
    matches['w'] = matches['n_matches'] / (tot_matches  * matches['n_events'])
    matches.drop(['n_events', 'n_matches'], axis=1, inplace=True)

    # limit the sample space
    samp_sp = pd.merge(samp_sp, matches, on='uid')
    print('USER', user_id)
    print('Number of unique locations:', len(loc_set))
    print('Number of matched users:', samp_sp.uid.nunique())
    print('Number of sample points:', len(samp_sp))
    print('Sum of weights:', round(sum(samp_sp.w), 2))
    
    return samp_sp

samp_sp = get_sample_space(mpp, 1)

USER 1
Number of unique locations: 31
Number of matched users: 11
Number of sample points: 1253
Sum of weights: 1.0
