### Let's import all the useful libraries in the world. So we don't get stuck with an ugly error later on!

In [3]:
import pandas as pd
from geopy.distance import vincenty
import geohash
import numpy as np
import csv
import operator
import collections
import datetime
from datetime import datetime
from collections import defaultdict

### With that out of the way, the next step is to read data from the masterfile! 
(Which has latitude, longitude, PersonID, altitude, elevation -- all the data that we have collected).

We add a column 'ts' with the date-time of the row.

In [4]:
df = pd.read_csv('masterfile.csv')
df['ts'] = pd.to_datetime(df.time)

### Some pre-processing to do: 
* Sort all the ID's in place. so all the rows with the same ID's occur together. GroupBy would also work?
* How many days do we have data for Person x? [Create Dictionary of Persons mapped with Number of Days of Data collected]
* Round off all the timestamps to 5min intervals

In [5]:
df.sort(columns=['ID'], inplace=True)
names=df['ID'].unique().tolist()

people = {}
for n in names:
    select = df.loc[df['ID']==n]
    people[n] = len(select['Day'].unique())

  if __name__ == '__main__':


In [6]:
def round_to_5min(t):
    delta = pd.datetools.timedelta(minutes=t.minute%5, 
                               seconds=t.second, 
                               microseconds=t.microsecond)
    t -= delta
    if delta >= pd.datetools.timedelta(minutes=2.5):
        t += pd.datetools.timedelta(minutes=5)
    return t

In [7]:
df['ts10'] = df['ts'].apply(lambda x: round_to_5min(x))

### Now let's delete all those Persons who have less than 5 days of data. 

We don't need you!

In [8]:
drop_ids = [p for p in people if people[p] <= 5]
df = df.loc[~(df['ID'].isin(drop_ids))]

### Let's begin with the real deal.
* def midpoint(p1,p2): calculates midpoints between p1:(lat,lon) and p2:(lat,lon). outputs p3:(lat,lon).
* def cij(piv,i,j,thrs=30): calculates the distance between person i and j for every datapoint. If the distance is <30m, it is counted as an encounter

In [9]:
def midpoint(p1, p2):
    lat1,lon1 = p1 # JFK
    lat2,lon2 =  p2# SIN

    # Compute path from 1 to 2
    g = Geodesic.WGS84.Inverse(lat1, lon1, lat2, lon2);

    # Compute midpoint starting at 1
    h1 = Geodesic.WGS84.Direct(lat1, lon1, g['azi1'], g['s12']/2);
    # Alternatively, compute midpoint starting at 2
    h2 = Geodesic.WGS84.Direct(lat2, lon2, g['azi2'], -g['s12']/2);
    return (h2['lat2'],h2['lon2'])

In [10]:
def cij(piv, i,j, thrs=30):
    '''
    Generate the cij i and j
    '''
    #encounters
    i_lat = piv['latitude'][i]
    i_long = piv['longitude'][i]
    j_lat = piv['latitude'][j]
    j_long = piv['longitude'][j]
    i_point = zip(i_lat, i_long)
    j_point = zip(j_lat, j_long)
    ## I am going to hell for this long comprehension, should the mean accuracy also be taken care of
    ## next iteration.
    dist = [vincenty(i_point[i], j_point[i]).meters  if pd.notnull(i_point[i][0]) and pd.notnull(j_point[i][0]) else 9999 for i in range(0, len(i_point))]
    encounters = [1 if x<=thrs else 0 for x in dist]
    return encounters

In [11]:
trial = df[['ts10','ID','lat','lon','accuracy']].drop_duplicates(subset=['ts10','ID'])

In [12]:
piv = pd.pivot_table(trial, index='ts10', columns='ID' ,values = ['lat','lon','accuracy'])

In [13]:
piv = piv.asfreq('5Min', method=None)

In [14]:
piv.groupby(pd.TimeGrouper('D')).count().tail()

Unnamed: 0_level_0,lat,lat,lat,lat,lat,lat,lat,lat,lat,lat,...,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy
ID,1rishabhtrivedi,adithyapsv,anshumanagrwl,anuj7chauhan,arch.b80,arkg1996,bhagyeshvikani,bhardwaj.rish,coolsush89,desai.deshna,...,rudra.chandak,sacheendra.t,sagarparikh31,samriddhisimlai,saumyadoshi,shaleen.k.gupta,tany.dudett,umangjparmar,vaibhav29.07.97,vraj2026
ts10,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2016-04-30,288,181,0,189,4,0,288,288,253,0,...,285,0,1,0,282,0,286,231,143,240
2016-05-01,253,285,0,237,16,2,288,277,288,163,...,287,0,0,0,229,0,242,208,254,146
2016-05-02,0,208,0,208,22,0,224,35,288,288,...,280,0,0,0,207,0,48,246,114,149
2016-05-03,0,277,0,189,26,2,288,288,202,288,...,280,0,0,0,275,0,272,232,0,210
2016-05-04,0,205,0,250,2,0,277,206,284,277,...,282,0,0,0,167,0,235,119,0,33


In [15]:
import itertools
ts = piv.index.tolist()

m = [list(g) for k, g in itertools.groupby(ts, key=lambda d: d.date())]
dates = list(set(map(lambda t: t.date(),ts)))

In [34]:
def encounter_data(piv, i,j):
    '''
    Generates a global list of all encounters that have happened in the dataset.
    Information need about each encounter as of now
     - timestamp
     - middle point and consequently the geohash
     - the pair
    
    '''
    #global tot_timestamps,tot_lats,tot_longs,tot_geohash,tot_pairs, tot_encounters
    original_i = i
    original_j = j
    
    #encounters
    i_lat = piv['lat'][i]
    i_long = piv['lon'][i]
    j_lat = piv['lat'][j]
    j_long = piv['lon'][j]
    #i_point = zip(i_lat, i_long)
    #j_point = zip(j_lat, j_long)
    day_enc = defaultdict(list)
    #lists = m[10]
    #print i_lat.ix[lists]
    #check if using sections like this (i_lat.ix[lists]) produces speedup
    
    # Might improve performance to pivot by index (Day, Timestamp)
    for i in range(0,len(m)):
        num_datapts = [1 if pd.notnull(i_lat.ix[k]) and pd.notnull(j_lat.ix[k]) else 0 for k in m[i]]
        #m[i] = ith day.
        #BUG: 30% of total data points in a day? Then day is not discarded.
        if sum(num_datapts)>=60:
            # probably faster if indexing points is done as a separate step.
            # check if zip is needed.
            dist = [vincenty((i_lat.ix[k],i_long.ix[k]), (j_lat.ix[k],j_long.ix[k])).meters  if pd.notnull(i_lat.ix[k]) and pd.notnull(j_lat.ix[k]) else 9999 for k in m[i]]
            encounters = 0
            for x in dist:
                if x<=30:
                    encounters += 1
            day_enc[m[i][0]] = encounters
    #stop = datetime.now()
    #print "time taken to calculate encounters for ",original_i,"and ",original_j," is ",(stop-start)
        
    total_encounters = 0
    num_days = 0
    for k,v in day_enc.iteritems():
        total_encounters = total_encounters + v
        num_days += 1
        
    if num_days!=0:
        mean = float(total_encounters)/float(num_days)
    else:
        mean = 0
    median = np.median(day_enc.values())
        
    with open('encounter_master_thrsh_30_new.csv','ab') as f:
        writer = csv.writer(f)
        writer.writerow([original_i,original_j,median,mean,total_encounters,num_days])
    
    return

In [35]:
import itertools
count = 0
distance = pd.DataFrame()
temp = ''
l = []
keys = df.ID.unique()

with open('encounter_master_thrsh_30_new.csv','wb') as f:
    writer = csv.writer(f)
    writer.writerow(['Person_1','Person_2','Median','Mean','Total_encounters','Num_days'])
    
for subset in itertools.combinations(keys, 2):
    i = subset[0]
    j = subset[1]
    if temp != i:
        temp = i
    start = datetime.now()
    encounter_data(piv, i, j)
    stop = datetime.now()
    print "time taken to calculate encounters for ",i,"and ",j," is ",(stop-start)
        
print "Encounter Calculations Complete."

time taken to calculate encounters for  1rishabhtrivedi and  adithyapsv  is  0:00:01.406872
time taken to calculate encounters for  1rishabhtrivedi and  anshumanagrwl  is  0:00:01.068747
time taken to calculate encounters for  1rishabhtrivedi and  anuj7chauhan  is  0:00:01.507435
time taken to calculate encounters for  1rishabhtrivedi and  arch.b80  is  0:00:00.752619
time taken to calculate encounters for  1rishabhtrivedi and  arkg1996  is  0:00:00.868063
time taken to calculate encounters for  1rishabhtrivedi and  bhagyeshvikani  is  0:00:01.532095
time taken to calculate encounters for  1rishabhtrivedi and  bhardwaj.rish  is  0:00:01.000812
time taken to calculate encounters for  1rishabhtrivedi and  coolsush89  is  0:00:01.405305
time taken to calculate encounters for  1rishabhtrivedi and  desai.deshna  is  0:00:01.144305
time taken to calculate encounters for  1rishabhtrivedi and  diptanshujain  is  0:00:01.023224
time taken to calculate encounters for  1rishabhtrivedi and  drasht

