In [None]:
import pandas as pd
import os
import glob
import numpy as np
import mobileDataToolkit.analysis as analysis
from numba import cuda
import matplotlib.pyplot as plt
import seaborn as sns

### To-Do: Parallelize this with CUDA to cut days off the processing time

Further To-Do: produce the following graphs from the data:
* Temporal daily distribution of observations (Fig 6)
* Time interval distribution between two consecutive observations (Fig 3)
* Cumulative distribution of location accuracy (Fig 5)

In [None]:
# Load every per-user CSV once and pool the series needed by the plots below:
# consecutive-observation gaps (`diff`), location accuracy (`loc_acc`) and the
# timestamp/weekday table (`temp_distribution`).
folderpath="C:/Users/ekino/UW/cis_bigdata2021 - Seattle_2000/Seattle_2000_in_2020/Seattle_2000/"
os.chdir(folderpath)
cnt=0

# The last cell of this notebook writes these summary files with relative paths,
# i.e. into this very folder (the working directory after os.chdir). Skip them so
# that re-running the notebook does not try to parse them as user files.
summary_csvs = {"oneweek.csv", "oneday.csv", "sixhours.csv", "onehour.csv"}
csv_files = [name for name in glob.glob("*.csv") if name not in summary_csvs]
all_file_num=len(csv_files)

# Gap thresholds in seconds: one week, one day, six hours, one hour
binlens = [10080*60, 1440*60, 360*60, 60*60]
oneweek = np.array([])
oneday = np.array([])
sixhours = np.array([])
onehour = np.array([])
diff = np.array([])
loc_acc = np.array([])

# Create dataframe to store temporal distribution
temp_distribution = pd.DataFrame(columns=['timestamp', 'unix_min', 'day'])

# Collect per-file pieces in lists and join them once after the loop; appending
# to an array / DataFrame on every iteration copies everything gathered so far.
diff_chunks = []
loc_acc_chunks = []
temporal_frames = []

for file in csv_files:
    Suball = pd.read_csv(folderpath+file,header=0)
    Suball['timestamp'] = pd.to_datetime(Suball['timestamp'])
    # Despite the column name this holds whole seconds: the int64 value is divided
    # by 10**9 (assumes the datetime column has nanosecond resolution -- TODO confirm)
    Suball['unix_min'] = Suball['timestamp'].astype(np.int64) // 10**9
    Suball = Suball.sort_values(by=['unix_min'])

    # Obtain difference between consecutive timestamps (first row of each file is NaN)
    Suball['diff'] = Suball['unix_min'].diff()
    diff_chunks.append(Suball['diff'].to_numpy())

    # Day of the week, in words
    Suball['day'] = Suball['timestamp'].dt.day_name()

    # Keep only the columns needed for the temporal distribution
    temporal_frames.append(Suball[['timestamp', 'unix_min', 'day']])

    # Location accuracy
    loc_acc_chunks.append(Suball['precision'].to_numpy())

# Prepending the empty float arrays keeps the float dtype of the original
# np.append-based accumulation and also covers the "no files found" case.
diff = np.concatenate([diff, *diff_chunks])
loc_acc = np.concatenate([loc_acc, *loc_acc_chunks])

# Concatenate the real frames only: joining onto the empty placeholder frame is
# deprecated in recent pandas and can turn 'timestamp' into object dtype, which
# would break the `.dt` accessor used in the next cell.
if temporal_frames:
    temp_distribution = pd.concat(temporal_frames)

In [None]:
# Derive time-of-day features and count observations per (10-minute bin, weekday)
stamps = temp_distribution['timestamp']

# Hour of the day
temp_distribution['hour'] = stamps.dt.hour

# Index of the 10-minute slot within the day: six slots per hour, so 0..143
temp_distribution['10_min_bin'] = temp_distribution['hour'] * 6 + stamps.dt.minute // 10

# Number of observations for every (10-minute bin, weekday) pair
temp_distribution_grouped = (
    temp_distribution
    .groupby(['10_min_bin', 'day'])
    .size()
    .reset_index(name='count')
)

# Turn 'day' into an ordered categorical so weekdays sort Monday -> Sunday
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
temp_distribution_grouped['day'] = pd.Categorical(
    temp_distribution_grouped['day'],
    categories=weekday_order,
    ordered=True,
)

In [None]:
# Scatter of observation counts per 10-minute bin, coloured by weekday (10x5 inch figure)
fig, ax = plt.subplots(figsize=(10, 5))
# Small markers (s=9) keep neighbouring points from overlapping
sns.scatterplot(data=temp_distribution_grouped, x='10_min_bin', y='count', hue='day', s=9, ax=ax)
# One tick every 12 bins, i.e. every two hours, labelled with the hour of day
ax.set_xticks(np.arange(0, 144, 12))
ax.set_xticklabels(np.arange(0, 24, 2))
ax.set_xlabel('Hour')
ax.set_ylabel('Number of observations')
# Legend without a title
ax.legend(title=None)
plt.show()

In [None]:
# Same data as a line plot: one line per weekday, distinguished by colour and dash pattern
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(
    data=temp_distribution_grouped,
    x='10_min_bin',
    y='count',
    hue='day',
    style='day',
    dashes=True,
    ax=ax,
)
# One tick every 12 bins, i.e. every two hours, labelled with the hour of day
ax.set_xticks(np.arange(0, 144, 12))
ax.set_xticklabels(np.arange(0, 24, 2))
ax.set_xlabel('Hour')
ax.set_ylabel('Number of observations')
ax.legend(title=None)
plt.show()

Location Accuracy

In [None]:
# Clean the pooled location-accuracy values and build the inputs for the CDF plot.
# Note: `loc_acc` is overwritten here, so running this cell again trims the
# already-trimmed sample a second time.

# Keep only the non-NaN readings
loc_acc = loc_acc[np.logical_not(np.isnan(loc_acc))]

# Trim both tails: keep values strictly between the 1st and the 99th percentile
lower_cut, upper_cut = np.percentile(loc_acc, [1, 99])
loc_acc = loc_acc[(loc_acc > lower_cut) & (loc_acc < upper_cut)]

# Sorted values form the x-axis of the empirical CDF
loc_acc_sorted = np.sort(loc_acc)

# Matching cumulative probabilities, evenly spaced from 0 to 1
n_obs = len(loc_acc_sorted)
p = 1. * np.arange(n_obs) / (n_obs - 1)

# Running sum of the sorted values as a share of their total
loc_acc_cum = np.cumsum(loc_acc_sorted) / np.sum(loc_acc_sorted)

In [None]:
# Empirical CDF of location accuracy
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(loc_acc_sorted, p)
ax.set_ylabel('Percentile (%)')
# Label the 0..1 probabilities as 0..100 in steps of 10
ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.set_yticklabels(np.arange(0, 110, 10))
# Restrict the x-axis to the 0-400 range
ax.set_xlim(0, 400)
ax.set_xlabel('Location accuracy (m)')
ax.grid()
plt.show()

In [None]:
# Print selected percentiles of the (trimmed) location-accuracy sample.
# The label is generated from the percentile that is actually computed, so the
# text can no longer drift away from the number: this cell used to print "81st"
# next to np.percentile(loc_acc, 82). The computed value (82) is kept.
def _ordinal(n):
    """Return ``n`` with its English ordinal suffix, e.g. 25 -> '25th', 82 -> '82nd'."""
    if 10 <= n % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return str(n) + suffix


for q in (25, 50, 75, 82, 95, 98, 99):
    print(_ordinal(q) + " percentile of location accuracy: ", np.percentile(loc_acc, q))

In [None]:
# Summary statistics of the pooled gaps between consecutive observations,
# rendered with three fixed decimals instead of scientific notation
diff_summary = pd.DataFrame(diff).describe()
print("Describe diff: ", diff_summary.to_string(float_format='{:.3f}'.format))

In [None]:
# Trim the pooled gaps before plotting. `diff` is overwritten at every step, so
# running this cell again removes further values.

# Remove the NaN entries
diff = diff[np.logical_not(np.isnan(diff))]

# Remove every entry equal to the largest gap
largest_gap = diff.max()
diff = diff[diff != largest_gap]

# Keep gaps strictly below the 99th percentile (this removes roughly the top 1%)
upper_cut = np.percentile(diff, 99)
diff = diff[diff < upper_cut]

In [None]:
# Normalised histogram of the gaps on a logarithmic x-axis
# NOTE(review): density=True normalises the bars to a probability density, which
# is not the same as the percentage named in the y-label -- confirm which is intended.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(diff, bins=10000, density=True)
ax.set_xscale('log')
ax.set_xlabel('Difference between consecutive timestamps (seconds)')
ax.set_ylabel('Percent of observations')
plt.show()

In [None]:
# Raw-count histogram of the gaps on a linear axis
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(diff, bins=1000)
ax.set_xlabel('Difference between consecutive timestamps (seconds)')
ax.set_ylabel('Number of observations')
# Show only the range up to 1500 seconds
ax.set_xlim(-10, 1500)
plt.show()

In [None]:
# Single-entry function: the per-file processing steps of the CSV loop, applied to one frame.
# NOTE(review): this is registered as a Numba CUDA device function, yet the body is
# pandas/NumPy host code -- presumably it cannot be compiled for the GPU; confirm
# against the Numba CUDA documentation before relying on it.
@cuda.jit(device=True)
def s(Suball, i, 
      oneweek = np.array([]), 
	  oneday = np.array([]), 
	  sixhours = np.array([]), 
	  onehour = np.array([]), 
	  diff = np.array([]), 
	  loc_acc = np.array([]), 
	  temp_distribution = pd.DataFrame(columns=['timestamp', 'unix_min', 'day'])):	
	"""Derive timestamp gaps for one frame and count the gaps larger than ``i``.

	Parameters:
		Suball: table indexed by column label; the 'timestamp' and 'precision'
			columns are read. The 'timestamp' and 'unix_min' assignments are
			made on the object passed in, before it is re-bound to a sorted copy.
		i: threshold compared against the 'diff' column; only the values
			10080*60, 1440*60, 360*60 and 60*60 select an accumulator.
		oneweek, oneday, sixhours, onehour, diff, loc_acc, temp_distribution:
			accumulators to extend. The defaults are created once, when the
			function is defined.

	Returns:
		Nothing -- there is no return statement.

	NOTE(review): every ``np.append`` / ``pd.concat`` result is bound to a local
	name only, so the caller never sees the extended accumulators.
	"""
	Suball['timestamp'] = pd.to_datetime(Suball['timestamp'])
	# int64 value divided by 10**9: whole seconds if the datetimes are stored in
	# nanoseconds (assumption -- TODO confirm), despite the '_min' column name
	Suball['unix_min'] = Suball['timestamp'].astype(np.int64) // 10**9
	Suball = Suball.sort_values(by=['unix_min'])

	# Obtain difference between consecutive timestamps
	Suball['diff'] = Suball['unix_min'].diff()
	diff = np.append(diff, Suball['diff'])

	# Day of the week, in words
	Suball['day'] = Suball['timestamp'].dt.day_name()

	# Add to dataframe the relevant columns
	temp_distribution = pd.concat((temp_distribution, Suball[['timestamp', 'unix_min', 'day']]))

	# Add location accuracy
	loc_acc = np.append(loc_acc, Suball['precision'])

	# Count number of rows with 'diff' > i
	cnt = Suball[Suball['diff'] > i].shape[0]

	# Route the count to the accumulator that matches the threshold
	if i == 10080*60:
		oneweek = np.append(oneweek, cnt)
	elif i == 1440*60:
		oneday = np.append(oneday, cnt)
	elif i == 360*60:
		sixhours = np.append(sixhours, cnt)
	elif i == 60*60:
		onehour = np.append(onehour, cnt)

# Kernel wrapper around `s`: one thread per element of `d_x`.
@cuda.jit
def s_kernel(d_x, i):
	"""Overwrite each element ``d_x[j]`` with ``s(d_x[j], i)``.

	Parameters:
		d_x: array whose first dimension is iterated over; modified in place.
		i: forwarded unchanged to ``s`` as its second argument.

	NOTE(review): ``s`` as defined in this notebook has no return statement, so
	it is unclear what value is meant to be stored -- confirm the intended design.
	"""
	n = d_x.shape[0]
	# Index of this thread within the 1-D launch grid
	j = cuda.grid(1)
	# Threads whose index falls past the end of the array do nothing
	if j < n:
		d_x[j] = s(d_x[j], i)
	
def sArray(x):
	"""Copy ``x`` to the device, launch ``s_kernel`` and return a host copy of the output buffer.

	Parameters:
		x: array-like with a ``shape`` attribute; ``x.shape[0]`` sets the launch size.

	Returns:
		Host copy of ``d_f``, the device array allocated with the layout of ``d_x``.

	NOTE(review): ``TBP`` is not defined in any cell visible here (possibly a
	typo for a threads-per-block constant) -- confirm where it comes from.
	NOTE(review): the launch passes ``(d_f, d_x)`` while ``s_kernel`` is declared
	as ``(d_x, i)``, so the output buffer lands in the kernel's ``d_x`` slot and
	the input array in its ``i`` slot -- confirm the intended arguments.
	"""
	n = x.shape[0]
	d_x = cuda.to_device(x) # d_ means the device side copy of the array
	d_f = cuda.device_array_like(d_x) # allocate device array for f
	blockDims = TBP
	# Ceiling division: enough blocks to cover all n elements
	gridDims = (n + blockDims - 1) // blockDims
	s_kernel[gridDims, blockDims](d_f, d_x)
	return d_f.copy_to_host() # copy result back to host


In [None]:
# For every user file, count the gaps between consecutive observations that
# exceed each threshold in `binlens` (one count per file and threshold).
folderpath="C:/Users/ekino/UW/cis_bigdata2021 - Seattle_2000/Seattle_2000_in_2020/Seattle_2000/"
os.chdir(folderpath)
cnt=0

# The last cell of this notebook writes these summary files with relative paths,
# i.e. into this very folder (the working directory after os.chdir). Skip them so
# that re-running the notebook does not try to parse them as user files.
summary_csvs = {"oneweek.csv", "oneday.csv", "sixhours.csv", "onehour.csv"}
csv_files = [name for name in glob.glob("*.csv") if name not in summary_csvs]
all_file_num=len(csv_files)

# Gap thresholds in seconds: one week, one day, six hours, one hour
binlens = [10080*60, 1440*60, 360*60, 60*60]

# These names are reset exactly as before; they are not used in this cell
diff = np.array([])
loc_acc = np.array([])

# Create dataframe to store temporal distribution
temp_distribution = pd.DataFrame(columns=['timestamp', 'unix_min', 'day'])

# One list of per-file counts for each threshold
gap_counts = {threshold: [] for threshold in binlens}

# Read and prepare each file once, then evaluate all thresholds on it; reading
# the files inside a loop over the thresholds parses every CSV four times.
for file in csv_files:
    Suball = pd.read_csv(folderpath+file,header=0)
    Suball['timestamp'] = pd.to_datetime(Suball['timestamp'])
    # Despite the column name this holds whole seconds: the int64 value is divided
    # by 10**9 (assumes the datetime column has nanosecond resolution -- TODO confirm)
    Suball['unix_min'] = Suball['timestamp'].astype(np.int64) // 10**9
    Suball = Suball.sort_values(by=['unix_min'])

    # Obtain difference between consecutive timestamps
    Suball['diff'] = Suball['unix_min'].diff()

    for i in binlens:
        # Count number of rows with 'diff' > i
        cnt = Suball[Suball['diff'] > i].shape[0]
        gap_counts[i].append(cnt)

# Float arrays, in file order, matching what appending to an empty np.array produced
oneweek = np.array(gap_counts[10080*60], dtype=float)
oneday = np.array(gap_counts[1440*60], dtype=float)
sixhours = np.array(gap_counts[360*60], dtype=float)
onehour = np.array(gap_counts[60*60], dtype=float)

In [None]:
# Histograms of the per-user counts of missing periods.
# (`plt` comes from the import cell at the top of the notebook.)
# Each entry: (counts, label, opacity used in the overlay figures)
missing_period_series = [
    (oneweek, '1 week', 0.9),
    (oneday, '1 day', 0.6),
    (sixhours, '6 hours', 0.5),
    (onehour, '1 hour', 0.4),
]

# One stand-alone histogram per threshold
for counts, label, _ in missing_period_series:
    plt.hist(counts, bins=20)
    plt.title(label)
    plt.show()

# All four thresholds overlaid, colours C0..C3 in list order
for idx, (counts, label, opacity) in enumerate(missing_period_series):
    plt.hist(counts, bins=20, alpha=opacity, label=label, color='C%d' % idx)
# Axis titles
plt.xlabel('Number of missing periods per user')
plt.ylabel('Frequency')
plt.legend(loc='upper right')
# Cap the y-axis at 600
plt.ylim(0, 600)
plt.show()

# Only the 1-week and 1-day series, with the same colours as above
for idx, (counts, label, opacity) in enumerate(missing_period_series[:2]):
    plt.hist(counts, bins=20, alpha=opacity, label=label, color='C%d' % idx)
plt.legend(loc='upper right')
# Cap the y-axis at 600
plt.ylim(0, 600)
plt.show()


In [None]:
# Share of entries (one per user file) with a non-zero count of gaps above each threshold.
# The printed number is a fraction between 0 and 1; it is not multiplied by 100.
for counts, span in (
    (oneweek, "one week"),
    (oneday, "one day"),
    (sixhours, "six hours"),
    (onehour, "one hour"),
):
    print(
        "Percent of users that have at least " + span + " of missingness ",
        np.count_nonzero(counts) / len(counts),
    )


In [None]:
# Share of entries (one per user file) whose count of gaps above each threshold
# is greater than one, i.e. users with more than one such missing period.
# The messages previously repeated the "at least one ..." wording of the cell
# above although the condition here is `> 1`.
# The printed number is a fraction between 0 and 1; it is not multiplied by 100.
for counts, span in (
    (oneweek, "one week"),
    (oneday, "one day"),
    (sixhours, "six hours"),
    (onehour, "one hour"),
):
    print(
        "Percent of users that have more than one missing period longer than " + span + " ",
        len(counts[counts > 1]) / len(counts),
    )


In [None]:
# Show the current working directory, which is where the relative paths in the
# np.savetxt cell below are written. Plain Python is used instead of the bare
# `pwd` automagic so the cell also works when it holds more than one line or
# when the notebook is exported to a script.
os.getcwd()

In [None]:
# Write each array of per-user counts to its own CSV file; the relative file
# names resolve against the current working directory
for filename, counts in (
    ("oneweek.csv", oneweek),
    ("oneday.csv", oneday),
    ("sixhours.csv", sixhours),
    ("onehour.csv", onehour),
):
    np.savetxt(filename, counts, delimiter=",")