# Creative Extension Analysis

## Tools

In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from numba import njit
import itertools


import geopandas as gpd
from geopy import distance
from shapely.geometry import Point
from haversine import haversine

## Collect and Work on Data

In [2]:
g_b = nx.read_edgelist("data/Brightkite_edges.txt",create_using=nx.Graph(), nodetype = int)
g_g = nx.read_edgelist("data/Gowalla_edges.txt",create_using=nx.Graph(), nodetype = int)

In [3]:
print("Brightkite network has {} nodes and {} edges".format(len(g_b.nodes()), len(g_b.edges())))
print("Gowalla network has {} nodes and {} edges".format(len(g_g.nodes()), len(g_g.edges())))

Brightkite network has 58228 nodes and 214078 edges
Gowalla network has 196591 nodes and 950327 edges


In [4]:
checkins_b = pd.read_csv("data/Brightkite_totalCheckins.txt", delimiter='\t', names=["user", "checkin_time", "latitude", "longitude", "location_id"])
checkins_g = pd.read_csv("data/Gowalla_totalCheckins.txt", delimiter='\t', names=["user", "checkin_time", "latitude", "longitude", "location_id"])

In [5]:
checkins_b.head()

Unnamed: 0,user,checkin_time,latitude,longitude,location_id
0,0,2010-10-17T01:48:53Z,39.747652,-104.99251,88c46bf20db295831bd2d1718ad7e6f5
1,0,2010-10-16T06:02:04Z,39.891383,-105.070814,7a0f88982aa015062b95e3b4843f9ca2
2,0,2010-10-16T03:48:54Z,39.891077,-105.068532,dd7cd3d264c2d063832db506fba8bf79
3,0,2010-10-14T18:25:51Z,39.750469,-104.999073,9848afcc62e500a01cf6fbf24b797732f8963683
4,0,2010-10-14T00:21:47Z,39.752713,-104.996337,2ef143e12038c870038df53e0478cefc


In [6]:
print("Brightkite checkins dataset have {} rows".format(len(checkins_b)))
print("Gowalla checkins dataset have {} rows".format(len(checkins_g)))

Brightkite checkins dataset have 4747287 rows
Gowalla checkins dataset have 6442892 rows


In [7]:
print(checkins_b.isna().sum())
print()
print(checkins_g.isna().sum())

user            0
checkin_time    6
latitude        6
longitude       6
location_id     6
dtype: int64

user            0
checkin_time    0
latitude        0
longitude       0
location_id     0
dtype: int64


In [8]:
checkins_b = checkins_b.dropna()

In [9]:
checkins_b["checkin_time"] = pd.to_datetime(checkins_b["checkin_time"], format = "%Y-%m-%dT%H:%M:%SZ")
checkins_g["checkin_time"] = pd.to_datetime(checkins_g["checkin_time"], format = "%Y-%m-%dT%H:%M:%SZ")

In [10]:
print(len(checkins_b[((checkins_b["longitude"] < -180) | (checkins_b["longitude"] > 180)) | ((checkins_b["latitude"] < -90) | (checkins_b["latitude"] > 90))]))
print(len(checkins_g[((checkins_g["longitude"] < -180) | (checkins_g["longitude"] > 180)) | ((checkins_g["latitude"] < -90) | (checkins_g["latitude"] > 90))]))

109
29


In [11]:
to_remove_b = checkins_b[((checkins_b["longitude"] < -180) | (checkins_b["longitude"] > 180)) | ((checkins_b["latitude"] < -90) | (checkins_b["latitude"] > 90))]
to_remove_g = checkins_g[((checkins_g["longitude"] < -180) | (checkins_g["longitude"] > 180)) | ((checkins_g["latitude"] < -90) | (checkins_g["latitude"] > 90))]

checkins_b = checkins_b.drop(to_remove_b.index)
checkins_g = checkins_g.drop(to_remove_g.index)

In [12]:
checkins_b.head()

Unnamed: 0,user,checkin_time,latitude,longitude,location_id
0,0,2010-10-17 01:48:53,39.747652,-104.99251,88c46bf20db295831bd2d1718ad7e6f5
1,0,2010-10-16 06:02:04,39.891383,-105.070814,7a0f88982aa015062b95e3b4843f9ca2
2,0,2010-10-16 03:48:54,39.891077,-105.068532,dd7cd3d264c2d063832db506fba8bf79
3,0,2010-10-14 18:25:51,39.750469,-104.999073,9848afcc62e500a01cf6fbf24b797732f8963683
4,0,2010-10-14 00:21:47,39.752713,-104.996337,2ef143e12038c870038df53e0478cefc


In [13]:
checkins_b['month'] = checkins_b['checkin_time'].apply(lambda x: x.month)
checkins_b['day'] = checkins_b['checkin_time'].apply(lambda x: x.day)
checkins_b['hour'] = checkins_b['checkin_time'].apply(lambda x: x.hour)
checkins_b['minute'] = checkins_b['checkin_time'].apply(lambda x: x.minute)

In [14]:
checkins_b = checkins_b.drop(columns=['location_id', 'checkin_time'])

In [15]:
checkins_b.head()

Unnamed: 0,user,latitude,longitude,month,day,hour,minute
0,0,39.747652,-104.99251,10,17,1,48
1,0,39.891383,-105.070814,10,16,6,2
2,0,39.891077,-105.068532,10,16,3,48
3,0,39.750469,-104.999073,10,14,18,25
4,0,39.752713,-104.996337,10,14,0,21


## Exploratory Data Analysis

## Research Questions

### Which countries travel the most long distance by plane?

In [18]:
copy_checkins_b = checkins_b.copy()

In [17]:
"""
Assign a cell_number based on the cantor pairing function and discretization into 25km * 25km cells.
"""
@njit
def assign_cell(lat, lon):
    lon_km = 111.320 * np.cos(np.deg2rad(lat)) * lon
    lat_km = 110.574 * lat
    #assign to intervals using the cantor pair function
    lat_km, lon_km = lat_km // 25, lon_km // 25 #now lat_km and lon_km contains the quotient from the division by 25.
    return (1/2)*(lat_km + lon_km)*(lat_km + lon_km + 1) + lon_km

In [20]:
copy_checkins_b["cell_number"] = assign_cell(copy_checkins_b["latitude"].values, copy_checkins_b["longitude"].values)
copy_checkins_b["cell_number"] = copy_checkins_b["cell_number"].astype('int')

In [23]:
center_cells_b = copy_checkins_b[['latitude', 'longitude', 'cell_number']].groupby('cell_number').mean()
center_cells_b = center_cells_b.reset_index()
center_cells_b.head()

Unnamed: 0,cell_number,latitude,longitude
0,-286,64.545703,-149.087119
1,-284,64.1525,-145.842222
2,-282,63.562699,-142.300615
3,-280,63.934833,-145.788211
4,-279,63.661389,-144.064444


Now we work on the home: Home is now the center of each cell with the most checkins.

In [24]:
user_cell_b = copy_checkins_b.groupby(["user", "cell_number"]).count()
user_cell_b = user_cell_b.drop(columns = ["month", 'day', 'hour', "latitude", "longitude"]) #We don't need them anymore\n",
user_cell_b.columns = ["count"] #rename the column to count
user_cell_b = user_cell_b.sort_values("count").groupby(level=0).tail(1).sort_values('user') #Sort by user to have user 0 first\n",
user_cell_b = user_cell_b.reset_index().drop(columns=["count"]) #Reset the index to break the multiindex and keep only the column cell_number\n",

user_cell_b.head()

Unnamed: 0,user,cell_number
0,0,16660
1,1,34813
2,2,16660
3,3,34813
4,4,51736


Now we can merge the two to have the home for each user as the center of the cell where it has the most checkins.

In [25]:
homes_b = user_cell_b.merge(center_cells_b, on='cell_number')
homes_b.head()

Unnamed: 0,user,cell_number,latitude,longitude
0,0,16660,39.731991,-104.980205
1,2,16660,39.731991,-104.980205
2,8,16660,39.731991,-104.980205
3,12,16660,39.731991,-104.980205
4,13,16660,39.731991,-104.980205


Now for each home, we want to retrieve the country based on its coordinates.

In [26]:
import reverse_geocoder

In [27]:
homes_countries_b = homes_b.copy()

In [34]:
def retrieve_country(lat, lon):
    return reverse_geocoder.search((lat, lon))[0]['cc']

retrieve_country_vec = np.vectorize(retrieve_country)

In [None]:
homes_countries_b['country'] = retrieve_country_vec(homes_countries_b['latitude'], 
                                                    homes_countries_b['longitude'])

In [None]:
homes_countries_b.head()

### Where do people from different countries travel to the most?

### Check if it is possible to predict user home areas based on their long distance travel patterns?