In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import os
import json
import re

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import scipy.stats as stats

# Path to the JSON file of travel advisories. os.path.join builds the
# "Resources/csi.json" location with the separator of the host OS.
filepath = os.path.join("Resources", "csi.json")

# Read in CSV from Data Analysis

In [2]:
# Load the results of the previous analysis; the first ten rows are the top ten
travel_sorted_df = pd.read_csv("Resources/analysis.csv")

# DEST_CITY_NAME comes in the form "<CITY_NAME>, <COUNTRY>"; keep only the
# text before the first comma for each of the first ten rows
top_ten_dest = travel_sorted_df["DEST_CITY_NAME"].head(10)
top_ten_cities = [full_name.split(",")[0] for full_name in top_ten_dest]

# Country tags for the same ten rows (top_ten_dest is rebound to them here)
top_ten_dest = travel_sorted_df["DEST_COUNTRY"].head(10)
top_ten_country = list(top_ten_dest)

# Read In Travel Advisory JSON (CKCJ)

In [3]:
# Read the advisory file as UTF-8 text and parse it into Python objects
with open(filepath, encoding='utf-8') as advisory_file:
    travel_json = json.loads(advisory_file.read())

In [4]:
# Examine the keys of the first advisory record to see which fields are available
travel_json[0].keys()

dict_keys(['tag', 'geopoliticalarea', 'travel_transportation', 'health', 'local_laws_and_special_circumstances', 'safety_and_security', 'entry_exit_requirements', 'destination_description', 'iso_code', 'travel_embassyAndConsulate', 'last_update_date'])

In [5]:
# Figure out whether the country of each top-ten city has a travel advisory.
# For every country tag, scan travel_json in order and keep the position of the
# first record whose 'tag' equals it; next() falls back to None when no record
# matches. (No `global` statement is needed: this code already runs at module
# scope, and declaring a name global after assigning it is a SyntaxError on
# current Python versions.)
country_index = [
    next(
        (position for position, travel in enumerate(travel_json)
         if travel['tag'] == country),
        None,
    )
    for country in top_ten_country
]
# Index in json list of countries, None means no mention
country_index

[132, 73, None, 37, 132, 139, 71, 56, 37, 132]

In [6]:
# Function to build a case-insensitive whole-word search for a literal string.
def findWholeWord(w):
    """Return a search function that looks for ``w`` as a whole word.

    ``w`` is treated as literal text: it is passed through ``re.escape`` before
    being embedded in the pattern, so characters such as ``.`` or ``(`` in a
    name match themselves instead of acting as regex operators.

    Parameters:
        w: the word or phrase to look for.

    Returns:
        The bound ``search`` method of the compiled pattern. Calling it with a
        text returns a match object (group 1 holds the matched text), or None
        if the text does not mention ``w`` as a whole word.
    """
    return re.compile(r'\b({0})\b'.format(re.escape(w)), flags=re.IGNORECASE).search

In [7]:
# Example of the date format attached to a travel advisory: take the
# 'last_update_date' string of the first record and display it
temp = travel_json[0]['last_update_date']
temp

'Last Updated: September 23, 2014       '

In [8]:
# Function to get a (month, year) tuple to use for ttest
def getMonthYear(update):
    """Extract the month number and the year from an advisory date string.

    The text is expected to look like ``'Last Updated: September 23, 2014'``:
    the third whitespace-separated token is the full English month name and
    the fifth is the year.

    Parameters:
        update: the date string to parse.

    Returns:
        A ``(month, year)`` tuple. ``month`` is 1-12, or None when the third
        token is not a recognised month name; ``year`` is an int.

    Raises:
        IndexError: if the text has fewer than five tokens.
        ValueError: if the fifth token is not an integer.
    """
    # Full English month names mapped to their calendar position
    month_numbers = {
        "January": 1,
        "February": 2,
        "March": 3,
        "April": 4,
        "May": 5,
        "June": 6,
        "July": 7,
        "August": 8,
        "September": 9,
        "October": 10,
        "November": 11,
        "December": 12,
    }
    # split() without an argument collapses runs of whitespace, so repeated
    # spaces, tabs or trailing padding cannot shift the token positions
    x = update.split()
    monthIndex = month_numbers.get(x[2])
    yearIndex = int(x[4])
    return (monthIndex, yearIndex)
# End of function getMonthYear

#monthTemp, yearTemp = getMonthYear(temp)

# Figure out if city is mentioned in travel advisory
Keep a count of the mentions and store the results in a data structure.

In [9]:
# Dictionary to hold the results of the json search.
# Keys are the city names; each value is a two-item list:
#   [number of advisory sections mentioning the city, (month, year) of the update]
# A city whose country has no advisory entry gets [0, None].
advisory_sections = (
    'travel_transportation',
    'health',
    'local_laws_and_special_circumstances',
    'safety_and_security',
    'entry_exit_requirements',
)
top_ten_dict = {}

for position in range(len(top_ten_country)):
    city_name = top_ten_cities[position]
    advisory_position = country_index[position]

    # Guard clause: nothing to search when the country has no advisory
    if advisory_position is None:
        top_ten_dict[city_name] = [0, None]
        continue

    advisory = travel_json[advisory_position]
    # One point per section whose text contains the city as a whole word
    mention_count = sum(
        1
        for section in advisory_sections
        if findWholeWord(city_name)(advisory[section]) is not None
    )
    update_month, update_year = getMonthYear(advisory['last_update_date'])
    top_ten_dict[city_name] = [mention_count, (update_month, update_year)]

top_ten_dict

{'Cancun': [2, (2, 2015)],
 'London': [0, (1, 2016)],
 'Tokyo': [0, None],
 'Toronto': [1, (10, 2014)],
 'Mexico City': [5, (2, 2015)],
 'Amsterdam': [3, (12, 2014)],
 'Paris': [3, (2, 2016)],
 'Punta Cana': [0, (1, 2015)],
 'Vancouver': [1, (10, 2014)],
 'San Jose del Cabo': [0, (2, 2015)]}

# Read in the monthly aggregated data for Top 10 Countries

In [10]:
# Monthly passenger data used as input for the t-test, loaded and re-indexed
# in one chain so rows are addressed by (DEST_CITY_NAME, YEAR)
monthly_data_df = (
    pd.read_csv("Resources/analysis_monthly.csv")
    .set_index(['DEST_CITY_NAME', 'YEAR'])
)
monthly_data_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,DEST_COUNTRY,YEARLY_TOTAL,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
DEST_CITY_NAME,YEAR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
"Amsterdam, Netherlands",2013,NL,1498533.0,89425.0,73576.0,104179.0,124295.0,155463.0,161462.0,148674.0,165006.0,146622.0,127654.0,96116.0,106061.0
"Amsterdam, Netherlands",2014,NL,1449692.0,92024.0,75205.0,103942.0,110880.0,142314.0,148705.0,132277.0,154280.0,150822.0,132861.0,100080.0,106302.0
"Amsterdam, Netherlands",2015,NL,1540559.0,92506.0,73814.0,102489.0,131397.0,158984.0,163851.0,148313.0,168319.0,161552.0,143006.0,94755.0,101573.0
"Amsterdam, Netherlands",2016,NL,1549281.0,90014.0,75229.0,110462.0,144508.0,161621.0,161598.0,147317.0,159403.0,152280.0,137474.0,98660.0,110715.0
"Amsterdam, Netherlands",2017,NL,1677886.0,93557.0,80137.0,118879.0,153882.0,177235.0,175705.0,159835.0,177119.0,168446.0,144774.0,110022.0,118295.0


# Running T-Test on Cities in Top Ten List (CKCJ)

In [11]:
# Full destination labels, used as the first index level of monthly_data_df
top_ten_dest = travel_sorted_df["DEST_CITY_NAME"].head(10)
test_results = []

# Walk the dictionary values in order; the running position picks the
# destination label that belongs to the same entry
for position, advisory_info in enumerate(top_ten_dict.values()):
    advisory_date = advisory_info[1]

    # Entries stored without an advisory date cannot be tested
    if advisory_date is None:
        test_results.append(None)
        continue

    advisory_month, advisory_year = advisory_date
    dest_label = top_ten_dest[position]

    # Monthly values JAN..DEC for the year before, the year of, and the year
    # after the advisory, joined into one chronological list
    all_data = []
    for year in (advisory_year - 1, advisory_year, advisory_year + 1):
        all_data.extend(monthly_data_df.loc[(dest_label, year), "JAN":"DEC"])

    # Twelve values ending just before the advisory month, then the twelve
    # values starting at the advisory month
    first = advisory_month - 1
    data_before = [all_data[i] for i in range(first, first + 12)]
    data_after = [all_data[i + 12] for i in range(first, first + 12)]

    # Two-sample t-test without assuming equal variances (equal_var=False)
    test_results.append(stats.ttest_ind(data_before, data_after, equal_var=False))

In [12]:
# Display the per-city advisory results again for reference
top_ten_dict

{'Cancun': [2, (2, 2015)],
 'London': [0, (1, 2016)],
 'Tokyo': [0, None],
 'Toronto': [1, (10, 2014)],
 'Mexico City': [5, (2, 2015)],
 'Amsterdam': [3, (12, 2014)],
 'Paris': [3, (2, 2016)],
 'Punta Cana': [0, (1, 2015)],
 'Vancouver': [1, (10, 2014)],
 'San Jose del Cabo': [0, (2, 2015)]}

# T-Test Results (CKCJ)

In [13]:
# Show the t-test result collected for each city (None where no test was run)
test_results

[Ttest_indResult(statistic=-1.9029704607634663, pvalue=0.07025678050266695),
 Ttest_indResult(statistic=0.23876735659925447, pvalue=0.813519684137803),
 None,
 Ttest_indResult(statistic=0.31917182514250253, pvalue=0.7526600413095357),
 Ttest_indResult(statistic=-0.9768630430726404, pvalue=0.34028886066599473),
 Ttest_indResult(statistic=-0.6564344256710247, pvalue=0.518723845629995),
 Ttest_indResult(statistic=0.0963418597690272, pvalue=0.9241222398385391),
 Ttest_indResult(statistic=-1.356364466617244, pvalue=0.18907145847936957),
 Ttest_indResult(statistic=-0.5210313410089537, pvalue=0.6076548072319916),
 Ttest_indResult(statistic=-0.9824014513553352, pvalue=0.3412963408905325)]

In [14]:
# p-value of each t-test (element 1 of the result); None where no test was run
pval_list = [None if result is None else result[1] for result in test_results]

# Pair each city name with the p-value at the same position
city_pval_list = list(zip(top_ten_cities, pval_list))
city_pval_list

[('Cancun', 0.07025678050266695),
 ('London', 0.813519684137803),
 ('Tokyo', None),
 ('Toronto', 0.7526600413095357),
 ('Mexico City', 0.34028886066599473),
 ('Amsterdam', 0.518723845629995),
 ('Paris', 0.9241222398385391),
 ('Punta Cana', 0.18907145847936957),
 ('Vancouver', 0.6076548072319916),
 ('San Jose del Cabo', 0.3412963408905325)]