In [61]:
import pandas as pd
import numpy as np
import sys

r_path_data = "../../src/utils/load_data/"
sys.path.append(r_path_data)
from load_dataframes import *


import geopandas as gpd
from shapely.geometry import Point

# Trajectpry Clustering
def join_customer_features(traj_result, username, season, country):
    """
    Returns a dataframe with trajectory clustering results and customer features joined
    Params:
    traj_result: dataframe with trajectory clustering result: customer_nr,column called cluster
    username: username to access aws
    season: season for clustering used 
    country: country used for clustering (note: there is NO option for all)
    """
    user_features=get_k_means_data(username,season, country).set_index("customer_nr")
    features_with_trajectory=user_features.join(traj_result.set_index('customer_nr')[["cluster"]])
    return features_with_trajectory


def trajectory_cluster_description(result, cluster_names, var):
    """
    Returns a description about each cluster:
        how much time they spent in Tuscany
        places visited (at least 4 hours spent)
        airport arrivals and departures
    """
    hours=hours_tusc(result, var)
    nc=len(hours)
    res=''
    if var=='label':
        st=0
    else:
        st=1
    for i in zip(range(st,nc), cluster_names[:nc]):
        res=res + f'The {i[1]} cluster '
        res=res + f'spends on average {int(hours.hrs_in_tusc[(i[0])])} days in Tuscany, '
        res=res+get_places_at_least4_hours(result, i[0], var)
        res=res+ cluster_airport_result(result, i[0], var)
    return res


def write_file(country, season, final, var):
    """
    write out the print into a file at the result folder
    """
    if var=='label':
        path='../results/kmeans/'
    elif var=='cluster':
        path='../results/sequence_analysis/'
    country_ = country.lower()
    season_ = season.replace('-','_')
    file_name=country_+"_"+season_
    newpath=path+file_name+'/'
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    f = open(newpath+file_name+".txt","w")   
    f.write(final)
    f.close()


def get_trajectory_description(traj_result, username, season, country, var, cluster_names,print_it=True):
    """
    Prints out description for clusters
    Params:
    traj_result: dataframe with trajectory clustering result: customer_nr,column called cluster
    username: username to access aws
    season: names of the season used for clustering
    country: name of the country (all=all country)
    var: 'label': k-means results
         'cluster': trajectory results
    """
    result=join_customer_features(traj_result, username, season, country)
    result=add_airport_arrivals(result, airport_cities_d)
    final=(create_basic_description(result, season, country, var)
    +" "+trajectory_cluster_description(result, cluster_names, var))
    if print_it==True:
         write_file(country, season, final, var)
    print(final)


ModuleNotFoundError: No module named 'connect_db'

In [26]:
seasons={'pre-summer':'pre-summer season (May 2017)',
        'summer':'summer season (Jun - Aug 2017)>',
        'post-summer':'post-summer season (Sep - Nov, 2017)',
        'winter':'winter season (Dec 2017 - Feb 2018)'}

In [27]:
season='winter'
season_name=seasons[season.lower()]
num_visitors_in_millions=0.7
country="hungary"
num_clusters=4

In [41]:
pwd

'/mnt/data/ovasarhelyi/TPT_tourism/dev/descriptives'

In [47]:
import pandas as pd
import numpy as np

In [39]:
filename='cluster_results_Germany_summer_0d_to_30d_WDaligned_FALSE_win_8_wCtryTRUE_N_30000_CONSTANT_LCS_NClus_4.csv'

In [44]:
d=pd.read_csv("../../results/sequence_analysis/Germany_summer/"+filename)

In [54]:
def join_customer_features(traj_result, username, season, country):
    """
    Returns a dataframe with trajectory clustering results and customer features joined
    Params:
    traj_result: dataframe with trajectory clustering result: customer_nr,column called cluster
    username: username to access aws
    season: season for clustering used 
    country: country used for clustering (note: there is NO option for all)
    """
    user_features=get_k_means_data(username,season, country).set_index("customer_nr")
    features_with_trajectory=user_features.join(traj_result.set_index('customer_nr')[["cluster"]])
    return features_with_trajectory

30000

In [56]:
result=join_customer_features(d, 'ovasarhelyi', 'summer', 'germany')

NameError: name 'get_k_means_data' is not defined

In [52]:
def calc_num_visitors_in_millions(result):
    
    return np.round(len(result)/100000,2)

In [53]:
calc_num_visitors_in_millions(result)

0.3

In [28]:
summary_text=f"In the last {season_name} roughly {num_visitors_in_millions} million \
tourists visited Tuscany from {country.title()}. \
The data shows us {num_clusters} clusters. Each line in the graph above represents a cluster's typical path that tourists \
from {country.title()} followed. \
These paths are are displayed as differently-coloured lines in the map here above."

In [29]:
summary_text

"In the last winter season (Dec 2017 - Feb 2018) roughly 0.7 million tourists visited Tuscany from Hungary. The data shows us 4 clusters. Each line in the graph above represents a cluster's typical path that tourists from Hungary followed. These paths are are displayed as differently-coloured lines in the map here above."

In [30]:
i=1
cluster_numbers=['first', 'second', 'third', 'forth', 'fifth', 'sixth', "seventh", 'eighth']
names=['city hoppers', 'coast lovers', 'round trippers', 'wierdos']
cluster_number=cluster_numbers[i]
cluster_name=names[i]
avg_num_days_italy=6
avg_num_days_tuscany=4
cities_results='Florence and Pisa'

In [31]:
cluster_medoid_description=f"The {cluster_number} cluster, which we named {cluster_name} \
spends on average {avg_num_days_italy} days in Italy, \
{avg_num_days_tuscany} of which in Tuscany. \
When in Tuscany {cluster_name} spend at least a half day in {cities_results}. \
\n"

In [32]:
cluster_medoid_description

'The second cluster, which we named coast lovers spends on average 6 days in Italy, 4 of which in Tuscany. When in Tuscany coast lovers spend at least a half day in Florence and Pisa. \n'

In [33]:
cluster_name=names[i]
season_name=seasons[season.lower()]
most_visited_munipality='Florence'
ratio_most_visited_municipality=str(34)
second_most_visited_municipality='Pisa'
ratio_second_most_visited_municipality=str(20)
third_most_visited_municipality="Livorno"
ratio_third_most_visited_municipality=str(5)

In [34]:
cluster_result=f"""In the heatmap above, we can see the density of the visits of all tourists belonging \
to the {cluster_name} cluster. \
The darker the colour, the more {cluster_name} visited that municipality in {season_name}. \
As we can see from the  heatmap, the majority of the tourists from  this cluster visited {most_visited_munipality} ({ratio_most_visited_municipality}% of the tourists in this cluster). \
The next most visited municipalities are {second_most_visited_municipality} \
({ratio_second_most_visited_municipality}%), \
and {third_most_visited_municipality} ({ratio_third_most_visited_municipality}%).
Besides the heatmap, tn the plot above we can see four examples of trajectories of tourists who belong to the {cluster_name} cluster."""

In [35]:
cluster_result

'In the heatmap above, we can see the density of the visits of all tourists belonging to the coast lovers cluster. The darker the colour, the more coast lovers visited that municipality in winter season (Dec 2017 - Feb 2018). As we can see from the  heatmap, the majority of the tourists from  this cluster visited Florence (34% of the tourists in this cluster). The next most visited municipalities are Pisa (20%), and Livorno (5%).\nBesides the heatmap, tn the plot above we can see four examples of trajectories of tourists who belong to the coast lovers cluster.'

In [None]:

cluster_result=