In [None]:
import os

# Run the notebook from the parent directory so that the relative paths used
# below ('./data/...', './images/...') resolve.
# NOTE(review): this cell is not idempotent - re-running it without restarting
# the kernel climbs one more directory level each time.
os.chdir('..')

In [None]:

from model_config import *
from model_packages import *
from model_utils import *
import re

In [None]:
# ---------------------------------------------------------------------------
# Input data
# ---------------------------------------------------------------------------

# complete modelling dataset
model_input_data = pd.read_pickle(data_folder + 'static_and_dynamic_features_5000.pkl')

# points-of-interest (POI) features
pois = pd.read_pickle(data_folder + 'new_pois_data_all_sites.pkl')

# census features
# NOTE(review): read from './data/' rather than `data_folder` - confirm both
# point at the same directory.
census = pd.read_pickle('./data/non_baseline_features.pkl')

# region boundaries (filtered to UK regions in find_counter_regions)
regions = gpd.read_file('./data/NUTS_RG_20M_2021_4326.geojson')

# ---------------------------------------------------------------------------
# Column-name groups
# ---------------------------------------------------------------------------
trgt_ftrs = ['people_counter_data']   # target
dynmc_ftrs = ['total_trip_count']     # dynamic features
wthr_ftrs = ['tavg']                  # weather features
othr_ftrs = ['Dog']                   # other features
catg_ftrs = ['Date', 'site']          # categorical / identifier columns
natr_ftrs_non_corr = [
    'accessible_green_space_area',
    'PROW_Total_length_km',
    'waterside_length_km',
]
season = ['autumn', 'spring', 'summer']




## Functions

In [None]:
def get_season(month):
    """Map a month number to a season name.

    Months 3-5 give 'spring', 6-8 give 'summer' and 9-11 give 'autumn'.
    Every other value (including 12, 1 and 2) gives 'winter'.
    """
    season_bounds = (
        (3, 5, 'spring'),
        (6, 8, 'summer'),
        (9, 11, 'autumn'),
    )
    for first_month, last_month, season_name in season_bounds:
        if first_month <= month <= last_month:
            return season_name
    return 'winter'

In [None]:
def create_season_columns(df):
    """Return ``df`` with one-hot season indicator columns appended.

    The month is parsed from the second '-'-separated field of the string
    column ``Date`` and mapped to a season with ``get_season``. Indicator
    columns 'autumn', 'spring' and 'summer' are appended; 'winter' is left
    out and acts as the reference category.

    The three indicator columns are always present: a season that does not
    occur in the data yields an all-zero/False column. Encoding against a
    fixed category list also means dropping 'winter' cannot fail when the
    data contains no winter dates.

    Parameters
    ----------
    df : DataFrame with a string ``Date`` column whose second '-'-separated
        field is the month number.

    Returns
    -------
    A new frame; the caller's ``df`` is not modified.
    """
    all_seasons = ['autumn', 'spring', 'summer', 'winter']

    months = df['Date'].str.split('-', expand=True)[1].astype(int)
    # A categorical with fixed categories makes get_dummies emit one column
    # per season regardless of which seasons are observed.
    season_labels = months.apply(get_season).astype(
        pd.CategoricalDtype(categories=all_seasons)
    )

    season_columns = pd.get_dummies(season_labels)
    # Plain string labels instead of a CategoricalIndex, so the column index
    # of the concatenated frame stays an ordinary Index.
    season_columns.columns = [str(label) for label in season_columns.columns]

    return pd.concat([df, season_columns.drop(columns=['winter'])], axis=1)

In [None]:
def preprocess_input_data(input_data):
    """Wrangle the raw modelling data into the frame used by the pipeline.

    Steps, in order:
      1. wrap ``input_data`` in a GeoDataFrame;
      2. drop the counters 'Vessey_Pastures' and 'Trosley_CP';
      3. replace each geometry by its centroid and expose it as ``lon``/``lat``;
      4. append season indicator columns via ``create_season_columns``;
      5. fill missing values with 0 in the columns named by ``pois.columns``;
      6. merge the 'major_urban_settings' and 'minor_urban_settings' land
         types into 'urban_settings'.

    Parameters
    ----------
    input_data : tabular data accepted by ``gpd.GeoDataFrame``; must provide
        ``counter``, ``Date``, ``land_type_labels``, ``land_habitat_labels``,
        a geometry column and every column in ``pois.columns``.

    Returns
    -------
    (df, lnd_ftrs, hbt_ftrs) : the processed GeoDataFrame plus the dummy
        column names ('land_type_labels_<label>' and
        'land_habitat_labels_<label>') for each label present in the data.
    """
    # raw data wrangling
    # create geo dataframe
    df = gpd.GeoDataFrame(input_data)

    # drop sites that create negative prediction values - to be investigated
    excluded_counters = ['Vessey_Pastures', 'Trosley_CP']
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the input.
    df = df[~df['counter'].isin(excluded_counters)].copy()

    # assign geometry as centre of buffer
    df['geometry'] = df.geometry.centroid
    # extract lat lon
    df['lon'] = df.geometry.x
    df['lat'] = df.geometry.y

    # create season columns
    df = create_season_columns(df)

    # replace NaN values with 0s as NaN is not allowed in model training
    df[pois.columns] = df[pois.columns].fillna(0)

    # combine minority classes in land_type features
    lnd_dict = {
        'major_urban_settings': 'urban_settings',
        'minor_urban_settings': 'urban_settings',
    }
    # Assign the result back instead of calling replace(inplace=True) on the
    # selected column: the in-place form acts on a temporary object and is not
    # guaranteed to update ``df`` (it does not under pandas Copy-on-Write).
    df['land_type_labels'] = df['land_type_labels'].replace(lnd_dict)

    # create land and habitat feature constants; missing labels are skipped
    # because NaN cannot be concatenated with the string prefix
    lnd_ftrs = ['land_type_labels_' + x for x in df['land_type_labels'].dropna().unique()]
    hbt_ftrs = ['land_habitat_labels_' + x for x in df['land_habitat_labels'].dropna().unique()]

    return df, lnd_ftrs, hbt_ftrs


In [None]:
# identify sites in each region
def find_counter_regions(region_df, counter_df):
    """Group counters by the UK level-1 region they are spatially joined to.

    ``region_df`` is restricted to rows with ``CNTR_CODE == 'UK'`` and
    ``LEVL_CODE == 1`` (i.e. North East, North West etc.), then left-joined
    onto ``counter_df`` with ``gpd.sjoin``.

    Returns
    -------
    dict mapping ``str(region name)`` to the de-duplicated
    (``counter``, ``NUTS_NAME``) rows whose ``NUTS_NAME`` is that region.
    """
    is_uk_level_1 = (region_df['CNTR_CODE'] == 'UK') & (region_df['LEVL_CODE'] == 1)
    level_1_regions = region_df.loc[is_uk_level_1]

    # attach the region name to every counter location
    joined = gpd.sjoin(
        left_df=counter_df,
        right_df=level_1_regions[['geometry', 'NUTS_NAME']],
        how='left',
    )
    counter_to_region = joined[['counter', 'NUTS_NAME']].drop_duplicates()

    return {
        f'{region_name}': counter_to_region.loc[
            counter_to_region['NUTS_NAME'].isin([region_name])
        ]
        for region_name in counter_to_region['NUTS_NAME'].unique()
    }



In [None]:
# train/test split of sites, stratified either by clusters of the site coordinates or by land_type_labels

def training_test_split(df, cluster_coordinates, k, test_size, random_state=None):
    """Split the data into train and test sets at site level.

    Sites (not rows) are split, so all rows of a site land in the same set.
    The split is stratified either by KMeans clusters of the site coordinates
    or by the site's land type.

    Parameters
    ----------
    df : frame with ``site``, ``land_type_labels`` and ``land_habitat_labels``
        columns, plus ``lat``/``lon`` when ``cluster_coordinates`` is true.
    cluster_coordinates : bool. True stratifies by KMeans cluster of
        (lat, lon); False stratifies by ``land_type_labels``.
    k : number of KMeans clusters (only used when clustering).
    test_size : forwarded to ``train_test_split``.
    random_state : forwarded to ``train_test_split``. The default ``None``
        keeps the unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    (df, train_data, test_data) : ``df`` with the two label columns replaced
        by dummy columns, and its train / test row subsets.
    """
    # Build the one-row-per-site table and its stratification labels from the
    # original label columns, before they are replaced by dummies.
    if cluster_coordinates:
        sites = df[['site', 'lat', 'lon']].drop_duplicates()
        kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
        sites['cluster_label'] = kmeans.fit_predict(sites[['lat', 'lon']])
        strata = sites['cluster_label']
        print(strata.value_counts())
    else:
        sites = df[['site', 'land_type_labels']].drop_duplicates()
        strata = sites['land_type_labels']

    # stratified split of the sites
    train_set, test_set = train_test_split(
        sites, test_size=test_size, stratify=strata, random_state=random_state
    )

    # create dummy variables for categorical data
    df = pd.get_dummies(df, columns=['land_type_labels', 'land_habitat_labels'])

    # subset the encoded data to the sites of each set
    train_data = df.loc[df.site.isin(train_set.site)]
    test_data = df.loc[df.site.isin(test_set.site)]

    print(f'Train sites:{list(train_set.site)}, Test sites: {list(test_set.site)}')

    return df, train_data, test_data


In [None]:
def map_sites(geo_df, counter_regions, train_data, test_data):
    """Plot train and test counter locations on a basemap and save the figure.

    Rows of ``geo_df`` whose ``counter`` appears in ``train_data`` /
    ``test_data`` are drawn with different markers on a CartoDB Voyager
    basemap. The figure is written to './images/train_test_sites.png' and
    then shown.

    Parameters
    ----------
    geo_df : GeoDataFrame with a ``counter`` column and a CRS.
    counter_regions : not used for plotting; kept so existing callers keep
        working.
        NOTE(review): the previous version built a list of South West / South
        East counters from this dict but never filtered by it, so all train
        and test sites are plotted. Confirm whether a regional filter was
        intended.
    train_data, test_data : frames with a ``counter`` column.
    """
    # Subset to sites in the test or train data sets
    test_sites = geo_df.loc[geo_df.counter.isin(list(test_data.counter))]
    train_sites = geo_df.loc[geo_df.counter.isin(list(train_data.counter))]

    # Plot the locations
    fig, ax = plt.subplots(figsize=(10, 10))
    train_sites.plot(ax=ax, markersize=30, color='#fa6401', marker='*', alpha=0.7,
                     label='Train Data', zorder=3)
    test_sites.plot(ax=ax, markersize=30, color='#902082', marker='o', alpha=0.7,
                    label='Test Data', zorder=3)

    # Add basemap
    contextily.add_basemap(ax, crs=geo_df.crs.to_string(), source=contextily.providers.CartoDB.Voyager)

    # Customize map appearance
    ax.axis('off')
    legend = ax.legend(loc='upper right', fontsize='large', title='Site Type',
                       title_fontsize='large', frameon=True)
    legend.get_frame().set_color('white')
    legend.get_frame().set_edgecolor('black')

    # Newer Matplotlib exposes the handles as `legend_handles`; the camelCase
    # `legendHandles` only exists on older releases, so fall back to it.
    handles = getattr(legend, 'legend_handles', None)
    if handles is None:
        handles = legend.legendHandles
    for handle in handles:  # same alpha for legend markers as in the plot
        handle.set_alpha(0.7)

    ax.set_title('Train and Test set Counter Locations', fontsize=20)

    plt.savefig("./images/train_test_sites.png", format='png', dpi=300, bbox_inches='tight')

    plt.show()



In [None]:
def vif(df, lnd_ftrs, hbt_ftrs):
    """Select model features with low variance inflation factor (VIF).

    Builds the candidate feature list from the module-level column groups,
    the land-type / habitat dummy names and the POI and census columns,
    prints the VIF of every numeric candidate, then lets ``calculate_vif_``
    (threshold 10) remove multicollinear features.

    'total_trip_count' and 'tavg' are kept out of the multicollinearity
    check and always re-added to the result, together with ``target``.
    'amenity_holiday_park', 'tourism_yes' and 'amenity_waste_basket' are
    excluded from the check and from the result.

    Parameters
    ----------
    df : frame containing every candidate feature column.
    lnd_ftrs, hbt_ftrs : dummy column names for land type and habitat.

    Returns
    -------
    list of feature names: surviving low-VIF features +
    ['total_trip_count', 'tavg'] + [target].

    NOTE(review): ``target`` is not defined in this notebook's visible cells;
    presumably it comes from one of the star imports - confirm.
    NOTE(review): only numeric columns reach the VIF step. With pandas >= 2,
    ``get_dummies`` produces bool columns by default, which
    ``select_dtypes(include=np.number)`` excludes - confirm the dummy columns
    are included as intended.
    """
    # Reference categories for land type and land habitat. The dummy columns
    # carry the 'land_type_labels_' / 'land_habitat_labels_' prefix, so the
    # bare labels alone would never match a feature name; exclude the
    # prefixed names as well.
    habitat_type_reference = ['mixed_settings', 'Grassland_woodland_bareground']
    reference_columns = set(habitat_type_reference)
    for prefix in ('land_type_labels_', 'land_habitat_labels_'):
        reference_columns.update(prefix + label for label in habitat_type_reference)

    # select input features
    candidate_ftrs = (
        dynmc_ftrs + wthr_ftrs + othr_ftrs + natr_ftrs_non_corr
        + lnd_ftrs + hbt_ftrs + catg_ftrs
        + list(pois.columns) + list(census.columns)
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, so a name
    # present in several groups is not selected twice.
    ftrs_to_keep = [x for x in dict.fromkeys(candidate_ftrs) if x not in reference_columns]

    print(list(ftrs_to_keep))

    non_feature_cols = [target, 'geometry', 'geom_type', 'Date', 'site', 'counter', 'provider']

    # numeric candidates, complete rows only
    df_num = (
        df[[x for x in ftrs_to_keep if x not in non_feature_cols]]
        .select_dtypes(include=np.number)
        .dropna(axis=0)
    )

    # VIF of every numeric candidate, before any removal
    vif_all = pd.DataFrame()
    vif_all["VIF Factor"] = [variance_inflation_factor(df_num.values, i)
                             for i in range(df_num.shape[1])]
    vif_all["features"] = df_num.columns

    print(vif_all)

    print('+' * 100)

    excluded_from_check = non_feature_cols + [
        'total_trip_count', 'tavg',
        'amenity_holiday_park', 'tourism_yes', 'amenity_waste_basket',
    ]
    # Iterate over df_num's own columns so that features removed by the
    # numeric filter cannot trigger a KeyError in the selection below.
    ftrs_to_chck = [x for x in df_num.columns if x not in excluded_from_check]

    # df_num is already numeric and NaN-free, so no further filtering is needed
    df_num_remv_multi_coll = calculate_vif_(df_num[ftrs_to_chck], thresh=10)[0]

    vif_low = pd.DataFrame()
    vif_low["VIF Factor"] = [variance_inflation_factor(df_num_remv_multi_coll.values, i)
                             for i in range(df_num_remv_multi_coll.shape[1])]
    vif_low["features"] = df_num_remv_multi_coll.columns

    # features that survived the VIF removal, plus the always-kept ones
    low_vif_ftrs_df_train_num = list(vif_low.features.values) + ['total_trip_count', 'tavg']
    print(low_vif_ftrs_df_train_num)

    return low_vif_ftrs_df_train_num + [target]


In [None]:
def preprocess_dataframe(dataframe, columns_to_drop, drop):
    """Optionally drop columns, then sanitise the remaining column names.

    Every character other than ASCII letters, digits and underscore is
    removed from each column name.

    Parameters
    ----------
    dataframe : DataFrame with string column names.
    columns_to_drop : column labels to remove when ``drop`` is true.
    drop : bool. When true, ``columns_to_drop`` are removed before renaming.

    Returns
    -------
    A new DataFrame. The input frame is left untouched in both modes; the
    no-drop path used to rename the caller's frame in place.
    """
    processed_df = dataframe.drop(columns_to_drop, axis=1) if drop else dataframe

    # rename() without inplace returns a new frame
    return processed_df.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))

In [None]:
def predictions(train_data, test_data, low_vif_ftrs, normalisation):
    """Train a blended regressor with PyCaret and predict on train and test.

    Workflow: PyCaret ``setup`` on the training rows, ``compare_models`` over
    the candidate regressors (best 5 by MAE), ``tune_model`` on each,
    ``blend_models`` over the tuned models, then ``predict_model`` on the
    train and test rows. The blender is finalised and saved last.

    Files written under ``data_folder``:
    'training_predictions.pkl', 'test_predictions.pkl' and the model saved
    as 'voting_regressor_model'.

    Parameters
    ----------
    train_data, test_data : frames containing ``low_vif_ftrs`` plus
        ``counter`` and ``Date``; rows with any NaN in those columns are
        dropped.
    low_vif_ftrs : feature names including the target 'people_counter_data'.
    normalisation : bool, forwarded to ``setup(normalize=...)``.
        NOTE(review): ``setup`` is called with ``preprocess=False``, which
        per the PyCaret docs disables transformations - confirm whether
        ``normalize`` has any effect here.

    Returns
    -------
    (pred_on_train, pred_on_test) : outputs of ``predict_model``.
    """
    norm = bool(normalisation)

    model_columns = low_vif_ftrs + ['counter', 'Date']
    df_train = train_data[model_columns].dropna(axis=0)
    df_test = test_data[model_columns].dropna(axis=0)

    numeric_features = [
        x for x in list(df_train.select_dtypes(include=np.number).columns)
        if x not in ['people_counter_data']
    ]

    # Setup for PyCaret
    setup(
        data=df_train.copy(),
        target='people_counter_data',
        train_size=0.9,
        numeric_features=numeric_features,
        fold=3,
        preprocess=False,
        normalize=norm,
        normalize_method='robust',
        remove_outliers=False,
        remove_multicollinearity=False,
        multicollinearity_threshold=0.8,
        feature_selection=False,
        ignore_features=['Date', 'site', 'counter'],
        polynomial_features=False,
        pca=False,
        log_experiment=True,
        experiment_name='reg_experiments',
        log_plots=True,
        transformation=False,
    )

    # Model training and tuning
    top5 = compare_models(n_select=5, sort='MAE', fold=5,
                          include=['lr', 'ridge', 'lasso', 'en', 'br', 'kr'])
    tuned_top5 = [tune_model(i, n_iter=120, optimize='MAE', fold=5, verbose=False) for i in top5]
    blender_specific = blend_models(estimator_list=tuned_top5, fold=5, optimize='MAE')

    # Predictions (made with the blender before it is finalised)
    print('Performance metrics from training data:')
    pred_on_train = predict_model(blender_specific, data=df_train)
    pred_on_train.to_pickle(data_folder + 'training_predictions.pkl')

    print('Performance metrics from test data:')
    pred_on_test = predict_model(blender_specific, data=df_test)
    pred_on_test.to_pickle(data_folder + 'test_predictions.pkl')

    # Finalise and save once, after the predictions: finalising trains the
    # model on the complete data set (not the train/test split). An earlier
    # identical finalise + save ahead of the predictions was redundant and
    # has been removed.
    finalize_blender = finalize_model(blender_specific)
    save_model(finalize_blender, data_folder + 'voting_regressor_model')

    return pred_on_train, pred_on_test

In [None]:
def model_development_pipeline(input_df, test_size, cluster_coordinates, normalisation, n_clusters=3):
    """Run the full pipeline: preprocess, split, map, select features, model.

    Parameters
    ----------
    input_df : raw modelling data, passed to ``preprocess_input_data``.
    test_size : share of sites held out, passed to ``training_test_split``.
    cluster_coordinates : bool. True stratifies the split by coordinate
        clusters, False by land type.
    normalisation : bool, passed on to ``predictions``.
    n_clusters : number of coordinate clusters used when
        ``cluster_coordinates`` is true. Defaults to 3, the value that was
        previously hard-coded.

    Returns
    -------
    (pred_on_train, pred_on_test) as returned by ``predictions``.
    """
    print(f'>>>>>>>>>>>>> test_size= {test_size}, cluster_coordinates? {cluster_coordinates}, data normalised? {normalisation}')

    # preprocessing on input df
    df, lnd_ftrs, hbt_ftrs = preprocess_input_data(input_df)

    # identify counter regions
    counter_regions = find_counter_regions(regions, df)

    # train and test split of data
    df, train_data, test_data = training_test_split(df, cluster_coordinates, n_clusters, test_size)

    # visualise train and test sites on a map
    map_sites(df, counter_regions, train_data, test_data)

    # calculate low VIF features
    low_vif_ftrs = vif(df, lnd_ftrs, hbt_ftrs)

    # train model and make predictions
    pred_on_train, pred_on_test = predictions(train_data, test_data, low_vif_ftrs, normalisation)

    return pred_on_train, pred_on_test


# Run Model

In [None]:
def main():
    """Run the pipeline with the parameters that produced the best performance.

    Half of the sites are held out for testing, the split is stratified by
    coordinate clusters, and the data is not normalised.
    """
    model_development_pipeline(
        model_input_data,
        test_size=0.5,
        cluster_coordinates=True,
        normalisation=False,
    )


if __name__ == "__main__":
    main()