In [1]:
# Preprocessing and Helper Functions
# First, let's set up the preprocessing and helper functions that will be used by the RecSys models.

import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import timedelta


In [None]:
# Read the raw bookings, parse both date columns, order each trip's rows
# chronologically and attach a combined "<city_id>_<hotel_country>" key.
# Every step returns a new frame, so re-running this cell is idempotent.
train_data = (
    pd.read_csv('train_set.csv')
    .assign(
        checkin=lambda bookings: pd.to_datetime(bookings['checkin']),
        checkout=lambda bookings: pd.to_datetime(bookings['checkout']),
    )
    # Sort by trip first, then by check-in, so each trip reads in visit order
    .sort_values(by=['utrip_id', 'checkin'])
    .assign(
        city_country=lambda bookings: (
            bookings['city_id'].astype(str) + '_' + bookings['hotel_country'].astype(str)
        )
    )
)


In [None]:
# Create city and country chains with additional features

In [2]:
# Initialize tqdm for progress tracking: this call must run before the
# `progress_apply` used on the groupby later in this cell.
tqdm.pandas()

# Trip length in whole days
def calculate_trip_duration(checkin, checkout):
    """Return the ``.days`` component of ``checkout - checkin``.

    Both arguments must support subtraction that yields an object exposing a
    ``.days`` attribute (for example two ``datetime`` values). Partial days are
    not rounded up, and the result is negative when ``checkout`` precedes
    ``checkin``.
    """
    elapsed = checkout - checkin
    return elapsed.days

# Length of a single stay in whole days
def calculate_stay_duration(checkin, checkout):
    """Return the ``.days`` component of ``checkout - checkin`` for one stay.

    Both arguments must support subtraction that yields an object exposing a
    ``.days`` attribute (for example two ``datetime`` values). Partial days are
    not rounded up.
    """
    stay_length = checkout - checkin
    return stay_length.days

# Group by utrip_id and create the city and country chains with additional features
def _summarise_trip(group):
    """Collapse the booking rows of one trip into a single summary record.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of ``train_data`` that share one ``utrip_id``. ``train_data``
        is sorted by ``utrip_id`` and ``checkin`` when it is loaded, so the
        first/last rows are taken as the first/last stays of the trip.

    Returns
    -------
    pandas.Series
        The trip's ``user_id``, its total ``trip_duration`` in days, and one
        list per remaining key with an entry for every stay of the trip.
    """
    # One vectorised subtraction for the whole trip instead of a Python-level
    # row-by-row ``group.apply(..., axis=1)``; it gives the same whole-day
    # count per stay that calculate_stay_duration returns for a single row.
    stay_durations = (group['checkout'] - group['checkin']).dt.days

    return pd.Series({
        'user_id': group['user_id'].iloc[0],
        'cities_chain': list(group['city_id']),
        'countries_chain': list(group['hotel_country']),
        # From the first row's check-in to the last row's check-out
        'trip_duration': calculate_trip_duration(group['checkin'].iloc[0], group['checkout'].iloc[-1]),
        'stay_durations': list(stay_durations),
        'device_classes': list(group['device_class']),
        'affiliate_ids': list(group['affiliate_id']),
        'checkin_months': list(group['checkin'].dt.month),
        'checkin_days_of_week': list(group['checkin'].dt.dayofweek)
    })


trip_chains = train_data.groupby('utrip_id').progress_apply(_summarise_trip).reset_index()

# Rename columns for clarity
trip_chains.columns = ['utrip_id', 'user_id', 'cities_chain', 'countries_chain', 'trip_duration', 'stay_durations',
                       'device_classes', 'affiliate_ids', 'checkin_months', 'checkin_days_of_week']

# Create a new DataFrame for trip chains
trip_chains_df = trip_chains.copy()

# Add city_country chains to the DataFrame. The two chain columns are zipped
# directly rather than going through a row-wise DataFrame.apply; wrapping the
# result in a Series on the frame's own index keeps each trip's chain as one
# list-valued cell.
trip_chains_df['city_country_chain'] = pd.Series(
    [
        [f"{city}_{country}" for city, country in zip(cities, countries)]
        for cities, countries in zip(trip_chains_df['cities_chain'], trip_chains_df['countries_chain'])
    ],
    index=trip_chains_df.index,
)

# Display the first few rows of the trip_chains_df DataFrame
print(trip_chains_df.head())


100%|█████████████████████████████████| 217686/217686 [01:11<00:00, 3049.63it/s]


    utrip_id  user_id                                      cities_chain  \
0  1000027_1  1000027                       [8183, 15626, 60902, 30628]   
1  1000033_1  1000033               [38677, 52089, 21328, 27485, 38677]   
2  1000045_1  1000045  [64876, 55128, 9608, 31817, 36170, 58178, 36063]   
3  1000083_1  1000083                      [55990, 14705, 35160, 36063]   
4   100008_1   100008                 [11306, 12096, 6761, 6779, 65690]   

                                     countries_chain  trip_duration  \
0                   [Gondal, Gondal, Gondal, Gondal]              8   
1  [Cobra Island, Cobra Island, Cobra Island, Cob...             10   
2  [Fook Island, Fook Island, Fook Island, Carpat...             11   
3          [Osterlich, Osterlich, Osterlich, Gondal]              5   
4  [Kamistan, Kamistan, Kamistan, Kamistan, Kamis...              9   

          stay_durations                                     device_classes  \
0           [1, 2, 2, 3]               [des

In [4]:
# Persist the enhanced trip chains as an Excel workbook; the positional
# DataFrame index is left out of the sheet.
output_file = 'trip_chains_enhanced.xlsx'
trip_chains_df.to_excel(excel_writer=output_file, index=False)
print(f'Trip chains written to {output_file}')


Trip chains written to trip_chains_enhanced.xlsx


In [5]:
# Generating transition pairs and calculating transition probabilities
# MARKOV CHAINS

# Every consecutive (current, next) pair of places across all trips; pairing
# a chain with itself shifted by one yields exactly the adjacent steps.
transitions = [
    step
    for chain in trip_chains_df['city_country_chain']
    for step in zip(chain, chain[1:])
]
transitions_df = pd.DataFrame(transitions, columns=['current_place', 'next_place'])

# For each current place, the share of its observed departures that go to each
# next place, pivoted to a current_place x next_place table (0 where a
# transition was never observed).
next_place_shares = transitions_df.groupby('current_place')['next_place'].value_counts(normalize=True)
transition_counts = next_place_shares.unstack(fill_value=0)
print(transition_counts.head())

next_place             10002_Elbonia  10005_Fook Island  10006_Bartovia  \
current_place                                                             
10001_Trans-Carpathia            0.0                0.0             0.0   
10002_Elbonia                    0.0                0.0             0.0   
10006_Bartovia                   0.0                0.0             0.0   
10007_Novistrana                 0.0                0.0             0.0   
10009_Carpathia                  0.0                0.0             0.0   

next_place             10007_Novistrana  10010_Bandaria  10012_Gondal  \
current_place                                                           
10001_Trans-Carpathia               0.0             0.0           0.0   
10002_Elbonia                       0.0             0.0           0.0   
10006_Bartovia                      0.0             0.0           0.0   
10007_Novistrana                    0.0             0.0           0.0   
10009_Carpathia                     

In [6]:
# Predicting the next city-country and adding the predictions to the DataFrame

def predict_next_place(current_place):
    """Return the most probable successor of ``current_place``.

    The place is looked up as a row label of the module-level
    ``transition_counts`` table, and the column label holding that row's
    largest value is returned. Returns ``None`` when the place has no row in
    the table.
    """
    if current_place not in transition_counts.index:
        return None
    outgoing = transition_counts.loc[current_place]
    return outgoing.idxmax()

def _predict_from_chain(chain):
    """Predict the place following the last entry of ``chain``.

    Returns ``None`` for an empty chain; otherwise delegates to
    ``predict_next_place`` with the chain's final element.
    """
    if len(chain) == 0:
        return None
    return predict_next_place(chain[-1])


# One prediction per trip, based only on the trip's final visited place
trip_chains_df['predict_next_city_country'] = trip_chains_df['city_country_chain'].apply(_predict_from_chain)
print(trip_chains_df.head())

    utrip_id  user_id                                      cities_chain  \
0  1000027_1  1000027                       [8183, 15626, 60902, 30628]   
1  1000033_1  1000033               [38677, 52089, 21328, 27485, 38677]   
2  1000045_1  1000045  [64876, 55128, 9608, 31817, 36170, 58178, 36063]   
3  1000083_1  1000083                      [55990, 14705, 35160, 36063]   
4   100008_1   100008                 [11306, 12096, 6761, 6779, 65690]   

                                     countries_chain  trip_duration  \
0                   [Gondal, Gondal, Gondal, Gondal]              8   
1  [Cobra Island, Cobra Island, Cobra Island, Cob...             10   
2  [Fook Island, Fook Island, Fook Island, Carpat...             11   
3          [Osterlich, Osterlich, Osterlich, Gondal]              5   
4  [Kamistan, Kamistan, Kamistan, Kamistan, Kamis...              9   

          stay_durations                                     device_classes  \
0           [1, 2, 2, 3]               [des

In [7]:
# Persist the trip chains, now including the predicted next city-country,
# as an Excel workbook; the positional DataFrame index is left out.
output_file = 'trip_chains_enhanced_withprednextcity.xlsx'
trip_chains_df.to_excel(excel_writer=output_file, index=False)
print(f'Trip chains written to {output_file}')

Trip chains written to trip_chains_enhanced_withprednextcity.xlsx
