#### We'll will compare the following models for their accuracy and precision. 

1. Collaborative Filtering (Item-Based)
2. Markov Chains
3. Random Forest
4. Gradient Boosting Machine
5. Recurrent Neural Networks (RNN) - LSTM

To get us started we will set up the preprocessing and helper functions that will be used by all models.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from tqdm import tqdm


In [2]:
#pip install keras

In [3]:
#pip install tensorflow

In [4]:
#pip install nbformat

In [5]:
# Initialize tqdm for progress tracking
tqdm.pandas()

# Load the datasets
train_data = pd.read_csv('train_set_1.csv')
test_data = pd.read_csv('test_set_1.csv')

# Convert date columns to datetime format
train_data['checkin'] = pd.to_datetime(train_data['checkin'])
train_data['checkout'] = pd.to_datetime(train_data['checkout'])
test_data['checkin'] = pd.to_datetime(test_data['checkin'])
test_data['checkout'] = pd.to_datetime(test_data['checkout'])

# Sort the data by user trip ID and check-in date to maintain the chronological order
train_data.sort_values(by=['utrip_id', 'checkin'], inplace=True)
test_data.sort_values(by=['utrip_id', 'checkin'], inplace=True)

# Create a city_country column
train_data['city_country'] = train_data['city_id'].astype(str) + '_' + train_data['hotel_country'].astype(str)
test_data['city_country'] = test_data['city_id'].astype(str) + '_' + test_data['hotel_country'].astype(str)

# Group by utrip_id to create sequences
train_sequences = train_data.groupby('utrip_id')['city_country'].apply(list).tolist()
test_sequences = test_data.groupby('utrip_id')['city_country'].apply(list).tolist()

# Encode city_country strings as integers
encoder = LabelEncoder()
all_cities_countries = [city_country for seq in train_sequences + test_sequences for city_country in seq]
encoder.fit(all_cities_countries)
encoded_train_sequences = [encoder.transform(seq) for seq in train_sequences]
encoded_test_sequences = [encoder.transform(seq) for seq in test_sequences]

# Prepare data for training models
def prepare_data(sequences):
    X, y = [], []
    for seq in sequences:
        for i in range(1, len(seq)):
            X.append(seq[:i])
            y.append(seq[i])
    X = pad_sequences(X, padding='pre')
    y = np.array(y)
    return X, y

X_train, y_train = prepare_data(encoded_train_sequences)
X_test, y_test = prepare_data(encoded_test_sequences)


  train_data = pd.read_csv('train_set_1.csv')
  train_data['checkin'] = pd.to_datetime(train_data['checkin'])
  train_data['checkout'] = pd.to_datetime(train_data['checkout'])
  test_data['checkin'] = pd.to_datetime(test_data['checkin'])
  test_data['checkout'] = pd.to_datetime(test_data['checkout'])


In [16]:
# Print the first 5 rows of X_train and y_train
print("First 5 rows of X_train:")
print(X_train[:50])

print("\nFirst 5 elements of y_train:")
print(y_train[:50])

# Convert to pandas DataFrame for better readability
X_train_df = pd.DataFrame(X_train)
y_train_df = pd.Series(y_train, name='Target')

# Convert to pandas DataFrame for better readability
X_test_df = pd.DataFrame(X_test)
y_test_df = pd.Series(y_test, name='Target')

# Display the first 5 rows
print("\nFirst 5 rows of X_train (as DataFrame):")
print(X_train_df.head())

print("\nFirst 5 elements of y_train (as DataFrame):")
print(y_train_df.head())

# Display the first 5 rows
print("\nFirst 5 rows of X_train (as DataFrame):")
print(X_test_df.head())

print("\nFirst 5 elements of y_train (as DataFrame):")
print(y_test_df.head())

First 5 rows of X_train:
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0 6377]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0 6377  630]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0 6377  630 5492]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0 3088]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0 3088 4569]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0 3088 4569 1237]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0 3088 4569 1237 1913]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0 5927]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0 5927 4908]
 [   0    0    0    0    0    

In [14]:
unique_city_country = set(city_country for seq in sequences for city_country in seq)
print("Unique city_country values:", unique_city_country)

NameError: name 'sequences' is not defined

### Collaborative Filtering (Item-Based)

In [7]:
#from sklearn.metrics.pairwise import cosine_similarity
# Create item-based collaborative filtering model
#item_sim_matrix = cosine_similarity(np.identity(len(encoder.classes_)))

# Initialize tqdm for progress tracking
tqdm.pandas()

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.preprocessing import OneHotEncoder

classes = encoder.classes_

# Create one-hot encoded matrix
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = onehot_encoder.fit_transform(classes.reshape(-1, 1))

# Calculate cosine similarity matrix
item_sim_matrix = cosine_similarity(onehot_encoded)

def collaborative_filtering_predict(current_place):
    if current_place in encoder.classes_:
        current_idx = encoder.transform([current_place])[0]
        most_similar_idx = np.argmax(item_sim_matrix[current_idx])
        return encoder.inverse_transform([most_similar_idx])[0]
    else:
        return None

collab_preds = [collaborative_filtering_predict(encoder.inverse_transform([seq[-1]])[0]) for seq in encoded_test_sequences]
print("Collab Filtering Complete")

Collab Filtering Complete


### Markov Chains

In [8]:
# Create transition pairs from the city_country chains
transitions = []

for chain in train_sequences:
    for i in range(len(chain) - 1):
        transitions.append((chain[i], chain[i + 1]))

# Create a DataFrame for transitions
transitions_df = pd.DataFrame(transitions, columns=['current_place', 'next_place'])

# Calculate transition probabilities
transition_counts = transitions_df.groupby('current_place')['next_place'].value_counts(normalize=True).unstack(fill_value=0)

# Function to predict the next place based on the current place
def markov_chain_predict(current_place):
    if current_place in transition_counts.index:
        return transition_counts.loc[current_place].idxmax()
    else:
        return None

markov_preds = [markov_chain_predict(encoder.inverse_transform([seq[-1]])[0]) for seq in encoded_test_sequences]
print("Markov Complete")

Markov Complete


### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Create and train the random forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict the next city_country
rf_preds = rf_model.predict(X_test)
rf_preds = encoder.inverse_transform(rf_preds)

print("Random Forest Complete")

ValueError: X has 21 features, but RandomForestClassifier is expecting 20 features as input.

### Gradient Boost

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Create and train the gradient boosting model
gbm_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbm_model.fit(X_train, y_train)

# Predict the next city_country
gbm_preds = gbm_model.predict(X_test)
gbm_preds = encoder.inverse_transform(gbm_preds)

print("GBM Complete")

ValueError: X has 21 features, but GradientBoostingClassifier is expecting 20 features as input.

### Recurrent Neural Networks (RNN) - LSTM

In [None]:
# Define the LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=len(encoder.classes_), output_dim=50, input_length=X_train.shape[1]))
lstm_model.add(LSTM(100, return_sequences=False))
lstm_model.add(Dense(len(encoder.classes_), activation='softmax'))

# Compile the model
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

# Predict the next city_country
lstm_preds = lstm_model.predict(X_test)
lstm_preds = np.argmax(lstm_preds, axis=1)
lstm_preds = encoder.inverse_transform(lstm_preds)

print("LSTM Complete")


### Model Comparison

In [13]:
# Helper function to evaluate models
def evaluate_model(y_true, y_pred):
    y_true_encoded = encoder.transform(y_true)
    y_pred_encoded = encoder.transform(y_pred)
    accuracy = accuracy_score(y_true_encoded, y_pred_encoded)
    precision = precision_score(y_true_encoded, y_pred_encoded, average='weighted')
    return accuracy, precision

# Evaluate all models
#collab_accuracy, collab_precision = evaluate_model(y_test, collab_preds)
markov_accuracy, markov_precision = evaluate_model(y_test, markov_preds)
#rf_accuracy, rf_precision = evaluate_model(y_test, rf_preds)
#gbm_accuracy, gbm_precision = evaluate_model(y_test, gbm_preds)
#lstm_accuracy, lstm_precision = evaluate_model(y_test, lstm_preds)

# Print the results
#print(f"Collaborative Filtering - Accuracy: {collab_accuracy:.2f}, Precision: {collab_precision:.2f}")
print(f"Markov Chains - Accuracy: {markov_accuracy:.2f}, Precision: {markov_precision:.2f}")
#print(f"Random Forest - Accuracy: {rf_accuracy:.2f}, Precision: {rf_precision:.2f}")
#print(f"Gradient Boosting - Accuracy: {gbm_accuracy:.2f}, Precision: {gbm_precision:.2f}")
#print(f"LSTM - Accuracy: {lstm_accuracy:.2f}, Precision: {lstm_precision:.2f}")


ValueError: y contains previously unseen labels: '3452'

In [None]:
# Convert collab_preds to DataFrame
collab_preds_df = pd.DataFrame(collab_preds, columns=['predicted_next_city_country'])

# Optionally, add a column for the original test sequences for reference
# Assuming you want to add the last element from each sequence as the current city
current_city = [encoder.inverse_transform([seq[-1]])[0] for seq in encoded_test_sequences]
collab_preds_df['current_city_country'] = current_city

# Save the DataFrame to a CSV file
output_file = 'collab_predictions.csv'
collab_preds_df.to_csv(output_file, index=False)
print(f'Predictions written to {output_file}')

In [None]:
# Convert markov_preds to DataFrame
markov_preds_df = pd.DataFrame(markov_preds, columns=['predicted_next_city_country'])

# Optionally, add a column for the original test sequences for reference
# Assuming you want to add the last element from each sequence as the current city
current_city = [encoder.inverse_transform([seq[-1]])[0] for seq in encoded_test_sequences]
markov_preds_df['current_city_country'] = current_city

# Save the DataFrame to a CSV file
output_file = 'markov_predictions.csv'
markov_preds_df.to_csv(output_file, index=False)
print(f'Predictions written to {output_file}')


In [None]:

# Convert gbm_preds to DataFrame
gbm_preds_df = pd.DataFrame(gbm_preds, columns=['predicted_next_city_country'])

# Optionally, add a column for the original test sequences for reference
# Assuming you want to add the last element from each sequence as the current city
current_city = [encoder.inverse_transform([seq[-1]])[0] for seq in encoded_test_sequences]
gbm_preds_df['current_city_country'] = current_city

# Save the DataFrame to a CSV file
output_file = 'gbm_predictions.csv'
gbm_preds_df.to_csv(output_file, index=False)
print(f'Predictions written to {output_file}')


In [None]:
# Convert lstm_preds to DataFrame
lstm_preds_df = pd.DataFrame(lstm_preds, columns=['predicted_next_city_country'])

# Optionally, add a column for the original test sequences for reference
# Assuming you want to add the last element from each sequence as the current city
current_city = [encoder.inverse_transform([seq[-1]])[0] for seq in encoded_test_sequences]
lstm_preds_df['current_city_country'] = current_city

# Save the DataFrame to a CSV file
output_file = 'lstm_predictions.csv'
lstm_preds_df.to_csv(output_file, index=False)
print(f'Predictions written to {output_file}')