In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor  # Assuming it's a regression task


In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [4]:
df= pd.read_csv('cleaned_and_visualized_data.csv')
df

Unnamed: 0,user_id,Yearly_avg_view_on_travel_page,frequentflyer,preferred_device,total_likes_on_outstation_checkin_given,yearly_avg_Outstation_checkins,annual_incom_class,member_in_family,booking_hotal,preferred_location_type,working_flag,travelling_rating
0,100001,96.0,0.0,Desktop,168.0,11.0,Avera,3.0,Yes,OTHER,1.0,3.0
1,100002,67.0,0.0,Mobile,875.0,8.0,Low,3.0,No,OTHER,1.0,2.0
2,100003,76.0,1.0,Mobile,67.0,10.0,Avera,5.0,Yes,Beach,1.0,1.0
3,100004,64.0,1.0,Mobile,249.0,10.0,High,1.0,No,Big Cities,1.0,3.0
4,100005,21.0,1.0,Mobile,695.0,4.0,High,3.0,Yes,OTHER,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,109996,35.0,1.0,Mobile,97.0,8.0,High,2.0,No,Historical Site,1.0,2.0
9996,109997,92.0,0.0,Mobile,61.0,1.0,Avera,5.0,Yes,OTHER,1.0,5.0
9997,109998,6.0,0.0,Desktop,535.0,1.0,High,1.0,No,OTHER,1.0,2.0
9998,109999,96.0,0.0,Desktop,49.0,4.0,Avera,2.0,Yes,Big Cities,1.0,1.0


In [20]:
# Define categorical and numerical features
categorical_features = ['preferred_device', 'annual_incom_class', 'booking_hotal', 'preferred_location_type', 'working_flag']
numerical_features = ['Yearly_avg_view_on_travel_page', 'frequentflyer', 'total_likes_on_outstation_checkin_given', 
                      'yearly_avg_Outstation_checkins', 'member_in_family', 'travelling_rating']


In [21]:
# Create preprocessing pipeline with imputation and one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features),
    ])

In [22]:
# Drop rows with missing values in the target variable
df.dropna(subset=['Daily_Avg_mins_spend_on_traveling_page'], inplace=True)

# Assuming 'Daily_Avg_mins_spend_on_traveling_page' is the target variable
X = df.drop('Daily_Avg_mins_spend_on_traveling_page', axis=1)
y = df['Daily_Avg_mins_spend_on_traveling_page']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
# Create the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Fit the model on training data
model.fit(X_train, y_train)

ValueError: A given column is not a column of the dataframe

In [None]:
# Evaluate the model on test data
score = model.score(X_test, y_test)

# Convert R2 score to percentage
score_percentage = score * 100

print(f'Model R2 Score: {score_percentage:.2f}%')



Model R2 Score: 82.44%


In [None]:
# Import the linear regression model
from sklearn.linear_model import LinearRegression

# Create a new pipeline with the linear regression model
linear_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Fit the linear model on training data
linear_model.fit(X_train, y_train)

# Evaluate the linear model on test data
linear_score = linear_model.score(X_test, y_test)
print(f'Linear Model R2 Score: {linear_score}')

# Save the linear model
import joblib
joblib.dump(linear_model, 'linear_model.pkl')

# Print a message indicating the successful model fitting and evaluation
print('Linear Model trained and evaluated successfully!')


Linear Model R2 Score: 0.6807047967092081
Linear Model trained and evaluated successfully!


In [None]:
# Import additional regression models
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Create pipelines for additional models
ridge_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge())
])

svr_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR())
])

knn_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor())
])

# List of models for iteration
models = {
    'Random Forest': model,
    'Linear Regression': linear_model,
    'Ridge Regression': ridge_model,
    'Support Vector Regression': svr_model,
    'K-Nearest Neighbors': knn_model
}

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(f'{model_name} R2 Score: {score}')

# Save the best model based on R2 score
best_model = max(models, key=lambda k: model.score(X_test, y_test))
joblib.dump(models[best_model], f'{best_model}_best_model.pkl')

# Print a message indicating the successful model fitting and evaluation
print(f'{best_model} is the best model, and it has been saved successfully!')


Random Forest R2 Score: 0.8243807304965192
Linear Regression R2 Score: 0.6807047967092081
Ridge Regression R2 Score: 0.6805372440679939
Support Vector Regression R2 Score: -0.012404149662818442
K-Nearest Neighbors R2 Score: 0.541590104906603
Random Forest is the best model, and it has been saved successfully!


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Create a pipeline for the Gradient Boosting model
gradient_boosting_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Train and evaluate the Gradient Boosting model
gradient_boosting_model.fit(X_train, y_train)
gradient_boosting_score = gradient_boosting_model.score(X_test, y_test)
print(f'Gradient Boosting R2 Score: {gradient_boosting_score}')

# Save the Gradient Boosting model
joblib.dump(gradient_boosting_model, 'gradient_boosting_model.pkl')

# Print a message indicating the successful model fitting and evaluation
print('Gradient Boosting model has been saved successfully!')


Gradient Boosting R2 Score: 0.7594870730049668
Gradient Boosting model has been saved successfully!


In [None]:
pip install xgboost

Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/24/ec/ad387100fa3cc2b9b81af0829b5ecfe75ec5bb19dd7c19d4fea06fb81802/xgboost-2.0.3-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.2/99.8 MB 5.0 MB/s eta 0:00:21
   ---------------------------------------- 0.3/99.8 MB 3.3 MB/s eta 0:00:31
   ---------------------------------------- 0.3/99.8 MB 3.1 MB/s eta 0:00:33
   ---------------------------------------- 0.4/99.8 MB 2.6 MB/s eta 0:00:39
   ---------------------------------------- 0.5/99.8 MB 2.1 MB/s eta 0:00:49
   ---------------------------------------- 0.5/99.8 MB 2.1 MB/s eta 0:00:49
   ---------------------------------------- 0.5/99


[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


   ---------------------------------------- 1.0/99.8 MB 1.0 MB/s eta 0:01:36
   ---------------------------------------- 1.1/99.8 MB 1.0 MB/s eta 0:01:35
   ---------------------------------------- 1.1/99.8 MB 1.1 MB/s eta 0:01:34
   ---------------------------------------- 1.1/99.8 MB 1.0 MB/s eta 0:01:35
   ---------------------------------------- 1.1/99.8 MB 997.1 kB/s eta 0:01:39
   ---------------------------------------- 1.2/99.8 MB 1.0 MB/s eta 0:01:36
    --------------------------------------- 1.3/99.8 MB 1.1 MB/s eta 0:01:31
    --------------------------------------- 1.4/99.8 MB 1.1 MB/s eta 0:01:27
    --------------------------------------- 1.5/99.8 MB 1.1 MB/s eta 0:01:28
    --------------------------------------- 1.7/99.8 MB 1.2 MB/s eta 0:01:19
    --------------------------------------- 2.0/99.8 MB 1.4 MB/s eta 0:01:13
    --------------------------------------- 2.1/99.8 MB 1.4 MB/s eta 0:01:09
    --------------------------------------- 2.3/99.8 MB 1.5 MB/s eta 0:01:

In [None]:
from xgboost import XGBRegressor

# Create a pipeline for the XGBoost model
xgboost_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42))
])

# Train and evaluate the XGBoost model
xgboost_model.fit(X_train, y_train)
xgboost_score = xgboost_model.score(X_test, y_test)
print(f'XGBoost R2 Score: {xgboost_score}')

# Save the XGBoost model
joblib.dump(xgboost_model, 'xgboost_model.pkl')

# Print a message indicating the successful model fitting and evaluation
print('XGBoost model has been saved successfully!')


XGBoost R2 Score: 0.8831014748722026
XGBoost model has been saved successfully!


In [None]:
pip install lightgbm

Collecting lightgbmNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip



  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/e1/4c/4685ccfae9806f561de716e32549190c1f533dde5bcadaf83bdf23972cf0/lightgbm-4.3.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.3.0-py3-none-win_amd64.whl.metadata (19 kB)
Downloading lightgbm-4.3.0-py3-none-win_amd64.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.3 MB 1.2 MB/s eta 0:00:02
   -------- ------------------------------- 0.3/1.3 MB 2.9 MB/s eta 0:00:01
   --------------- ------------------------ 0.5/1.3 MB 3.7 MB/s eta 0:00:01
   ------------------- -------------------- 0.7/1.3 MB 3.8 MB/s eta 0:00:01
   ----------------------- ---------------- 0.8/1.3 MB 3.3 MB/s eta 0:00:01
   ------------------------ --------------- 0.8/1.3 MB 3.0 MB/s eta 0:00:01
   ------------------------ --------------- 0.8/1.3 MB 2.7 MB/s eta 0:

In [None]:
from lightgbm import LGBMRegressor

# Create a pipeline for the LightGBM model
lgbm_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LGBMRegressor(random_state=42))
])

# Train and evaluate the LightGBM model
lgbm_model.fit(X_train, y_train)
lgbm_score = lgbm_model.score(X_test, y_test)
print(f'LightGBM R2 Score: {lgbm_score}')

# Save the LightGBM model
joblib.dump(lgbm_model, 'lgbm_model.pkl')

# Print a message indicating the successful model fitting and evaluation
print('LightGBM model has been saved successfully!')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000846 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1145
[LightGBM] [Info] Number of data points in the train set: 9407, number of used features: 29
[LightGBM] [Info] Start training from score 13.624854
LightGBM R2 Score: 0.8245744813453761
LightGBM model has been saved successfully!


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import r2_score

# Convert y_train and y_test to NumPy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

# Create preprocessing pipeline with imputation and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features),
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_features),
    ])

# Fit the ColumnTransformer on the training data
preprocessor.fit(X_train)

# Build a neural network model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(preprocessor.transform(X_train), y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

# Evaluate the model on test data
y_pred = model.predict(preprocessor.transform(X_test))
neural_network_score = r2_score(y_test, y_pred)
print(f'Neural Network R2 Score: {neural_network_score}')

# Save the neural network model
model.save('neural_network_model.h5')

# Print a message indicating the successful model fitting and evaluation
print('Neural network model has been saved successfully!')


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).