# In [1]:
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import pickle

# Load dataset
df = pd.read_csv("Merged_Dataset.csv")

# Handling missing values: numeric columns receive their column median;
# anything still missing afterwards (the non-numeric columns) becomes "Unknown".
df = df.fillna(df.median(numeric_only=True))
df = df.fillna("Unknown")

# Drop duplicates
df = df.drop_duplicates()

# Handling outliers: keep only rows rated at most 5. The explicit copy makes
# the filtered frame independent of the original, so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
df = df[df["Rating"] <= 5].copy()

# Encoding categorical variables. Each fitted encoder is kept so the integer
# codes can be mapped back to the original labels later
# (VisitMode_x / VisitMode_y are not encoded here).
label_encoders = {}
for col in ["ContenentId", "CountryId_x", "AttractionType"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Feature Engineering (computed from the raw ratings, before any scaling)
df["User_Visit_Count"] = df.groupby("UserId")["AttractionId"].transform("count")
df["Avg_User_Rating"] = df.groupby("UserId")["Rating"].transform("mean")
df["Attraction_Popularity"] = df.groupby("AttractionId")["Rating"].transform("mean")

# Scaling numerical features.
# The engineered rating features are min-max scaled exactly once; scaling
# columns that already span [0, 1] a second time changes nothing.
engineered_features = ["Avg_User_Rating", "Attraction_Popularity"]
scaler = MinMaxScaler()
df[engineered_features] = scaler.fit_transform(df[engineered_features])

# Rating is standardized with its own scaler object so that both fitted
# scalers stay available. From here on df["Rating"] holds z-scores rather
# than the raw rating values.
rating_scaler = StandardScaler()
df[["Rating"]] = rating_scaler.fit_transform(df[["Rating"]])

# Splitting dataset for models
# Regression task: predict Rating (as stored in df) from location, month and
# attraction type. 80/20 split with a fixed seed.
X_reg = df[["ContenentId", "CountryId_x", "VisitMonth", "AttractionTypeId"]]
y_reg = df["Rating"]
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# The forest is seeded with the same value as the split; without a seed the
# fitted (and later pickled) model would differ on every run.
model_reg = RandomForestRegressor(random_state=42)
model_reg.fit(X_train_reg, y_train_reg)

# Classification task: predict VisitMode_y from location, the user's visit
# count and the attraction's popularity score. Same split settings as above.
X_clf = df[["ContenentId", "CountryId_x", "User_Visit_Count", "Attraction_Popularity"]]
y_clf = df["VisitMode_y"]
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42
)

model_clf = RandomForestClassifier(random_state=42)
model_clf.fit(X_train_clf, y_train_clf)

# Recommendation System Functions
def recommend_attractions_content_based(attraction_id, df, n=5):
    """Return the attraction names of the rows nearest to a given attraction.

    Every row of ``df`` is represented by its ``(AttractionType, UserId)``
    pair and rows are compared with cosine distance. The first row whose
    ``AttractionId`` equals ``attraction_id`` is used as the query point.

    Args:
        attraction_id: Value to look up in the ``AttractionId`` column.
        df: DataFrame with ``AttractionId``, ``AttractionType``, ``UserId``
            and ``Attraction`` columns.
        n: Number of neighbouring rows to return. Capped at the number of
            rows in ``df``.

    Returns:
        The ``Attraction`` values of the nearest rows, closest first. An
        empty list when ``attraction_id`` does not occur in ``df``.

    Note:
        Neighbours are rows of ``df``, not distinct attractions. If ``df``
        holds several rows per attraction, the result may repeat a name or
        contain the queried attraction itself.
    """
    feature_cols = ["AttractionType", "UserId"]

    matches = df[df["AttractionId"] == attraction_id]
    if matches.empty:
        # Nothing to query with; kneighbors would reject an empty array.
        return []

    # Only the neighbours of the first matching row are used, so query with
    # that single row instead of every matching row. Keeping it a DataFrame
    # preserves the column names the model is fitted with.
    query = matches[feature_cols].iloc[[0]]

    # kneighbors cannot return more neighbours than there are fitted rows.
    knn = NearestNeighbors(n_neighbors=min(n, len(df)), metric="cosine")
    knn.fit(df[feature_cols])
    _, indices = knn.kneighbors(query)

    return df.iloc[indices[0]]["Attraction"].tolist()

def recommend_attractions_collaborative(user_id, df, n=5):
    """Recommend attraction ids based on the ratings of similar users.

    Builds a user x attraction matrix of mean ratings, finds the users whose
    rating vectors are closest to ``user_id`` by cosine distance, and
    collects each similar user's highest-rated attractions.

    Args:
        user_id: Value to look up in the ``UserId`` column.
        df: DataFrame with ``UserId``, ``AttractionId`` and ``Rating``
            columns.
        n: Maximum number of similar users to consult, attractions to take
            from each of them, and recommendations to return.

    Returns:
        Up to ``n`` distinct ``AttractionId`` values, ordered by neighbour
        similarity first and by that neighbour's rating second. An empty
        list when ``user_id`` has no row in the rating matrix.
    """
    user_item_matrix = df.pivot_table(
        index="UserId", columns="AttractionId", values="Rating", aggfunc="mean"
    )
    if user_id not in user_item_matrix.index:
        return []

    # Unrated cells become 0 so the matrix can be stored sparsely.
    user_sparse_matrix = csr_matrix(user_item_matrix.fillna(0))
    knn = NearestNeighbors(metric="cosine", algorithm="brute")
    knn.fit(user_sparse_matrix)

    user_index = user_item_matrix.index.get_loc(user_id)
    # One extra neighbour because the user is normally their own nearest
    # neighbour; never ask for more neighbours than there are users.
    n_neighbors = min(n + 1, user_sparse_matrix.shape[0])
    _, indices = knn.kneighbors(user_sparse_matrix[user_index], n_neighbors=n_neighbors)

    # Remove the user explicitly rather than assuming position 0: with tied
    # distances (identical rating vectors) another user may be listed first.
    similar_users = [idx for idx in indices.flatten() if idx != user_index][:n]

    recommendations = []
    for sim_user in similar_users:
        # Drop attractions this neighbour never rated so that NaN entries
        # cannot be recommended when they rated fewer than n attractions.
        rated = user_item_matrix.iloc[sim_user].dropna()
        recommendations.extend(rated.sort_values(ascending=False).index[:n])

    # dict.fromkeys removes duplicates while keeping first-seen order, which
    # preserves the ranking that set() would discard.
    return list(dict.fromkeys(recommendations))[:n]

# SQLite Integration
# Columns of merged_dataset in table order. This one list selects the
# DataFrame columns and sizes the INSERT placeholder list, so the two cannot
# drift apart from each other.
sql_columns = [
    'TransactionId', 'UserId', 'VisitYear', 'VisitMonth', 'VisitMode_x', 'AttractionId', 'Rating',
    'ContenentId', 'RegionId', 'CountryId_x', 'CityId_x', 'AttractionCityId', 'AttractionTypeId',
    'Attraction', 'AttractionAddress', 'CityId_y', 'CityName', 'CountryId_y', 'AttractionType',
    'VisitModeId', 'VisitMode_y',
]

conn = sqlite3.connect('tourism_db.db')
try:
    cursor = conn.cursor()

    cursor.execute('''
        CREATE TABLE IF NOT EXISTS merged_dataset (
            TransactionId INTEGER,
            UserId INTEGER,
            VisitYear INTEGER,
            VisitMonth INTEGER,
            VisitMode_x INTEGER,
            AttractionId INTEGER,
            Rating REAL,
            ContenentId INTEGER,
            RegionId INTEGER,
            CountryId_x INTEGER,
            CityId_x INTEGER,
            AttractionCityId INTEGER,
            AttractionTypeId INTEGER,
            Attraction TEXT,
            AttractionAddress TEXT,
            CityId_y INTEGER,
            CityName TEXT,
            CountryId_y INTEGER,
            AttractionType INTEGER,
            VisitModeId INTEGER,
            VisitMode_y INTEGER
        )
    ''')

    df_sql = df[sql_columns]

    # Turn NaN into None so missing values are bound as SQL NULL.
    df_sql = df_sql.replace({np.nan: None})
    data_tuples = [tuple(x) for x in df_sql.to_numpy()]

    placeholders = ", ".join("?" * len(sql_columns))
    insert_query = "INSERT OR REPLACE INTO merged_dataset VALUES ({})".format(placeholders)

    # The table declares no PRIMARY KEY or UNIQUE constraint, so
    # "OR REPLACE" never finds a conflicting row and a rerun would append a
    # second copy of every row. Clearing the table first makes the load
    # repeatable. The DELETE and the INSERTs are committed together, so a
    # failure part-way leaves the previous contents untouched.
    cursor.execute("DELETE FROM merged_dataset")
    cursor.executemany(insert_query, data_tuples)
    conn.commit()
finally:
    # Release the database file even when one of the statements above fails.
    conn.close()

# Pickle files creation
# Each file is opened in a with-block so its handle is flushed and closed
# as soon as the dump finishes.
pickle_targets = {
    'model_reg.pkl': model_reg,
    'model_clf.pkl': model_clf,
    'df.pkl': df,
    'label_encoders.pkl': label_encoders,
}
for pickle_path, pickle_obj in pickle_targets.items():
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(pickle_obj, pickle_file)

print("Pickle files created successfully!")

# Output: Pickle files created successfully!
