<a href="https://colab.research.google.com/github/dharalakshmi/Tourism-Routes/blob/main/Merging%204%20datsets%20with%20api_integration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install the Kaggle command-line client (quiet mode) via an IPython shell escape.
!pip install -q kaggle

# Ask the user to upload their Kaggle API credentials through the Colab file helper.
from google.colab import files
print("Upload your kaggle.json file:")
uploaded = files.upload()

# Move the credentials into /root/.kaggle and restrict them to owner read/write.
import os, shutil, stat
os.makedirs("/root/.kaggle", exist_ok=True)
# NOTE(review): this assumes the upload landed in the working directory under the
# exact name "kaggle.json"; the `uploaded` result above is not consulted.
shutil.move("kaggle.json", "/root/.kaggle/kaggle.json")
os.chmod("/root/.kaggle/kaggle.json", stat.S_IRUSR | stat.S_IWUSR)

# Print the installed CLI version as a quick check that the client runs.
!kaggle --version


Upload your kaggle.json file:


Saving kaggle.json to kaggle.json
Kaggle API 1.7.4.5


In [4]:
# Kaggle dataset slugs ("owner/dataset-name") to download.
datasets = [
    "saketk511/travel-dataset-guide-to-indias-must-see-places",
    "naqibahmedkadri/famous-indian-tourist-places",
    "kbdharun/a-star-algorithm-route-planning-dataset",
    "parulpandey/indian-cities-database",
]

# Download root: BASE_DIR when that name is defined in the notebook globals,
# otherwise /content/data.
SAVE_ROOT = globals().get('BASE_DIR', '/content/data')
import os
os.makedirs(SAVE_ROOT, exist_ok=True)

for slug in datasets:
    # Each dataset gets its own sub-folder named after the part of the slug after "/".
    name = slug.split('/')[-1]
    outdir = os.path.join(SAVE_ROOT, name)
    os.makedirs(outdir, exist_ok=True)
    print(f"Downloading {slug} to {outdir}")
    # Download and unzip into outdir; {slug} and {outdir} are interpolated by IPython.
    !kaggle datasets download -d {slug} -p {outdir} --unzip
    # Show the first lines of the directory listing to check what was extracted.
    !ls -lh {outdir} | head -n 10


Downloading saketk511/travel-dataset-guide-to-indias-must-see-places to /content/data/travel-dataset-guide-to-indias-must-see-places
Dataset URL: https://www.kaggle.com/datasets/saketk511/travel-dataset-guide-to-indias-must-see-places
License(s): DbCL-1.0
Downloading travel-dataset-guide-to-indias-must-see-places.zip to /content/data/travel-dataset-guide-to-indias-must-see-places
  0% 0.00/9.19k [00:00<?, ?B/s]
100% 9.19k/9.19k [00:00<00:00, 25.7MB/s]
total 36K
-rw-r--r-- 1 root root 35K Aug 16 16:24 Top Indian Places to Visit.csv
Downloading naqibahmedkadri/famous-indian-tourist-places to /content/data/famous-indian-tourist-places
Dataset URL: https://www.kaggle.com/datasets/naqibahmedkadri/famous-indian-tourist-places
License(s): unknown
Downloading famous-indian-tourist-places.zip to /content/data/famous-indian-tourist-places
  0% 0.00/402k [00:00<?, ?B/s]
100% 402k/402k [00:00<00:00, 588MB/s]
total 1.2M
-rw-r--r-- 1 root root 123K Aug 16 16:24 City.csv
-rw-r--r-- 1 root root 1.1M A

In [6]:
# Install the googlemaps client package that the import cell below loads.
!pip install googlemaps

Collecting googlemaps
  Downloading googlemaps-4.10.0.tar.gz (33 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: googlemaps
  Building wheel for googlemaps (setup.py) ... [?25l[?25hdone
  Created wheel for googlemaps: filename=googlemaps-4.10.0-py3-none-any.whl size=40714 sha256=f69dfefb685f408d26473a2dd262c9da0ffa60e49314b002e8ed0c3d209e84d6
  Stored in directory: /root/.cache/pip/wheels/f1/09/77/3cc2f5659cbc62341b30f806aca2b25e6a26c351daa5b1f49a
Successfully built googlemaps
Installing collected packages: googlemaps
Successfully installed googlemaps-4.10.0


In [7]:
import pandas as pd
import numpy as np
import googlemaps
from math import radians, sin, cos, sqrt, atan2
import time
import requests
from geopy.geocoders import Nominatim


In [8]:
def load_datasets():
    """Read the four source CSV files and return them as DataFrames.

    Files are looked up under ``BASE_DIR`` when that name exists in the module
    globals, otherwise under ``/content/data``. Each CSV sits in a sub-folder
    named after its Kaggle dataset.

    Returns:
        tuple: ``(df1, df2, df3, df4)`` in this order -- must-see places guide,
        famous tourist places, A* route-planning dataset, Indian cities database.

    Raises:
        Whatever ``pd.read_csv`` raises when a file is missing or unreadable.
    """
    print("Loading datasets...")

    data_root = globals().get('BASE_DIR', '/content/data')

    # (dataset folder, CSV file name), listed in the order the frames are returned.
    csv_locations = (
        ('travel-dataset-guide-to-indias-must-see-places', 'Top Indian Places to Visit.csv'),
        ('famous-indian-tourist-places', 'Places.csv'),
        ('a-star-algorithm-route-planning-dataset', 'indian-cities-dataset.csv'),
        ('indian-cities-database', 'Indian Cities Database.csv'),
    )

    frames = []
    for number, (folder, filename) in enumerate(csv_locations, start=1):
        frame = pd.read_csv(os.path.join(data_root, folder, filename))
        print(f"Dataset {number} shape: {frame.shape}")
        frames.append(frame)

    return tuple(frames)

In [9]:
# Step 2: Data Cleaning and Standardization
def clean_city_names(df, city_column):
    """Standardize the city names in ``df[city_column]`` for consistent merging.

    Non-missing values are converted to strings, stripped of surrounding
    whitespace and title-cased; a few legacy city names are then mapped to
    their current names. Missing values stay missing, so they are not turned
    into the literal string "Nan" and cannot act as a join key later.

    Args:
        df: DataFrame holding the column to clean. It is modified in place.
        city_column: Name of the column with city names.

    Returns:
        The same ``df`` object, with ``city_column`` replaced by the cleaned values.

    Raises:
        KeyError: If ``city_column`` is not a column of ``df``.
    """
    raw_names = df[city_column]

    normalized = raw_names.astype(str).str.strip().str.title()
    # astype(str) can stringify missing entries ("nan"/"None"); restore them as
    # missing so that rows without a city do not all share a fake "Nan" city.
    normalized = normalized.where(raw_names.notna())

    # Handle common city name variations (keys are title-cased to match the
    # normalization above; only whole-value matches are replaced).
    city_mappings = {
        'Bombay': 'Mumbai',
        'Madras': 'Chennai',
        'Calcutta': 'Kolkata',
        'Bangalore': 'Bengaluru',
        'Mysore': 'Mysuru',
        'Trivandrum': 'Thiruvananthapuram'
    }

    df[city_column] = normalized.replace(city_mappings)
    return df

In [10]:

def standardize_datasets(df1, df2, df3, df4):
    """Clean the city-name columns of all four datasets.

    For each frame the city column is located by name and passed through
    ``clean_city_names``. A frame with none of the expected columns is
    returned unchanged.

    Args:
        df1: Tourism places guide; cleans ``City`` or, failing that, ``city``.
        df2: Famous places; cleans ``Location`` or, failing that, ``city``.
        df3: Routes; cleans ``start_city`` and ``end_city``, each only if present.
        df4: Cities; cleans ``city_name`` or, failing that, ``City``.

    Returns:
        tuple: ``(df1, df2, df3, df4)`` after cleaning.
    """
    print("Standardizing datasets...")

    def _clean_first_present(frame, candidates):
        """Clean the first column in ``candidates`` that ``frame`` has, if any."""
        for column in candidates:
            if column in frame.columns:
                return clean_city_names(frame, column)
        return frame

    # Clean Dataset 1 - Tourism places guide
    df1 = _clean_first_present(df1, ('City', 'city'))

    # Clean Dataset 2 - Famous places
    df2 = _clean_first_present(df2, ('Location', 'city'))

    # Clean Dataset 3 - Routes. Each endpoint column is checked on its own so a
    # frame with only one of them does not raise KeyError.
    for endpoint_column in ('start_city', 'end_city'):
        if endpoint_column in df3.columns:
            df3 = clean_city_names(df3, endpoint_column)

    # Clean Dataset 4 - Cities
    df4 = _clean_first_present(df4, ('city_name', 'City'))

    return df1, df2, df3, df4

In [11]:
# Step 3: Merge Tourism Datasets (Dataset 1 + Dataset 2)
def merge_tourism_data(df1, df2):
    """Combine the guide dataset (``df1``) with the famous-places dataset (``df2``).

    Both frames first have their columns renamed to a shared vocabulary, then
    they are outer-joined on the ``(city, place_name)`` pair.

    Args:
        df1: Guide dataset; ``Place``, ``City``, ``State``, ``Description`` and
            ``Type`` are renamed when present.
        df2: Famous-places dataset; ``Name``, ``Location``, ``State``,
            ``Latitude``, ``Longitude`` and ``Rating`` are renamed when present.

    Returns:
        DataFrame with one row per distinct ``(city, place_name)`` match group.
        Non-key columns present in both inputs get the suffixes ``_guide``
        and ``_famous``.

    Raises:
        KeyError: If either renamed frame lacks ``city`` or ``place_name``.
    """
    print("Merging tourism datasets...")

    # Source header -> shared column name. ``rename`` ignores headers a frame
    # does not have, so adjust these to the actual CSV headers if they differ.
    guide_renames = dict(
        Place='place_name',
        City='city',
        State='state',
        Description='description',
        Type='place_type',
    )
    famous_renames = dict(
        Name='place_name',
        Location='city',
        State='state',
        Latitude='latitude',
        Longitude='longitude',
        Rating='rating',
    )

    guide = df1.rename(columns=guide_renames)
    famous = df2.rename(columns=famous_renames)

    # The outer join on city + place name keeps places found in only one source.
    tourism_merged = guide.merge(
        famous,
        how='outer',
        on=['city', 'place_name'],
        suffixes=('_guide', '_famous'),
    )

    print(f"Merged tourism dataset shape: {tourism_merged.shape}")
    return tourism_merged

In [12]:
# Step 4: Add City Coordinates (Dataset 4)
def add_city_coordinates(tourism_df, cities_df):
    """Attach city-level data to each tourism row and backfill coordinates.

    The cities frame is renamed to a shared vocabulary and left-joined onto
    ``tourism_df`` by ``city``. Place-level ``latitude``/``longitude`` values
    that are missing are then filled from the city's coordinates.

    Args:
        tourism_df: Tourism places with a ``city`` column. ``latitude`` and
            ``longitude`` are optional; when absent they are created from the
            city coordinates.
        cities_df: City lookup. ``city_name`` (or ``City`` when neither
            ``city_name`` nor ``city`` exists), ``state_name``, ``lat``, ``lng``
            and ``population`` are renamed when present.

    Returns:
        New DataFrame with the same number of rows as ``tourism_df`` plus the
        columns of the cities lookup.

    Raises:
        KeyError: If no city column can be found in either frame, or the
            cities lookup has no ``lat``/``lng`` columns.
    """
    print("Adding city coordinates...")

    # Standardize cities dataset column names
    cities_cols = {
        'city_name': 'city',
        'state_name': 'state',
        'lat': 'city_latitude',
        'lng': 'city_longitude',
        'population': 'city_population'
    }
    # standardize_datasets also accepts a 'City' header for this dataset, so
    # honour it here too, but only when it cannot clash with another city column.
    if 'City' in cities_df.columns and not {'city_name', 'city'} & set(cities_df.columns):
        cities_cols['City'] = 'city'

    cities_renamed = cities_df.rename(columns=cities_cols)

    # A left join repeats a tourism row once per matching lookup row, and pandas
    # matches missing keys against each other. Keep one row per named city
    # (the first one) so the join cannot add or cross-link rows.
    cities_renamed = (
        cities_renamed
        .dropna(subset=['city'])
        .drop_duplicates(subset=['city'], keep='first')
    )

    # Merge tourism data with city coordinates
    final_tourism = pd.merge(tourism_df, cities_renamed,
                            on='city', how='left')

    # Fill missing place coordinates with the city's coordinates.
    for place_col, city_col in (('latitude', 'city_latitude'),
                                ('longitude', 'city_longitude')):
        city_values = final_tourism[city_col]
        if place_col in final_tourism.columns:
            final_tourism[place_col] = final_tourism[place_col].fillna(city_values)
        else:
            # No place-level coordinate at all: fall back to the city value.
            final_tourism[place_col] = city_values

    print(f"Tourism dataset with coordinates shape: {final_tourism.shape}")
    return final_tourism

In [13]:
# Step 5: Haversine Distance Calculation (Backup for API)
def haversine_distance(lat1, lon1, lat2, lon2):
    """Calculate the great-circle ("straight-line") distance between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        Distance in kilometres as a float, using an Earth radius of 6371 km.
        ``np.nan`` is returned when any coordinate is missing.
    """
    if pd.isna(lat1) or pd.isna(lon1) or pd.isna(lat2) or pd.isna(lon2):
        return np.nan

    R = 6371  # Earth's radius in kilometers

    # Convert to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError; clamp it to [0, 1].
    a = min(1.0, max(0.0, a))
    distance = 2 * R * atan2(sqrt(a), sqrt(1 - a))

    return distance

In [14]:
# Step 6: Google Maps API Integration
class GoogleMapsIntegration:
    """Thin wrapper around the ``googlemaps`` client for distance lookups."""

    def __init__(self, api_key):
        """Initialize Google Maps client with the given API key."""
        self.gmaps = googlemaps.Client(key=api_key)

    def get_distance_matrix(self, origins, destinations, mode='driving'):
        """Get distance matrix from Google Maps API.

        Args:
            origins: Origins passed through to ``distance_matrix``.
            destinations: Destinations passed through to ``distance_matrix``.
            mode: Travel mode passed through to the client (default ``'driving'``).

        Returns:
            The client's response, or ``None`` if the call raised. The request
            always uses metric units and avoids ferries.
        """
        try:
            result = self.gmaps.distance_matrix(
                origins=origins,
                destinations=destinations,
                mode=mode,
                units='metric',
                avoid='ferries'
            )
            return result
        except Exception as e:
            # Best effort: callers treat None as "no API result" and fall back.
            print(f"API Error: {e}")
            return None

    def extract_distance_time(self, matrix_result, origin_idx=0, dest_idx=0):
        """Extract distance and duration from API response.

        Args:
            matrix_result: Response with ``rows[i]['elements'][j]`` entries.
            origin_idx: Row index of the origin (default 0).
            dest_idx: Element index of the destination (default 0).

        Returns:
            tuple: ``(distance_km, duration_hours)``. ``(np.nan, np.nan)`` is
            returned when the element status is not ``'OK'`` or the response
            does not have the expected structure.
        """
        try:
            element = matrix_result['rows'][origin_idx]['elements'][dest_idx]

            if element['status'] == 'OK':
                distance_km = element['distance']['value'] / 1000  # Convert to km
                duration_hours = element['duration']['value'] / 3600  # Convert to hours
                return distance_km, duration_hours
            else:
                return np.nan, np.nan
        except (KeyError, IndexError, TypeError):
            # Malformed or missing response. Only lookup errors are caught so
            # KeyboardInterrupt/SystemExit still propagate.
            return np.nan, np.nan

In [15]:
def extract_distance_time(self, matrix_result, origin_idx=0, dest_idx=0):
    """Extract distance and duration from API response.

    Module-level function with the same signature and logic as
    ``GoogleMapsIntegration.extract_distance_time``; ``self`` is accepted but
    never used.

    Args:
        self: Unused placeholder kept for signature compatibility.
        matrix_result: Response with ``rows[i]['elements'][j]`` entries.
        origin_idx: Row index of the origin (default 0).
        dest_idx: Element index of the destination (default 0).

    Returns:
        tuple: ``(distance_km, duration_hours)``. ``(np.nan, np.nan)`` is
        returned when the element status is not ``'OK'`` or the response does
        not have the expected structure.
    """
    try:
        element = matrix_result['rows'][origin_idx]['elements'][dest_idx]

        if element['status'] == 'OK':
            distance_km = element['distance']['value'] / 1000  # Convert to km
            duration_hours = element['duration']['value'] / 3600  # Convert to hours
            return distance_km, duration_hours
        else:
            return np.nan, np.nan
    except (KeyError, IndexError, TypeError):
        # Malformed or missing response. Only lookup errors are caught so
        # KeyboardInterrupt/SystemExit still propagate.
        return np.nan, np.nan

In [16]:
# Step 7: Create Distance Matrix for All Place Pairs
def create_distance_matrix(tourism_df, google_maps_api_key=None, average_speed_kmh=60):
    """Create a distance table for every ordered pair of distinct tourist places.

    Places are the rows of ``tourism_df`` that have all of ``place_name``,
    ``city``, ``latitude`` and ``longitude``, de-duplicated by ``place_name``.
    Each ordered pair (A -> B and B -> A) gets a haversine distance. When an
    API key is given, a Google Maps driving distance and time are also
    requested, one call per pair, with a 0.1 s pause after each call.

    Args:
        tourism_df: DataFrame with ``place_name``, ``city``, ``latitude`` and
            ``longitude`` columns.
        google_maps_api_key: Optional API key; when falsy no API calls are made
            and the ``real_*``/``travel_time_hours`` columns stay NaN.
        average_speed_kmh: Speed used to turn the straight-line distance into
            ``estimated_travel_time_hours`` (default 60 km/h).

    Returns:
        DataFrame with one row per ordered pair. The output columns are always
        present, even when there are fewer than two places.

    Raises:
        KeyError: If ``tourism_df`` lacks one of the four required columns.
    """
    print("Creating distance matrix...")

    # Fixed output schema so an empty result still has the columns that
    # later steps read.
    output_columns = [
        'origin_place', 'origin_city', 'origin_lat', 'origin_lon',
        'destination_place', 'destination_city', 'destination_lat', 'destination_lon',
        'straight_line_distance_km', 'real_distance_km', 'travel_time_hours',
        'estimated_travel_time_hours',
    ]

    # Get unique places with coordinates
    places = tourism_df[['place_name', 'city', 'latitude', 'longitude']].dropna()
    # NOTE(review): de-duplicating on place_name alone drops same-named places in
    # different cities; confirm whether (place_name, city) is the intended key.
    places = places.drop_duplicates(subset=['place_name'])
    place_records = places.to_dict('records')

    distance_matrix = []

    # Initialize Google Maps if API key provided
    gmaps_client = None
    if google_maps_api_key:
        gmaps_client = GoogleMapsIntegration(google_maps_api_key)

    # Create all ordered pairs. Self-pairs are skipped by position, not by index
    # label, so repeated index labels cannot hide valid pairs.
    for origin_pos, place1 in enumerate(place_records):
        for dest_pos, place2 in enumerate(place_records):
            if origin_pos == dest_pos:  # Don't calculate distance to itself
                continue

            # Calculate straight-line distance (always available)
            straight_distance = haversine_distance(
                place1['latitude'], place1['longitude'],
                place2['latitude'], place2['longitude']
            )
            has_straight_distance = not pd.isna(straight_distance)

            # Try to get real-world distance from Google Maps
            real_distance = np.nan
            real_time = np.nan

            if gmaps_client and has_straight_distance:
                origin = f"{place1['latitude']},{place1['longitude']}"
                destination = f"{place2['latitude']},{place2['longitude']}"

                matrix_result = gmaps_client.get_distance_matrix([origin], [destination])
                if matrix_result:
                    real_distance, real_time = gmaps_client.extract_distance_time(matrix_result)

                # Add delay to respect API rate limits
                time.sleep(0.1)

            distance_matrix.append({
                'origin_place': place1['place_name'],
                'origin_city': place1['city'],
                'origin_lat': place1['latitude'],
                'origin_lon': place1['longitude'],
                'destination_place': place2['place_name'],
                'destination_city': place2['city'],
                'destination_lat': place2['latitude'],
                'destination_lon': place2['longitude'],
                'straight_line_distance_km': straight_distance,
                'real_distance_km': real_distance,
                'travel_time_hours': real_time,
                'estimated_travel_time_hours': (
                    straight_distance / average_speed_kmh if has_straight_distance else np.nan
                ),
            })

    distance_df = pd.DataFrame(distance_matrix, columns=output_columns)
    print(f"Distance matrix created: {distance_df.shape}")
    return distance_df


In [17]:
# Step 8: Add Cost Estimation
def add_cost_estimation(distance_df):
    """Add travel cost estimates per transport mode to the distance table.

    Costs are based on an effective distance: the real (API) distance when
    available, otherwise the straight-line distance increased by 30% to
    approximate road routing. Costs are therefore still produced when no real
    distances were fetched.

    Args:
        distance_df: DataFrame with ``real_distance_km`` and
            ``straight_line_distance_km`` columns. It is modified in place.

    Returns:
        The same ``distance_df`` with ``car_travel_cost``,
        ``train_travel_cost``, ``bus_travel_cost``, ``flight_travel_cost`` and
        ``effective_distance`` columns added.

    Raises:
        KeyError: If either distance column is missing.
    """
    print("Adding cost estimations...")

    # Cost per km for different transport modes
    cost_rates = {
        'car_fuel_per_km': 8,      # ₹8 per km (fuel + wear)
        'train_per_km': 2,         # ₹2 per km
        'bus_per_km': 3,           # ₹3 per km
        'flight_base': 3000,       # ₹3000 base + distance factor
        'flight_per_km': 4         # Additional ₹4 per km
    }
    road_detour_factor = 1.3  # Add 30% to straight-line distance for road routes

    # Work out the effective distance first: the costs must use it, otherwise
    # every cost is NaN whenever the real distance is missing.
    effective_distance = distance_df['real_distance_km'].fillna(
        distance_df['straight_line_distance_km'] * road_detour_factor
    )

    distance_df['car_travel_cost'] = effective_distance * cost_rates['car_fuel_per_km']
    distance_df['train_travel_cost'] = effective_distance * cost_rates['train_per_km']
    distance_df['bus_travel_cost'] = effective_distance * cost_rates['bus_per_km']
    distance_df['flight_travel_cost'] = (cost_rates['flight_base'] +
                                        effective_distance * cost_rates['flight_per_km'])

    # Stored last so the column order matches the earlier layout.
    distance_df['effective_distance'] = effective_distance

    return distance_df

In [27]:
# Step 9: Main Pipeline Function
def create_final_tourism_dataset(google_maps_api_key=None):
    """Run the whole pipeline and return the places table and the distance table.

    Stages, in order: load the four CSVs, standardize city names, merge the two
    tourism datasets, attach city coordinates, build the pairwise distance
    matrix, and add cost estimates.

    Args:
        google_maps_api_key: Optional key forwarded to ``create_distance_matrix``.

    Returns:
        tuple: ``(final_tourism, final_distance_matrix)``.
    """
    print("=== Starting Tourism Dataset Creation Pipeline ===")

    # Load, then clean. The routes frame is loaded and standardized here but is
    # not consumed by any later stage of this function.
    guide_raw, famous_raw, routes_raw, cities_raw = load_datasets()
    guide_df, famous_df, routes_df, cities_df = standardize_datasets(
        guide_raw, famous_raw, routes_raw, cities_raw
    )

    # Places: merged tourism data enriched with city coordinates.
    places = add_city_coordinates(merge_tourism_data(guide_df, famous_df), cities_df)

    # Distances: pairwise matrix (optionally with Google Maps data) plus costs.
    distances = add_cost_estimation(create_distance_matrix(places, google_maps_api_key))

    print("=== Pipeline Complete! ===")
    print(f"Final tourism places dataset: {places.shape}")
    print(f"Final distance matrix dataset: {distances.shape}")

    return places, distances
