<a href="https://colab.research.google.com/github/dharalakshmi/Tourism-Routes/blob/main/API%20based%20data%20collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install requests beautifulsoup4 selenium pandas openpyxl


Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.35.0-py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post

In [2]:
pip install googlemaps


Collecting googlemaps
  Downloading googlemaps-4.10.0.tar.gz (33 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: googlemaps
  Building wheel for googlemaps (setup.py) ... [?25l[?25hdone
  Created wheel for googlemaps: filename=googlemaps-4.10.0-py3-none-any.whl size=40714 sha256=b7027e904776bcc46980b47b38b5ac1479a42719dad717983b30f423df00d522
  Stored in directory: /root/.cache/pip/wheels/4c/6a/a7/bbc6f5c200032025ee655deb5e163ce8594fa05e67d973aad6
Successfully built googlemaps
Installing collected packages: googlemaps
Successfully installed googlemaps-4.10.0


In [10]:
import requests
import pandas as pd
import time
import random
from itertools import combinations

class RealTravelDataCollector:
    """
    Collects REAL travel data using free APIs
    Sources:
    1. OSRM (Open Source Routing Machine) - Free distance/duration
    2. Nominatim (OpenStreetMap) - Free geocoding
    3. OpenWeatherMap - Free location data (with free tier)
    """

    def __init__(self):
        self.data = []
        self.osrm_base = "http://router.project-osrm.org"
        self.nominatim_base = "https://nominatim.openstreetmap.org"

        # Add delays to respect API rate limits
        self.request_delay = 1  # seconds between requests

    def get_indian_cities(self):
        """Extended list of Indian cities for route generation"""
        return [
            "Delhi", "Mumbai", "Bangalore", "Chennai", "Kolkata", "Hyderabad",
            "Pune", "Ahmedabad", "Jaipur", "Surat", "Lucknow", "Kanpur",
            "Nagpur", "Indore", "Bhopal", "Visakhapatnam", "Patna", "Vadodara",
            "Ghaziabad", "Ludhiana", "Agra", "Nashik", "Meerut", "Rajkot",
            "Varanasi", "Srinagar", "Aurangabad", "Amritsar", "Ranchi", "Coimbatore",
            "Jabalpur", "Gwalior", "Vijayawada", "Jodhpur", "Madurai", "Raipur",
            "Kota", "Chandigarh", "Guwahati", "Solapur", "Hubli", "Mysore",
            "Thiruvananthapuram", "Bareilly", "Moradabad", "Mangalore", "Tiruchirappalli",
            "Salem", "Tiruppur", "Guntur", "Kurnool", "Nellore", "Rajahmundry",
            "Tirupati", "Shimla", "Dehradun", "Haridwar", "Rishikesh", "Manali",
            "Dharamshala", "Jammu", "Leh", "Goa", "Panaji", "Margao",
            "Kochi", "Kozhikode", "Thrissur", "Kollam", "Alappuzha", "Kannur",
            "Udaipur", "Ajmer", "Bikaner", "Pushkar", "Mount Abu", "Jaisalmer"
        ]

    def geocode_city(self, city_name):
        """Get coordinates for a city using Nominatim API"""
        try:
            url = f"{self.nominatim_base}/search"
            params = {
                'q': f"{city_name}, India",
                'format': 'json',
                'limit': 1,
                'addressdetails': 1
            }

            headers = {'User-Agent': 'TravelDataCollector/1.0'}
            response = requests.get(url, params=params, headers=headers)

            if response.status_code == 200:
                data = response.json()
                if data:
                    location = data[0]
                    lat = float(location['lat'])
                    lon = float(location['lon'])

                    # Extract state from address
                    address = location.get('address', {})
                    state = address.get('state', 'Unknown')

                    return lat, lon, state

            time.sleep(self.request_delay)  # Rate limiting
            return None, None, 'Unknown'

        except Exception as e:
            print(f"Error geocoding {city_name}: {e}")
            return None, None, 'Unknown'

    def get_route_data(self, origin_coords, dest_coords):
        """Get route distance and duration using OSRM API"""
        try:
            if not all(origin_coords + dest_coords):
                return None, None

            origin_lat, origin_lon = origin_coords
            dest_lat, dest_lon = dest_coords

            # OSRM route API
            url = f"{self.osrm_base}/route/v1/driving/{origin_lon},{origin_lat};{dest_lon},{dest_lat}"
            params = {
                'overview': 'false',
                'geometries': 'geojson'
            }

            response = requests.get(url, params=params)

            if response.status_code == 200:
                data = response.json()
                if data['routes']:
                    route = data['routes'][0]
                    distance = round(route['distance'] / 1000, 2)  # Convert to km
                    duration = round(route['duration'] / 3600, 2)  # Convert to hours
                    return distance, duration

            time.sleep(self.request_delay)  # Rate limiting
            return None, None

        except Exception as e:
            print(f"Error getting route data: {e}")
            return None, None

    def generate_ratings(self, distance, duration):
        """Generate realistic ratings based on route characteristics"""
        base_rating = 3.8

        # Better ratings for medium distance routes
        if 100 <= distance <= 600:
            base_rating += 0.4
        elif distance > 1000:
            base_rating -= 0.2

        # Factor in duration efficiency
        if duration and distance:
            speed = distance / duration
            if 40 <= speed <= 80:  # Good highway speed
                base_rating += 0.3

        # Add randomness
        rating = base_rating + random.uniform(-0.5, 0.7)
        return round(min(5.0, max(1.0, rating)), 1)

    def determine_route_type(self, distance, duration):
        """Determine route type based on distance and speed"""
        if not distance or not duration:
            return "Local Route"

        avg_speed = distance / duration if duration > 0 else 0

        if distance < 50:
            return random.choice(["City Route", "Local", "Metro"])
        elif distance < 200:
            if avg_speed > 60:
                return "Express Highway"
            else:
                return random.choice(["State Highway", "Regional"])
        elif distance < 500:
            if avg_speed > 70:
                return "National Highway"
            else:
                return random.choice(["State Highway", "Tourist Route"])
        else:
            return random.choice(["National Highway", "Interstate", "Long Distance"])

    def get_best_time_to_visit(self, dest_state):
        """Get best time to visit based on destination state"""
        seasonal_preferences = {
            "Rajasthan": "October-March",
            "Jammu and Kashmir": "April-September",
            "Himachal Pradesh": "March-June, September-November",
            "Kerala": "September-March",
            "Goa": "November-February",
            "Tamil Nadu": "November-March",
            "Karnataka": "October-March",
            "Maharashtra": "November-February",
            "Gujarat": "November-February",
            "Assam": "October-April",
            "West Bengal": "October-March",
            "Uttar Pradesh": "October-March",
            "Punjab": "October-April"
        }

        for state_key in seasonal_preferences:
            if state_key.lower() in dest_state.lower():
                return seasonal_preferences[state_key]

        return random.choice([
            "October-March", "November-February", "All Year", "October-April"
        ])

    def calculate_ideal_duration(self, distance, actual_duration):
        """Calculate ideal trip duration including sightseeing"""
        if not distance:
            return "1 day"

        # Base travel time plus sightseeing
        base_days = max(1, int(distance / 300))  # Assuming comfortable 300km per day

        if distance < 100:
            return random.choice(["Half day", "1 day"])
        elif distance < 300:
            return f"{base_days} day{'s' if base_days > 1 else ''}"
        elif distance < 600:
            sightseeing_days = base_days + 1
            return f"{sightseeing_days} days"
        else:
            sightseeing_days = base_days + random.randint(1, 2)
            return f"{sightseeing_days} days"

    def collect_real_travel_data(self, target_rows=5000):
        """Collect real travel data using APIs"""
        cities = self.get_indian_cities()
        print(f"Starting collection of {target_rows} real travel routes...")
        print(f"Using {len(cities)} Indian cities as sources")

        # First, geocode all cities
        print("Step 1: Geocoding all cities...")
        city_data = {}
        for i, city in enumerate(cities):
            print(f"Geocoding {i+1}/{len(cities)}: {city}")
            lat, lon, state = self.geocode_city(city)
            if lat and lon:
                city_data[city] = {'lat': lat, 'lon': lon, 'state': state}
            time.sleep(self.request_delay)

        print(f"Successfully geocoded {len(city_data)} cities")

        # Generate routes
        print("Step 2: Generating routes and collecting data...")
        valid_cities = list(city_data.keys())
        count = 0

        while count < target_rows and len(valid_cities) >= 2:
            # Select random origin and destination
            origin_city = random.choice(valid_cities)
            dest_city = random.choice(valid_cities)

            if origin_city == dest_city:
                continue

            origin_data = city_data[origin_city]
            dest_data = city_data[dest_city]

            # Get route data from OSRM
            distance, duration = self.get_route_data(
                (origin_data['lat'], origin_data['lon']),
                (dest_data['lat'], dest_data['lon'])
            )

            if distance and duration and distance > 5:  # Valid route with minimum distance
                route_type = self.determine_route_type(distance, duration)
                rating = self.generate_ratings(distance, duration)

                route_data = {
                    'Origin': origin_city,
                    'Destination': dest_city,
                    'Distance': distance,
                    'Origin_Lat': origin_data['lat'],
                    'Origin_Long': origin_data['lon'],
                    'Origin_State': origin_data['state'],
                    'Dest_Lat': dest_data['lat'],
                    'Dest_Long': dest_data['lon'],
                    'Dest_State': dest_data['state'],
                    'Ratings': rating,
                    'Ideal_duration': self.calculate_ideal_duration(distance, duration),
                    'Name': f"{origin_city} to {dest_city} via {route_type}",
                    'Type': route_type,
                    'Best Time to visit': self.get_best_time_to_visit(dest_data['state'])
                }

                self.data.append(route_data)
                count += 1

                if count % 50 == 0:
                    print(f"Collected {count}/{target_rows} routes...")

            # Rate limiting
            time.sleep(self.request_delay)

        return self.data

    def save_to_excel(self, filename="real_travel_data_5000.xlsx"):
        """Save the collected data to Excel"""
        df = pd.DataFrame(self.data)

        # Remove any potential duplicates
        df = df.drop_duplicates(subset=['Origin', 'Destination'])

        # Save to Excel
        df.to_excel(filename, index=False)

        print(f"\n" + "="*60)
        print(f"SUCCESS! Real travel data saved to {filename}")
        print(f"Total routes collected: {len(df)}")
        print(f"Data sources used:")
        print(f"  - OSRM API (Open Source Routing Machine) for distances")
        print(f"  - Nominatim API (OpenStreetMap) for geocoding")
        print(f"  - Generated realistic ratings and travel info")
        print("="*60)

        print(f"\nDataset Summary:")
        print(f"Shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        print(f"\nFirst 5 rows:")
        print(df.head())

        return df

# Usage
if __name__ == "__main__":
    # Interactive driver: show what will happen, ask for confirmation, then run
    # the collection and write the results to Excel.
    collector = RealTravelDataCollector()

    print("\n".join([
        "REAL TRAVEL DATA COLLECTION",
        "This will use actual APIs to get real distance and location data",
        "Estimated time: 2-3 hours for 5000 routes (due to API rate limits)",
        "\nAPIs used:",
        "1. OSRM (router.project-osrm.org) - FREE routing",
        "2. Nominatim (OpenStreetMap) - FREE geocoding",
    ]))

    proceed = input("\nDo you want to proceed? (y/n): ").lower().strip()

    if proceed != 'y':
        print("Collection cancelled.")
    else:
        try:
            data = collector.collect_real_travel_data(5000)
            df = collector.save_to_excel()

        except KeyboardInterrupt:
            # Manual interrupt: keep whatever was gathered under a separate name.
            print(f"\nCollection stopped. Saving {len(collector.data)} routes collected so far...")
            if collector.data:
                df = collector.save_to_excel(f"partial_travel_data_{len(collector.data)}.xlsx")
        except Exception as err:
            print(f"Error: {err}")

REAL TRAVEL DATA COLLECTION
This will use actual APIs to get real distance and location data
Estimated time: 2-3 hours for 5000 routes (due to API rate limits)

APIs used:
1. OSRM (router.project-osrm.org) - FREE routing
2. Nominatim (OpenStreetMap) - FREE geocoding
Starting collection of 5000 real travel routes...
Using 77 Indian cities as sources
Step 1: Geocoding all cities...
Geocoding 1/77: Delhi
Geocoding 2/77: Mumbai
Geocoding 3/77: Bangalore
Geocoding 4/77: Chennai
Geocoding 5/77: Kolkata
Geocoding 6/77: Hyderabad
Geocoding 7/77: Pune
Geocoding 8/77: Ahmedabad
Geocoding 9/77: Jaipur
Geocoding 10/77: Surat
Geocoding 11/77: Lucknow
Geocoding 12/77: Kanpur
Geocoding 13/77: Nagpur
Geocoding 14/77: Indore
Geocoding 15/77: Bhopal
Geocoding 16/77: Visakhapatnam
Geocoding 17/77: Patna
Geocoding 18/77: Vadodara
Geocoding 19/77: Ghaziabad
Geocoding 20/77: Ludhiana
Geocoding 21/77: Agra
Geocoding 22/77: Nashik
Geocoding 23/77: Meerut
Geocoding 24/77: Rajkot
Geocoding 25/77: Varanasi
Geoco

In [11]:
import os

from google.colab import files

# Download the generated Excel file.
# An interrupted collection run saves under partial_travel_data_<n>.xlsx instead,
# so check for the default file first rather than failing with a traceback.
if os.path.exists("real_travel_data_5000.xlsx"):
    files.download("real_travel_data_5000.xlsx")
else:
    print("File 'real_travel_data_5000.xlsx' not found - run the collection cell first "
          "or look for a partial_travel_data_*.xlsx file")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
# Excel Download and Data Management
import pandas as pd
from google.colab import files
import os
from datetime import datetime

class ExcelDownloadManager:
    """
    Build Excel workbooks from a travel-routes DataFrame and download them in Colab.

    The DataFrame is expected to carry the columns read by the methods below:
    Origin, Destination, Distance, Ratings, Origin_State, Dest_State, Type,
    Origin_Lat and Origin_Long.
    """

    def __init__(self, df=None):
        """Optionally remember a DataFrame to use when a method is called without one."""
        self.df = df

    def _require_df(self, df):
        """Return ``df``, falling back to the DataFrame given to the constructor."""
        if df is None:
            df = self.df
        if df is None:
            raise ValueError("No DataFrame provided and none was passed to the constructor")
        return df

    @staticmethod
    def _resolve_filename(filename, prefix):
        """Return ``filename`` (or a timestamped ``prefix`` name) ending in .xlsx."""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{prefix}_{timestamp}.xlsx"

        # Ensure .xlsx extension (case-insensitive so "X.XLSX" is left alone)
        if not filename.lower().endswith('.xlsx'):
            filename += '.xlsx'
        return filename

    def _write_core_sheets(self, df, writer):
        """Write the three sheets shared by both export methods."""
        # Main data sheet
        df.to_excel(writer, sheet_name='Travel_Routes', index=False)

        # Summary statistics sheet (indexed by metric name)
        self.create_summary_stats(df).to_excel(writer, sheet_name='Summary_Stats', index=True)

        # State-wise analysis sheet
        self.create_state_analysis(df).to_excel(writer, sheet_name='State_Analysis', index=False)

    @staticmethod
    def _distance_distribution(df):
        """Count routes in 10 equal-width distance bins."""
        distance_bins = pd.cut(df['Distance'], bins=10).value_counts().reset_index()
        distance_bins.columns = ['Distance_Range', 'Count']
        return distance_bins

    @staticmethod
    def _rating_distribution(df):
        """Count routes per rating value, sorted by ascending count."""
        rating_dist = df['Ratings'].value_counts().reset_index()
        # Name the columns before sorting: the column names produced by
        # value_counts().reset_index() differ between pandas 1.x and 2.x.
        rating_dist.columns = ['Rating', 'Count']
        return rating_dist.sort_values('Count')

    @staticmethod
    def _route_type_analysis(df):
        """Aggregate distance and rating statistics per route type."""
        return df.groupby('Type').agg({
            'Distance': ['count', 'mean', 'min', 'max'],
            'Ratings': 'mean'
        }).round(2)

    def save_and_download(self, df=None, filename=None):
        """
        Save a three-sheet workbook (data, summary, state analysis) and download it.

        Args:
            df: DataFrame to export; defaults to the one given to the constructor.
            filename: Target file name; a timestamped name is generated when
                omitted, and ".xlsx" is appended when missing.

        Returns:
            The file name that was written.

        Raises:
            ValueError: If no DataFrame is available.
        """
        df = self._require_df(df)
        filename = self._resolve_filename(filename, "travel_routes_data")

        print(f"Preparing to save and download: {filename}")

        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            self._write_core_sheets(df, writer)

        print(f"✅ File saved successfully: {filename}")
        print(f"📊 Dataset shape: {df.shape}")
        print(f"📁 File size: {os.path.getsize(filename) / 1024:.1f} KB")

        # Download file in Google Colab; the saved file stays on disk if this fails.
        try:
            files.download(filename)
            print(f"⬇️ Download initiated for: {filename}")
        except Exception as e:
            print(f"❌ Download failed: {e}")
            print("💡 You can manually download the file from the file browser on the left")

        return filename

    def create_summary_stats(self, df):
        """
        Build a one-column table of dataset-level metrics.

        Returns:
            DataFrame indexed by 'Metric' with a single 'Value' column.
        """
        # mode() is empty for an empty frame; avoid an IndexError in that case.
        type_mode = df['Type'].mode()
        most_common_type = type_mode.iloc[0] if not type_mode.empty else 'N/A'

        summary = pd.DataFrame({
            'Metric': [
                'Total Routes',
                'Unique Origins',
                'Unique Destinations',
                'Average Distance (km)',
                'Max Distance (km)',
                'Min Distance (km)',
                'Average Rating',
                'Max Rating',
                'Min Rating',
                'States Covered (Origin)',
                'States Covered (Destination)',
                'Most Common Route Type',
                'Average Latitude Range',
                'Average Longitude Range'
            ],
            'Value': [
                len(df),
                df['Origin'].nunique(),
                df['Destination'].nunique(),
                round(df['Distance'].mean(), 2),
                df['Distance'].max(),
                df['Distance'].min(),
                round(df['Ratings'].mean(), 2),
                df['Ratings'].max(),
                df['Ratings'].min(),
                df['Origin_State'].nunique(),
                df['Dest_State'].nunique(),
                most_common_type,
                # The two "range" metrics are max - min of the origin coordinates.
                round((df['Origin_Lat'].max() - df['Origin_Lat'].min()), 2),
                round((df['Origin_Long'].max() - df['Origin_Long'].min()), 2)
            ]
        })

        return summary.set_index('Metric')

    def create_state_analysis(self, df):
        """
        Count routes per state as origin and as destination.

        Returns:
            DataFrame with columns State, Origin_Routes, Destination_Routes and
            Total_Routes (integer counts), sorted by Total_Routes descending.
        """
        origin_counts = df['Origin_State'].value_counts().reset_index()
        origin_counts.columns = ['State', 'Origin_Routes']

        dest_counts = df['Dest_State'].value_counts().reset_index()
        dest_counts.columns = ['State', 'Destination_Routes']

        # Merge origin and destination counts
        state_analysis = pd.merge(origin_counts, dest_counts, on='State', how='outer')

        # The outer merge introduces NaN (and therefore floats) for states seen on
        # only one side; restore integer counts.
        count_cols = ['Origin_Routes', 'Destination_Routes']
        state_analysis[count_cols] = state_analysis[count_cols].fillna(0).astype(int)

        state_analysis['Total_Routes'] = state_analysis['Origin_Routes'] + state_analysis['Destination_Routes']
        state_analysis = state_analysis.sort_values('Total_Routes', ascending=False)

        return state_analysis

    def create_enhanced_excel(self, df=None, filename=None):
        """
        Save a six-sheet workbook and download it.

        Sheets: Travel_Routes, Summary_Stats, State_Analysis,
        Distance_Distribution, Rating_Distribution and Route_Type_Analysis.

        Args:
            df: DataFrame to export; defaults to the one given to the constructor.
            filename: Target file name; a timestamped name is generated when
                omitted, and ".xlsx" is appended when missing.

        Returns:
            The file name that was written.

        Raises:
            ValueError: If no DataFrame is available.
        """
        df = self._require_df(df)
        filename = self._resolve_filename(filename, "enhanced_travel_data")

        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            self._write_core_sheets(df, writer)

            # Distance distribution
            self._distance_distribution(df).to_excel(
                writer, sheet_name='Distance_Distribution', index=False)

            # Rating distribution
            self._rating_distribution(df).to_excel(
                writer, sheet_name='Rating_Distribution', index=False)

            # Route type analysis
            self._route_type_analysis(df).to_excel(writer, sheet_name='Route_Type_Analysis')

        print(f"✅ Enhanced Excel file created: {filename}")

        # Download; the saved file stays on disk if this fails.
        try:
            files.download(filename)
            print(f"⬇️ Enhanced file downloaded: {filename}")
        except Exception as e:
            print(f"❌ Download failed: {e}")

        return filename

# Usage functions for your existing data
def download_current_data(source_file='real_travel_data_5000.xlsx'):
    """
    Load a previously saved routes workbook and download an enhanced copy.

    Args:
        source_file: Excel file to read; defaults to the collector's default
            output name.

    Returns:
        The loaded DataFrame, or None when ``source_file`` does not exist (in
        which case the .xlsx files in the current directory are listed).
    """
    if not os.path.exists(source_file):
        print(f"❌ File '{source_file}' not found")
        print("Available files:")
        for file in os.listdir('.'):
            if file.endswith('.xlsx'):
                print(f"  - {file}")
        return None

    print(f"Found existing file: {source_file}")

    # Read the data
    df = pd.read_excel(source_file)
    print(f"Loaded {len(df)} rows of data")

    # Create enhanced version and download
    manager = ExcelDownloadManager()
    manager.create_enhanced_excel(df, "my_travel_analysis_data.xlsx")

    return df

def create_custom_download(df, custom_filename="custom_travel_data.xlsx"):
    """
    Export ``df`` as an enhanced workbook under ``custom_filename`` and download it.

    Prints a short description of the workbook contents first.

    Returns:
        The file name returned by ``ExcelDownloadManager.create_enhanced_excel``.
    """
    manager = ExcelDownloadManager()

    print("Creating custom Excel file with:")
    # Report the actual column count instead of assuming the 14-column dataset.
    print(f"✅ All your {len(df.columns)} columns")
    print("✅ Summary statistics")
    print("✅ State-wise analysis")
    print("✅ Distance and rating distributions")
    print("✅ Route type analysis")

    return manager.create_enhanced_excel(df, custom_filename)

def quick_download():
    """Download the .xlsx file in the current directory with the latest ctime.

    Prints a message instead of raising when no Excel file exists or when the
    download fails.
    """
    try:
        # Gather every Excel workbook in the working directory.
        candidates = []
        for entry in os.listdir('.'):
            if entry.endswith('.xlsx'):
                candidates.append(entry)

        if not candidates:
            print("❌ No Excel files found in current directory")
            return

        newest = max(candidates, key=os.path.getctime)
        print(f"Found latest Excel file: {newest}")

        # Hand the file to Colab's download helper.
        files.download(newest)
        print(f"⬇️ Downloaded: {newest}")

    except Exception as err:
        print(f"❌ Quick download failed: {err}")

# Main execution
if __name__ == "__main__":
    # Driver: re-export the collected data as an enhanced workbook, print an
    # overview, then produce a second copy under a custom name.
    print("🚀 EXCEL DOWNLOAD MANAGER")
    print("=" * 50)

    # Option 1: Download existing file
    print("\n1️⃣ Downloading your collected travel data...")
    df = download_current_data()

    if df is not None:
        print(f"\n📊 Your dataset overview:")
        print(f"   • Total routes: {len(df)}")
        print(f"   • Columns: {len(df.columns)}")
        print(f"   • Distance range: {df['Distance'].min():.1f} - {df['Distance'].max():.1f} km")
        print(f"   • Rating range: {df['Ratings'].min()} - {df['Ratings'].max()}")
        print(f"   • States covered: {df['Dest_State'].nunique()}")

        # Option 2: Create additional custom download
        print(f"\n2️⃣ Creating additional custom download...")
        custom_file = create_custom_download(df, "final_travel_dataset_for_analysis.xlsx")

        print(f"\n✅ All done! Check your downloads folder.")
    else:
        # Do not claim success when the source file was missing.
        print("\n⚠️ Nothing was downloaded. Run the data collection cell first.")

🚀 EXCEL DOWNLOAD MANAGER

1️⃣ Downloading your collected travel data...
Found existing file: real_travel_data_5000.xlsx
Loaded 3389 rows of data
✅ Enhanced Excel file created: my_travel_analysis_data.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

⬇️ Enhanced file downloaded: my_travel_analysis_data.xlsx

📊 Your dataset overview:
   • Total routes: 3389
   • Columns: 14
   • Distance range: 13.4 - 3620.7 km
   • Rating range: 3.1 - 5.0
   • States covered: 23

2️⃣ Creating additional custom download...
Creating custom Excel file with:
✅ All your 14 columns
✅ Summary statistics
✅ State-wise analysis
✅ Distance and rating distributions
✅ Route type analysis
✅ Enhanced Excel file created: final_travel_dataset_for_analysis.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

⬇️ Enhanced file downloaded: final_travel_dataset_for_analysis.xlsx

✅ All done! Check your downloads folder.
