In [None]:
import pandas as pd
import numpy as np

# Upload your files:
# Go to the 'Files' tab in Colab and upload files, or use:
from google.colab import files
uploaded = files.upload()

# Load datasets
cities_db = pd.read_csv('Indian Cities Database.csv')           # City, Lat, Long, State, etc.
city_distances = pd.read_csv('indian-cities-dataset.csv')       # Origin, Destination, Distance
tourist_details = pd.read_csv('City.csv')                       # City, Ratings, Ideal_duration, etc.
top_places = pd.read_csv('Top Indian Places to Visit.csv')      # Zone, State, City, Name, Type, etc.



Saving Indian Cities Database.csv to Indian Cities Database (1).csv
Saving indian-cities-dataset.csv to indian-cities-dataset (1).csv
Saving City.csv to City (1).csv
Saving Top Indian Places to Visit.csv to Top Indian Places to Visit (1).csv


In [None]:
print(cities_db.head())
print(city_distances.head())
print(tourist_details.head())
print(top_places.head())



         City        Lat       Long country iso2          State
0      Abohar  30.144533  74.195520   India   IN         Punjab
1    Adilabad  19.400000  78.310000   India   IN      Telangana
2    Agartala  23.836049  91.279386   India   IN        Tripura
3        Agra  27.187935  78.003944   India   IN  Uttar Pradesh
4  Ahmadnagar  19.094571  74.738432   India   IN    Maharashtra
      Origin Destination  Distance
0       Agra       Delhi       240
1       Agra     Lucknow       334
2       Agra      Kanpur       277
3  Ahmedabad      Mumbai       526
4  Ahmedabad        Pune       663
          City  Ratings Ideal_duration  Best_time_to_visit  \
0       Manali      4.5            2-4        October-June   
1   Leh Ladakh      4.6            5-7         JulyOctober   
2        Coorg      4.2            2-3      September-June   
3      Andaman      4.5            4-6       October-March   
4  Lakshadweep      4.0            4-6  September-February   

                                 

In [None]:
# Remove duplicates
cities_db.drop_duplicates(inplace=True)
city_distances.drop_duplicates(inplace=True)
tourist_details.drop_duplicates(inplace=True)
top_places.drop_duplicates(inplace=True)

# Standardize city names for consistent merging
def clean_city(x):
    return str(x).strip().title()

cities_db['City'] = cities_db['City'].apply(clean_city)
city_distances['Origin'] = city_distances['Origin'].apply(clean_city)
city_distances['Destination'] = city_distances['Destination'].apply(clean_city)
tourist_details['City'] = tourist_details['City'].apply(clean_city)
top_places['City'] = top_places['City'].apply(clean_city)


In [None]:
print(cities_db[['City']].head(10))
print(city_distances[['Origin', 'Destination']].head(10))
print(tourist_details[['City']].head(10))
print(top_places[['City']].head(10))


         City
0      Abohar
1    Adilabad
2    Agartala
3        Agra
4  Ahmadnagar
5   Ahmedabad
6      Aizawl
7       Ajmer
8       Akola
9   Alappuzha
      Origin Destination
0       Agra       Delhi
1       Agra     Lucknow
2       Agra      Kanpur
3  Ahmedabad      Mumbai
4  Ahmedabad        Pune
5  Ahmedabad      Jaipur
6  Ahmedabad     Udaipur
7  Bengaluru        Pune
8  Bengaluru   Hyderabad
9  Bengaluru     Chennai
          City
0       Manali
1   Leh Ladakh
2        Coorg
3      Andaman
4  Lakshadweep
5          Goa
6      Udaipur
7     Srinagar
8      Gangtok
9       Munnar
    City
0  Delhi
1  Delhi
2  Delhi
3  Delhi
4  Delhi
5  Delhi
6  Delhi
7  Delhi
8  Delhi
9  Delhi


In [None]:
print(cities_db['City'].duplicated().sum())
print(city_distances.duplicated().sum())
print(tourist_details.duplicated().sum())
print(top_places.duplicated().sum())


1
0
0
0


In [None]:
print(cities_db['City'].unique()[:20])
print(city_distances['Origin'].unique()[:20])
print(city_distances['Destination'].unique()[:20])


['Abohar' 'Adilabad' 'Agartala' 'Agra' 'Ahmadnagar' 'Ahmedabad' 'Aizawl'
 'Ajmer' 'Akola' 'Alappuzha' 'Aligarh' 'Alipurduar' 'Allahabad' 'Alwar'
 'Ambala' 'Amaravati' 'Amritsar' 'Asansol' 'Aurangabad' 'Bakshpur']
['Agra' 'Ahmedabad' 'Bengaluru' 'Bhubaneswar' 'Chennai' 'Delhi' 'Goa'
 'Hyderabad' 'Jaipur' 'Kanpur' 'Kochi' 'Kolkata' 'Lucknow' 'Mumbai'
 'Patna' 'Pune' 'Thiruvananthapuram' 'Udaipur' 'Varanasi' 'Vishakhapatnam']
['Delhi' 'Lucknow' 'Kanpur' 'Mumbai' 'Pune' 'Jaipur' 'Udaipur' 'Hyderabad'
 'Chennai' 'Goa' 'Kolkata' 'Vishakhapatnam' 'Patna' 'Bengaluru' 'Kochi'
 'Agra' 'Thiruvananthapuram' 'Ahmedabad' 'Varanasi' 'Bhubaneswar']


In [None]:
print(cities_db['City'].isnull().sum())
print(city_distances['Origin'].isnull().sum())
print(city_distances['Destination'].isnull().sum())


0
0
0


In [None]:
# Merge origin city info
city_distances = city_distances.merge(
    cities_db[['City', 'Lat', 'Long', 'State']],
    left_on='Origin', right_on='City', how='left'
).rename(columns={'Lat':'Origin_Lat', 'Long':'Origin_Long', 'State':'Origin_State'})
city_distances.drop('City', axis=1, inplace=True)

# Merge destination city info
city_distances = city_distances.merge(
    cities_db[['City', 'Lat', 'Long', 'State']],
    left_on='Destination', right_on='City', how='left'
).rename(columns={'Lat':'Dest_Lat', 'Long':'Dest_Long', 'State':'Dest_State'})
city_distances.drop('City', axis=1, inplace=True)


In [None]:
# Merge tourist details for destinations
merged_df = city_distances.merge(
    tourist_details, left_on='Destination', right_on='City', how='left'
)
merged_df.drop('City', axis=1, inplace=True)  # Remove duplicate 'City' column if present


In [None]:
# Select top places columns
places_df = top_places[['City', 'Name', 'Type', 'Best Time to visit']]

# Merge top places info
merged_df = merged_df.merge(
    places_df, left_on='Destination', right_on='City', how='left'
)
merged_df.drop('City', axis=1, inplace=True)


In [None]:
merged_df.to_csv('merged_tourism_data.csv', index=False)


In [None]:
print(merged_df.head())


  Origin Destination  Distance  Origin_Lat  Origin_Long   Origin_State  \
0   Agra       Delhi       240   27.187935    78.003944  Uttar Pradesh   
1   Agra       Delhi       240   27.187935    78.003944  Uttar Pradesh   
2   Agra       Delhi       240   27.187935    78.003944  Uttar Pradesh   
3   Agra       Delhi       240   27.187935    78.003944  Uttar Pradesh   
4   Agra       Delhi       240   27.187935    78.003944  Uttar Pradesh   

    Dest_Lat  Dest_Long Dest_State  Ratings Ideal_duration Best_time_to_visit  \
0  28.651952  77.231495      Delhi      4.1            3-5       OctoberMarch   
1  28.651952  77.231495      Delhi      4.1            3-5       OctoberMarch   
2  28.651952  77.231495      Delhi      4.1            3-5       OctoberMarch   
3  28.651952  77.231495      Delhi      4.1            3-5       OctoberMarch   
4  28.651952  77.231495      Delhi      4.1            3-5       OctoberMarch   

                                           City_desc                

In [None]:
print(merged_df.shape)
print(merged_df.columns)


(506, 16)
Index(['Origin', 'Destination', 'Distance', 'Origin_Lat', 'Origin_Long',
       'Origin_State', 'Dest_Lat', 'Dest_Long', 'Dest_State', 'Ratings',
       'Ideal_duration', 'Best_time_to_visit', 'City_desc', 'Name', 'Type',
       'Best Time to visit'],
      dtype='object')


In [None]:
import pandas as pd
from google.colab import files

# Step 1: Upload your CSV file (do this only once)
uploaded = files.upload()  # This will prompt you to select a CSV file

# Step 2: Load the uploaded CSV file with correct encoding
merged_df = pd.read_csv('merged_tourism_data.csv', encoding='latin1')  # You can try 'ISO-8859-1' if this fails

# Step 3: Fill missing values in 'Ratings' column with the median
merged_df['Ratings'].fillna(merged_df['Ratings'].median(), inplace=True)

# Step 4: Save the cleaned DataFrame to a new CSV file
merged_df.to_csv('merged_tourism_data_cleaned.csv', index=False)

# Step 5: Download the cleaned CSV file
files.download('merged_tourism_data_cleaned.csv')


Saving merged_tourism_data.csv to merged_tourism_data (3).csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['Ratings'].fillna(merged_df['Ratings'].median(), inplace=True)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# For numeric column 'Ratings'
merged_df['Ratings'] = merged_df['Ratings'].fillna(merged_df['Ratings'].median())

# FOR 'Ideal_duration':
# If it is not already numeric, try to convert
merged_df['Ideal_duration'] = pd.to_numeric(merged_df['Ideal_duration'], errors='coerce')
merged_df['Ideal_duration'] = merged_df['Ideal_duration'].fillna(merged_df['Ideal_duration'].median())

# For categorical column 'Type'
merged_df['Type'] = merged_df['Type'].fillna(merged_df['Type'].mode()[0])

# For categorical column 'Best_time_to_visit'
merged_df['Best_time_to_visit'] = merged_df['Best_time_to_visit'].fillna(merged_df['Best_time_to_visit'].mode())


In [None]:
# 1. (Optional) Check for any remaining missing values
print(merged_df.isnull().sum())

# 2. Save cleaned data to new CSV
merged_df.to_csv('merged_tourism_data_cleaned.csv', index=False)

# 3. Download cleaned CSV file
from google.colab import files
files.download('merged_tourism_data_cleaned.csv')


Origin                  0
Destination             0
Distance                0
Origin_Lat              0
Origin_Long             0
Origin_State            0
Dest_Lat                0
Dest_Long               0
Dest_State              0
Ratings                 0
Ideal_duration        506
Best_time_to_visit    236
City_desc              27
Name                    1
Type                    0
Best Time to visit      1
dtype: int64


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>