In [13]:
import csv
import pandas as pd
from pandas_profiling import ProfileReport
from collections import defaultdict
import re

In [6]:
# Normalise hotel_info.csv into hotel_info_2.csv:
#   * in the score columns listed below, every ',' is replaced with '.';
#   * Hotel_Rank is reduced to its first whitespace-separated token, or "0"
#     when the rank is "No information" or blank.
score_columns = (
    'Total_Score',
    'Location',
    'Cleanliness',
    'Service',
    'Facilities',
    'Value_for_money',
    'Comfort_and_room_quality',
)

# Both files are opened with the UTF-8-SIG encoding.
with open('hotel_info.csv', 'r', newline='', encoding='utf-8-sig') as input_file, \
        open('hotel_info_2.csv', 'w', newline='', encoding='utf-8-sig') as output_file:
    reader = csv.DictReader(input_file)
    fieldnames = reader.fieldnames
    writer = csv.DictWriter(output_file, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        for column in score_columns:
            # csv.DictReader fills cells missing from a short row with None;
            # treat those as empty instead of failing on None.replace(...).
            row[column] = (row[column] or '').replace(',', '.')

        hotel_rank = row['Hotel_Rank'] or ''
        rank_tokens = hotel_rank.split()
        if hotel_rank == "No information" or not rank_tokens:
            # A blank rank has no first token to extract, so it is mapped to
            # the same "0" placeholder as the explicit "No information" marker.
            row['Hotel_Rank'] = "0"
        else:
            row['Hotel_Rank'] = rank_tokens[0]

        writer.writerow(row)

In [14]:
def clean_text(text):
    """
    Normalise a text value by stripping surrounding whitespace and lowercasing it.

    Args:
        text: The string to clean. ``None`` is also accepted, because
            ``csv.DictReader`` yields ``None`` for cells missing from a
            short row.

    Returns:
        The stripped, lower-cased string, or ``''`` when ``text`` is ``None``.
    """
    if text is None:
        return ''
    return text.strip().lower()

# De-duplicate hotel_info_2.csv into hotel_info_3.csv: every cell is passed
# through clean_text, and only the first row seen for each
# (Hotel_Name, Hotel_Address) pair is written out.
with open('hotel_info_2.csv', 'r', newline='', encoding='utf-8-sig') as input_file, \
        open('hotel_info_3.csv', 'w', newline='', encoding='utf-8-sig') as output_file:
    reader = csv.DictReader(input_file)

    # Maps (name, address) -> list holding the row that was written for it.
    unique_hotels = defaultdict(list)

    fieldnames = reader.fieldnames
    writer = csv.DictWriter(output_file, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        # Clean before building the key so the comparison is made on the
        # cleaned name and address.
        row = {field: clean_text(value) for field, value in row.items()}
        hotel_key = (row['Hotel_Name'], row['Hotel_Address'])

        # A key is only ever stored together with its first row, so its
        # presence alone means this row is a duplicate. The membership test
        # does not insert into the defaultdict.
        if hotel_key in unique_hotels:
            continue

        unique_hotels[hotel_key].append(row)
        writer.writerow(row)

In [15]:
# Load the de-duplicated CSV. It was written with the UTF-8-SIG encoding, so
# it is read back with the same encoding instead of relying on the parser to
# strip the leading BOM.
df = pd.read_csv('hotel_info_3.csv', encoding='utf-8-sig')

# Build the Pandas Profiling report for the loaded frame
profile = ProfileReport(df, title='Hotel Information Report')

# Render the report inline in the notebook
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]