In [8]:
import csv
import pandas as pd
from pandas_profiling import ProfileReport
from collections import defaultdict
import re
import langid

In [5]:
# Remove review rows whose hotel ID no longer appears in hotel_info_3.csv.
# Load the hotel master table and the raw review comments.
hotel_info = pd.read_csv('hotel_info_3.csv')
hotel_comments = pd.read_csv('hotel_comments.csv')

# Distinct hotel IDs present in hotel_info_3.csv.
valid_hotel_ids = hotel_info['Hotel_ID'].unique()

# Boolean mask: True for comments that point at one of those hotel IDs.
# Note the two files spell the column differently ('Hotel_ID' vs 'Hotel ID').
has_known_hotel = hotel_comments['Hotel ID'].isin(valid_hotel_ids)
hotel_comments = hotel_comments.loc[has_known_hotel]

# Save the filtered comments; utf-8-sig writes a BOM at the start of the file.
hotel_comments.to_csv('hotel_comments_2.csv', index=False, encoding='utf-8-sig')

In [6]:
# Paths for the de-duplication step: source CSV first, destination CSV second.
input_file, output_file = 'hotel_comments_2.csv', 'hotel_comments_3.csv'

def clean_text(text):
    """
    Normalize a piece of review text.

    Strips anything that looks like an HTML tag (``<...>`` with at least one
    character between the brackets), lowercases the remainder, and trims
    leading/trailing whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    tag_free = re.compile(r'<[^>]+>').sub('', text)  # drop HTML-like tags
    lowered = tag_free.lower()
    return lowered.strip()

# De-duplicate the review rows in `input_file` and write the result to `output_file`.
# Two rows are duplicates when they share Hotel ID, Reviewer Name and cleaned Body.
with open(input_file, 'r', newline='', encoding='utf-8-sig') as file:
    reader = csv.DictReader(file)
    # Take the header from the reader instead of rows[0], so an input with no
    # data rows still yields a header-only output rather than an IndexError.
    fieldnames = reader.fieldnames or []
    rows = list(reader)

# Group rows by their duplicate key; each value is the list of rows sharing that key,
# in file order.
unique_rows = defaultdict(list)

for row in rows:
    # The cleaned Body replaces the original in the row, so the cleaned
    # (tag-free, lowercased, trimmed) text is also what gets written out.
    row['Body'] = clean_text(row['Body'])
    key = (row['Hotel ID'], row['Reviewer Name'], row['Body'])
    unique_rows[key].append(row)

# Write the first row of every group; later duplicates are discarded.
with open(output_file, 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(row_list[0] for row_list in unique_rows.values())

In [7]:
# Remove rows whose "Body" column is blank.
# Input file name
input_file = 'hotel_comments_3.csv'

# Output file name
output_file = 'hotel_comments_4.csv'

# Read the CSV file and keep only rows whose "Body" has non-whitespace content
with open(input_file, 'r', newline='', encoding='utf-8-sig') as file:
    reader = csv.DictReader(file)
    # Take the header from the reader rather than rows[0]: if every row is
    # filtered out, rows[0] would raise IndexError and no output would be written.
    fieldnames = reader.fieldnames or []
    rows = [row for row in reader if row['Body'].strip()]

# Write the filtered data (header only when nothing survived) to a new CSV file
with open(output_file, 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

In [9]:
# Detect the language of each review in the "Body" column.

def _collapse_whitespace(text):
    """Replace every run of whitespace with a single space and trim both ends."""
    return ' '.join(text.split())


def _detect_language(text):
    """Return the first element of langid.classify(text) (the language code)."""
    return langid.classify(text)[0]


# Load the comments left after blank-body removal.
df = pd.read_csv('hotel_comments_4.csv')

# Force Body to str, then normalize its whitespace.
# NOTE(review): astype(str) turns a missing value into the literal 'nan'; read_csv's
# default NA parsing may produce such values for bodies like 'null' or 'n/a' — confirm.
body_as_text = df['Body'].astype(str)
df['Body'] = body_as_text.apply(_collapse_whitespace)

# One language code per row, stored in a new "Language" column.
df['Language'] = df['Body'].apply(_detect_language)

# Save the annotated comments.
df.to_csv('hotel_comments_5.csv', index=False, encoding='utf-8-sig')

In [10]:
# Keep only the rows whose "Language" is "vi" and switch the decimal separator
# in "Score" from a comma to a dot.
# The file handles are bound to `src` / `dst` rather than `input_file` /
# `output_file`, so the path strings stored under those names by earlier cells
# are not replaced by file objects.
with open('hotel_comments_5.csv', 'r', newline='', encoding='utf-8-sig') as src, \
        open('hotel_comments_6.csv', 'w', newline='', encoding='utf-8-sig') as dst:
    reader = csv.DictReader(src)
    # reader.fieldnames is None for a completely empty input; fall back to no columns.
    fieldnames = reader.fieldnames or []
    writer = csv.DictWriter(dst, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        # Drop the row if the "Language" column is not "vi"; checked first so
        # discarded rows are not modified needlessly.
        if row['Language'] != 'vi':
            continue

        # Change commas to dots in the "Score" column
        row['Score'] = row['Score'].replace(',', '.')

        # Write the row to the output file
        writer.writerow(row)