In [1]:
import pandas as pd

In [2]:
# Load the source CSV; the relative path is resolved against the current working directory.
df = pd.read_csv("hotels_users_ratings.csv")

In [3]:
# Show the column names of the freshly loaded frame as a plain Python list.
print(df.columns.tolist())

['URL Hotel', 'Location', 'HotelID', 'Name Hotel', 'Descriptions', 'Address', 'UserID', 'User', 'Rating']


# # Bỏ cột UserID, User và URL

In [4]:
# Remove the user identifier, user name and hotel URL columns.
df = df.drop(["UserID", "User", "URL Hotel"], axis="columns")

In [5]:
# List the columns that remain after the drop.
print(df.columns.tolist())

['Location', 'HotelID', 'Name Hotel', 'Descriptions', 'Address', 'Rating']


# # Kiểm tra dataset có thiếu dữ liệu không

In [6]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38801 entries, 0 to 38800
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Location      38801 non-null  object
 1   HotelID       38801 non-null  int64 
 2   Name Hotel    38801 non-null  object
 3   Descriptions  38801 non-null  object
 4   Address       38801 non-null  object
 5   Rating        38801 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.8+ MB


# # Kiểm tra có dòng nào bị NaN không

In [7]:
# Per-column count of missing (NaN/None) values.
print(df.isna().sum())

Location        0
HotelID         0
Name Hotel      0
Descriptions    0
Address         0
Rating          0
dtype: int64


# # Gộp theo HotelID

In [8]:
def _join_unique(descriptions):
    """Join the distinct values of ``descriptions`` with single spaces.

    Duplicates are removed while keeping first-seen order.
    """
    # dict.fromkeys de-duplicates and preserves insertion order. A set has no
    # stable iteration order for strings across interpreter runs, which made
    # the concatenated text (and anything exported from it) differ per run.
    return " ".join(dict.fromkeys(descriptions))


# Collapse the rows into one row per
# (HotelID, Name Hotel, Address, Location) combination.
# NOTE(review): the key includes name/address/location as well as HotelID, so
# a HotelID whose other fields vary across rows yields several output rows;
# confirm this is intended.
df_grouped = df.groupby(
    ["HotelID", "Name Hotel", "Address", "Location"],
    as_index=False
).agg({
    "Descriptions": _join_unique,
    "Rating": "mean"  # average rating over the rows of each group
})

# Keep one decimal place for the averaged rating.
df_grouped["Rating"] = df_grouped["Rating"].round(1)

In [9]:
# Preview the first five grouped rows, transposed so each hotel is a column.
df_grouped.iloc[:5].T

Unnamed: 0,0,1,2,3,4
HotelID,1,2,3,4,5
Name Hotel,Bebe Homestay Phu Quoc,Coucou Homestead - I,Sapphire Boutique Hotel Da Nang,Saigon Amigo Hotel,Hanoi Graceful Hotel
Address,"To 4 ap Ganh Gio Xa Cua Duong, Ong Lang, Phú Q...","121/35 Lê Thị Riêng, Quận 1, TP. Hồ Chí Minh...","19 Le Binh Street, An Hai Bac Ward, Son Tra Di...","185/24 Pham Ngu Lao, Quận 1, TP. Hồ Chí Minh...","21 Hang Phen, Quận Hoàn Kiếm, Hà Nội, Việt Nam"
Location,Phú Quốc,TP. Hồ Chí Minh,Đà Nẵng,TP. Hồ Chí Minh,Hà Nội
Descriptions,"Nằm trên đảo Phú Quốc, Bebe Homestay Phu Quoc ...",Nằm cách Chợ ẩm thực đường phố Bến Thành chưa ...,"Tọa lạc tại thành phố Đà Nẵng, cách Bãi biển M...",Saigon Amigo Hotel có các phòng nghỉ máy lạnh ...,"Tọa lạc ở trung tâm Quận Hoàn Kiếm, Hanoi Grac..."
Rating,8.1,7.0,8.0,7.1,7.2


In [10]:
# Not needed: groupby(..., as_index=False) already yields a default RangeIndex.
# df_grouped = df_grouped.reset_index(drop=True)

In [11]:
# Print dtypes and non-null counts of the grouped frame.
df_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4533 entries, 0 to 4532
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HotelID       4533 non-null   int64  
 1   Name Hotel    4533 non-null   object 
 2   Address       4533 non-null   object 
 3   Location      4533 non-null   object 
 4   Descriptions  4533 non-null   object 
 5   Rating        4533 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 212.6+ KB


In [12]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df_grouped.describe()

Unnamed: 0,HotelID,Rating
count,4533.0,4533.0
mean,2252.565409,7.299713
std,1301.664265,0.958663
min,1.0,4.0
25%,1124.0,6.7
50%,2253.0,7.3
75%,3380.0,8.0
max,4506.0,10.0


In [13]:
# Show the number of rows and columns of the DataFrame
df_grouped.shape

(4533, 6)

In [14]:
# Count rows that are exact duplicates of an earlier row
df_grouped.duplicated().sum()

np.int64(0)

## Loại bỏ những dòng dữ liệu có mô tả bằng tiếng Anh

In [21]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ------------------------------------- 981.5/981.5 kB 10.6 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993250 sha256=f98cf04435a28a376f9fd80365d0886dce775ad0247ca35a30b35980cf686139
  Stored in directory: c:\users\thanh\appdata\local\pip\cache\wheels\eb\87\25\2dddf1c94e1786054e25022ec5530bfed52bad86d882999c48
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Note: you may need to restart the kernel to use updated packages.


  DEPRECATION: Building 'langdetect' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'langdetect'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [15]:
from langdetect import detect, DetectorFactory

In [17]:
DetectorFactory.seed = 0  # fixed seed so language detection gives stable, repeatable results

def is_english(text):
    """Return True when ``detect`` classifies ``text`` as English.

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    bool
        True if ``detect(text)`` returns ``"en"``. False for any other
        result, or when detection raises an ordinary exception (best-effort:
        such text is treated as non-English).
    """
    try:
        return detect(text) == "en"
    except Exception:
        # Deliberate best-effort fallback. Catching ``Exception`` rather than
        # using a bare ``except`` lets KeyboardInterrupt/SystemExit propagate,
        # so a long row-by-row run can still be interrupted.
        return False

# Flag every row whose description is detected as English, then show the split.
df_grouped["IsEnglish"] = df_grouped["Descriptions"].map(is_english)
is_english_counts = df_grouped["IsEnglish"].value_counts()
print(is_english_counts)

IsEnglish
False    3896
True      637
Name: count, dtype: int64


In [18]:
# Keep only the rows not flagged as English, then renumber the index from 0.
df_grouped = df_grouped.loc[~df_grouped["IsEnglish"]].reset_index(drop=True)

In [19]:
# Re-print the IsEnglish counts as a sanity check after the filtering step.
print(df_grouped["IsEnglish"].value_counts())

IsEnglish
False    3896
Name: count, dtype: int64


In [20]:
# Export to a new CSV file (no index column; "utf-8-sig" prepends a UTF-8 BOM).
# NOTE(review): the helper column IsEnglish appears to still be part of
# df_grouped at this point and would be written to the file too; confirm that is intended.
df_grouped.to_csv("dataset.csv", index=False, encoding="utf-8-sig")