In [7]:
# ============================================================
# Step 1 and 2: Load and Clean the dataset
# ============================================================
# `load_and_clean_data` is imported from the `cleaning` module (presumably a
# project-local helper — its loading/cleaning logic is not visible here).
from cleaning import load_and_clean_data

# Called with no arguments; the result is bound to `df_clean`, which the later
# steps use as a pandas DataFrame (they call .shape, .head() and .isna() on it).
df_clean = load_and_clean_data()

In [None]:
# ============================================================
# Step 3: Explore the structure (CLEANED DATA)
# ============================================================
# Prints the shape, a five-row preview, a column/dtype table and a per-column
# missing-value summary for `df_clean` (created in the Step 1 and 2 cell).
# `display` is not imported in this cell; it is expected to be provided by the
# notebook environment.

import pandas as pd


print("=== DATASET STRUCTURE (Step 3 – CLEANED DATA) ===")

# Shape
print(f"Shape (rows, columns): {df_clean.shape}")

# Preview rows
print("\nFirst rows of cleaned data:")
display(df_clean.head(5))

# Column names and data types
print("\nColumn names and data types:")
dtypes_df = (
    pd.DataFrame({
        "column": df_clean.columns,
        "dtype": df_clean.dtypes.astype(str)
    })
    # The dtypes Series is indexed by column name, which would duplicate the
    # "column" field; swap it for a plain 0..n-1 index.
    .reset_index(drop=True)
)
display(dtypes_df)

# Missing values (AFTER cleaning)
# Per-column NaN count, and that count as a percentage of all rows.
missing_count = df_clean.isna().sum()
missing_pct = (missing_count / len(df_clean) * 100).round(2)

missing_summary = (
    pd.DataFrame({
        "missing_count": missing_count,
        "missing_pct": missing_pct
    })
    # Use a stable sort so columns with the same missing count (typically the
    # many columns with 0) stay in df_clean's column order instead of the
    # arbitrary order the default quicksort leaves ties in.
    .sort_values(by="missing_count", ascending=False, kind="stable")
)

print("\nMissing values summary (cleaned data – top 15):")
display(missing_summary.head(15))

=== DATASET STRUCTURE (Step 3 – CLEANED DATA) ===
Shape (rows, columns): (504, 20)

First rows of cleaned data:


Unnamed: 0,User country,Nr. reviews,Nr. hotel reviews,Helpful votes,Score,Period of stay,Traveler type,Pool,Gym,Tennis court,Spa,Casino,Free internet,Hotel name,Hotel stars,Nr. rooms,User continent,Member years,Review month,Review weekday
0,USA,11.0,4.0,13.0,5.0,Dec-Feb,Friends,False,True,False,False,True,True,Circus Circus Hotel & Casino Las Vegas,3.0,3773.0,North America,9.0,January,Thursday
1,USA,119.0,21.0,75.0,3.0,Dec-Feb,Business,False,True,False,False,True,True,Circus Circus Hotel & Casino Las Vegas,3.0,3773.0,North America,3.0,January,Friday
2,USA,36.0,9.0,25.0,5.0,Mar-May,Families,False,True,False,False,True,True,Circus Circus Hotel & Casino Las Vegas,3.0,3773.0,North America,2.0,February,Saturday
3,UK,14.0,7.0,14.0,4.0,Mar-May,Friends,False,True,False,False,True,True,Circus Circus Hotel & Casino Las Vegas,3.0,3773.0,Europe,6.0,February,Friday
4,Canada,5.0,5.0,2.0,4.0,Mar-May,Solo,False,True,False,False,True,True,Circus Circus Hotel & Casino Las Vegas,3.0,3773.0,North America,7.0,March,Tuesday



Column names and data types:


Unnamed: 0,column,dtype
0,User country,object
1,Nr. reviews,float64
2,Nr. hotel reviews,float64
3,Helpful votes,float64
4,Score,float64
5,Period of stay,object
6,Traveler type,object
7,Pool,bool
8,Gym,bool
9,Tennis court,bool



Missing values summary (cleaned data – top 15):


Unnamed: 0,missing_count,missing_pct
Spa,16,3.17
Score,13,2.58
Period of stay,10,1.98
User country,0,0.0
Helpful votes,0,0.0
Nr. hotel reviews,0,0.0
Nr. reviews,0,0.0
Traveler type,0,0.0
Pool,0,0.0
Gym,0,0.0


In [9]:
# ============================================================
# Step 4: Categorical Information
# ============================================================
# Reports the number of distinct hotels, the frequency of each traveler type,
# and a human-readable description for every column of `df_clean` (created in
# the Step 1 and 2 cell). `display` is expected to be provided by the notebook
# environment.

# Imported here as well so this cell runs on its own, without requiring the
# Step 3 cell to have been executed first.
import pandas as pd

print("=== STEP 4: CATEGORICAL INFORMATION ===")

# 1) How many unique hotel names are in the dataset?
n_hotels = df_clean["Hotel name"].nunique()
print(f"\nNumber of unique hotel names: {n_hotels}")

# 2) What traveler types are represented?
print("\nTraveler types represented:")
traveler_types = df_clean["Traveler type"].value_counts()
display(traveler_types)

# 3) What does each column describe?
print("\nColumn descriptions:")

# Hand-written descriptions keyed by column name; columns missing from this
# mapping fall back to a placeholder text below.
column_descriptions = {
    "User country": "Country of the reviewer.",
    "User continent": "Continent or region of the reviewer.",
    "Nr. reviews": "Total number of reviews written by the user.",
    "Nr. hotel reviews": "Number of hotel reviews written by the user.",
    "Helpful votes": "Number of helpful votes received.",
    "Score": "Overall rating score given by the reviewer.",
    "Period of stay": "Time period during which the stay occurred.",
    # Example values match the categories actually present in the data
    # ("Families", not "Family").
    "Traveler type": "Type of traveler (e.g., Couples, Business, Families).",
    "Pool": "Whether the hotel has a pool.",
    "Gym": "Whether the hotel has a gym.",
    "Tennis court": "Whether the hotel has a tennis court.",
    "Spa": "Whether the hotel has a spa.",
    "Casino": "Whether the hotel has a casino.",
    "Free internet": "Whether the hotel offers free internet.",
    "Hotel name": "Name of the hotel.",
    "Hotel stars": "Official star rating of the hotel.",
    "Nr. rooms": "Number of rooms in the hotel.",
    "Member years": "Number of years the user has been a member.",
    "Review month": "Month when the review was posted.",
    "Review weekday": "Weekday when the review was posted.",
}

# One row per column of df_clean, in the DataFrame's own column order.
desc_df = pd.DataFrame({
    "Column": df_clean.columns,
    "Description": [
        column_descriptions.get(col, "No description available.")
        for col in df_clean.columns
    ]
})

# Lift pandas' default column-width limit for this display only, so long
# descriptions are shown in full instead of being cut off with "...".
with pd.option_context("display.max_colwidth", None):
    display(desc_df)


=== STEP 4: CATEGORICAL INFORMATION ===

Number of unique hotel names: 21

Traveler types represented:


Traveler type
Couples     214
Families    110
Friends      82
Business     74
Solo         24
Name: count, dtype: int64


Column descriptions:


Unnamed: 0,Column,Description
0,User country,Country of the reviewer.
1,Nr. reviews,Total number of reviews written by the user.
2,Nr. hotel reviews,Number of hotel reviews written by the user.
3,Helpful votes,Number of helpful votes received.
4,Score,Overall rating score given by the reviewer.
5,Period of stay,Time period during which the stay occurred.
6,Traveler type,"Type of traveler (e.g., Couples, Business, Fam..."
7,Pool,Whether the hotel has a pool.
8,Gym,Whether the hotel has a gym.
9,Tennis court,Whether the hotel has a tennis court.
