In [2]:
import pandas as pd

In [None]:
# Load the raw 2024 exports; each CSV becomes its own DataFrame.
cal = pd.read_csv('data/raw/calendar2024.csv')
lis = pd.read_csv('data/raw/listings2024.csv')
rev = pd.read_csv('data/raw/reviews2024.csv')

# Report (rows, columns) of every frame as a quick load check.
for label, frame in (("Listings", lis), ("Calendar", cal), ("Reviews", rev)):
    print(f"{label} shape:", frame.shape)

In [None]:
# List every column with fewer than 3 distinct non-null values, for each raw
# frame loaded in the previous cell. The frames are named explicitly so the
# cell runs on a fresh kernel (no reliance on a leftover `df` variable).
for frame_name, frame in (("Listings", lis), ("Calendar", cal), ("Reviews", rev)):
    print(f"--- {frame_name} ---")
    for col in frame.columns:
        n_unique = frame[col].nunique()
        if n_unique < 3:
            unique_values = frame[col].unique()
            print(f"{col}: {n_unique} unique values")
            print(f"Values: {unique_values}\n")

# Clean

In [None]:
# Remove listing columns in which every single value is missing.
all_null_mask = lis.isna().all()
null_cols_lis = lis.columns[all_null_mask].tolist()
lis = lis.drop(columns=null_cols_lis)

# Remove columns that are not carried into the processed datasets.
lis.drop(
    columns=[
        'scrape_id',
        'host_name',
        'picture_url',
        'host_url',
        'host_thumbnail_url',
        'host_picture_url',
    ],
    inplace=True,
)
cal.drop(columns=['adjusted_price'], inplace=True)
rev.drop(columns=['reviewer_name'], inplace=True)


def convert_to_boolean(df, columns, true_value='t'):
    """Convert string indicator columns to nullable booleans, in place.

    Each listed column is replaced by a pandas ``boolean`` (nullable) column
    that is True where the original value equals ``true_value``, False for any
    other non-missing value, and ``<NA>`` where the original value was
    missing. A plain ``== true_value`` comparison would silently report
    missing values as False.

    Args:
        df: DataFrame to modify; it is mutated in place.
        columns: Iterable of column names to convert.
        true_value: Raw value that represents True (default ``'t'``).

    Returns:
        The same DataFrame object that was passed in.
    """
    for col in columns:
        raw = df[col]
        flags = raw.eq(true_value).astype('boolean')
        # Restore missingness: rows without a value stay unknown, not False.
        df[col] = flags.mask(raw.isna())
    return df

# Listing columns that are converted from the 't' indicator to booleans.
boolean_cols = [
    'instant_bookable',
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
]
lis = convert_to_boolean(lis, boolean_cols)
# The calendar carries a single flag of the same kind.
cal['available'] = cal['available'].eq('t')


def convert_to_datetime(df, columns):
    """Parse the given columns with ``pd.to_datetime``, in place.

    Args:
        df: DataFrame whose columns are overwritten.
        columns: Iterable of column names to parse.

    Returns:
        The same DataFrame object, after modification.
    """
    for name in columns:
        parsed = pd.to_datetime(df[name])
        df[name] = parsed
    return df

# Date-like listing columns parsed through the helper above.
datetime_cols_lis = [
    'calendar_last_scraped',
    'first_review',
    'last_review',
    'last_scraped',
    'host_since',
]
lis = convert_to_datetime(lis, datetime_cols_lis)
# Calendar and reviews each have one 'date' column to parse.
for frame in (cal, rev):
    frame['date'] = pd.to_datetime(frame['date'])


def convert_to_type(df, columns, dtype):
    """Cast the given columns to ``dtype`` with ``Series.astype``, in place.

    Args:
        df: DataFrame whose columns are overwritten.
        columns: Iterable of column names to cast.
        dtype: Target dtype, in any form accepted by ``astype``.

    Returns:
        The same DataFrame object, after modification.
    """
    for name in columns:
        converted = df[name].astype(dtype)
        df[name] = converted
    return df

# Free-text / categorical listing columns cast to the pandas "string" dtype.
string_columns = [
    'bathrooms_text',
    'neighbourhood',
    'neighbourhood_cleansed',
    'property_type',
    'room_type',
    'host_location',
    'host_about',
    'host_neighbourhood',
    'listing_url',
    'host_response_time',
    'source',
    'name',
    'description',
    'neighborhood_overview',
]
lis = convert_to_type(lis, string_columns, "string")
rev['comments'] = rev['comments'].astype("string")


# Percentage strings such as "95%" become fractions (0.95) in "<name>_pct" columns.
percentage_cols = ['host_response_rate', 'host_acceptance_rate']
for col in percentage_cols:
    pct_name = f"{col}_pct"
    lis = lis.rename(columns={col: pct_name})
    without_sign = lis[pct_name].str.rstrip('%')
    lis[pct_name] = without_sign.astype('float') / 100

# Prices: strip '$' and thousands separators, then parse as numbers; values
# that still fail to parse become NaN (errors='coerce').
# NOTE(review): the _DKK / _USD suffixes encode the author's assumption about
# each file's currency - confirm against the data source.
lis['price'] = pd.to_numeric(
    lis['price'].str.replace(r'[\$,]', '', regex=True), errors='coerce'
)
lis = lis.rename(columns={'price': 'price_DKK'})

cal['price'] = pd.to_numeric(
    cal['price'].str.replace(r'[\$,]', '', regex=True), errors='coerce'
)
cal = cal.rename(columns={'price': 'price_USD'})

########## Handling list columns ##########

# Number of amenities per listing, counted as comma-separated tokens inside
# the brackets. An empty list ("[]") splits into one empty token, so its count
# is forced to 0 instead of 1; missing values stay missing.
# NOTE(review): an amenity whose own text contains a comma is still counted
# as several tokens.
lis['amenities_count'] = (
    lis['amenities'].str.strip('[]').str.split(',').str.len()
    .mask(lis['amenities'].str.strip('[] ').eq(''), 0)
)


def clean_amenity(text):
    """Normalise one raw amenity token to lowercase ASCII words.

    Steps, in order: coerce to ``str``; trim surrounding whitespace, quotes,
    dots and dashes; decode literal backslash-u escape sequences; drop every
    non-ASCII character; remove anything that is not a letter, digit or
    whitespace; collapse whitespace runs to a single space; lowercase and
    trim.

    Args:
        text: Raw amenity value (any object; it is passed through ``str``).

    Returns:
        The cleaned string, which may be empty.
    """
    import re  # local import: the notebook's import cell only brings in pandas

    text = str(text).strip().strip('"\'').strip('.- ')
    # Decode literal backslash-u escapes (e.g. an escaped en dash) into the
    # characters they denote, so the ASCII filter below removes them instead
    # of leaving tokens such as "u2013" in the result.
    text = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), text)
    # Drop every character that has no ASCII representation.
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Keep only letters, digits and whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse whitespace last: removing punctuation can leave double spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

# Rewrite each listing's amenities as a sorted, comma-separated string of
# cleaned tokens (tokens that clean to an empty string are left out), stored
# with the pandas "string" dtype.
raw_amenity_lists = lis['amenities'].str.strip('[]').str.split(',')
lis['amenities'] = raw_amenity_lists.apply(
    lambda tokens: ','.join(
        sorted(cleaned for cleaned in map(clean_amenity, tokens) if cleaned)
    )
).astype('string')

# Parse each host's bracketed, quote-wrapped verification list into a Python list.
verifications_body = lis['host_verifications'].str.strip('[]').str.replace("'", "")
lis['host_verifications'] = verifications_body.str.split(', ')

# Count verifications per host. An empty list ("[]") splits into one empty
# token, so its count is forced to 0 instead of 1; missing values stay missing.
lis['host_verifications_count'] = (
    lis['host_verifications'].str.len().mask(verifications_body.eq(''), 0)
)

# One-hot encode the verification methods into 'verification_<method>' columns.
verification_dummies = (
    lis['host_verifications'].str.join('|').str.get_dummies().add_prefix('verification_')
)
lis = pd.concat([lis, verification_dummies], axis=1)

# The list-like source columns are not kept in the processed frame.
lis.drop(columns=['host_verifications', 'amenities'], inplace=True)

print("Listings shape:", lis.shape)
print("Calendar shape:", cal.shape)
print("Reviews shape:", rev.shape)

In [51]:
# Write each processed frame to its parquet file.
for frame, target in (
    (lis, 'data/processed/02_listings.parquet'),
    (cal, 'data/processed/02_calendar.parquet'),
    (rev, 'data/processed/02_reviews.parquet'),
):
    frame.to_parquet(target)

In [52]:
# Read the processed parquet files back into separate frames (lis2/cal2/rev2).
lis2, cal2, rev2 = (
    pd.read_parquet(path)
    for path in (
        'data/processed/02_listings.parquet',
        'data/processed/02_calendar.parquet',
        'data/processed/02_reviews.parquet',
    )
)

In [53]:
# Using PostgreSQL COPY command
# COPY table_name TO 'output.csv' WITH (FORMAT CSV, HEADER);

In [None]:
# Show the listings whose price_DKK is among the five largest values.
# NOTE(review): `lis_df` is first assigned in the EDA section further down
# (read from 02_listings.parquet); on a fresh top-to-bottom run this cell
# raises NameError.
lis_df[lis_df['price_DKK'].isin(lis_df['price_DKK'].nlargest(5))]

In [None]:
# The five largest price_DKK values.
# NOTE(review): depends on `lis_df`, which is only assigned in the EDA section below.
lis_df['price_DKK'].nlargest(5)

In [None]:
lis.dtypes.value_counts()

In [None]:
# NOTE(review): the cleaning cell drops 'host_verifications' from `lis`, so
# this lookup only works if it is run before that drop.
lis.host_verifications

---

In [None]:
lis[lis.columns[-10:]]#.info()

In [None]:
lis[lis.columns[-20:]].info()

In [None]:
lis.iloc[:10, 40:]

In [None]:
lis[lis.columns[10:20]].info()

In [None]:
lis[lis.columns[10:20]]

In [None]:
# NOTE(review): this reads 03_listings.parquet, while the save cells visible in
# this notebook only write 02_* files - confirm where the 03_ file is produced.
df = pd.read_parquet('data/processed/03_listings.parquet')
# Select the columns whose name starts with 'verification_' (the prefix the
# cleaning cell adds to the one-hot verification columns).
verification_cols = [col for col in df.columns if col.startswith('verification_')]
df[verification_cols]

In [None]:
# Names of the verification methods, i.e. the 'verification_' columns with the
# prefix text removed, printed as one comma-separated line.
prefix = 'verification_'
verification_cols = [
    name.replace(prefix, '') for name in df.columns if name.startswith(prefix)
]
methods_text = ', '.join(verification_cols)
print("Available verification methods:", methods_text)


In [None]:
# NOTE(review): `lis_df` is only assigned in the EDA section below, and the
# cleaning cell drops 'host_verifications' before 02_listings.parquet is
# written - confirm this column is still expected here.
lis_df['host_verifications']

---

In [None]:

# Calculate review score variance: how consistent the review scores a host received from guests are
# review_score_cols = [col for col in lis_df.columns if col.startswith('review_scores_')]
# lis_df['review_scores_variance'] = lis_df[review_score_cols].var(axis=1)

# Drop review score columns except rating and variance
# cols_to_drop = [col for col in lis_df.columns if 'review_scores_' in col 
#                 and col != 'review_scores_rating' 
#                 and col != 'review_scores_variance']
# lis_df.drop(columns=cols_to_drop, inplace=True)

## Amenity analysis

In [None]:
# Define key amenity categories that are most relevant for pricing/booking
IMPORTANT_AMENITIES = {
    'essentials': ['Wifi', 'Kitchen', 'Heating', 'Air conditioning', 'Washer'],
    'luxury': ['Pool', 'Hot tub', 'Gym', 'Free parking'],
    'safety': ['Smoke alarm', 'Carbon monoxide alarm', 'Fire extinguisher']
}

# NOTE(review): the cleaning cell drops 'amenities' from `lis`; this cell needs
# that column to be present.
amenities_text = lis['amenities']

# Create one has_<amenity> flag per important amenity and one <category>_count
# per category.
for category, items in IMPORTANT_AMENITIES.items():
    flag_cols = []
    for item in items:
        flag_col = f'has_{item.lower().replace(" ", "_")}'
        # Literal (non-regex), case-insensitive substring match; listings with
        # no amenities value are flagged False instead of NA.
        lis[flag_col] = amenities_text.str.contains(item, case=False, regex=False, na=False)
        flag_cols.append(flag_col)

    # The category count is the number of that category's flags that are set,
    # which avoids repeating the substring search row by row.
    lis[f'{category}_count'] = lis[flag_cols].sum(axis=1)

In [None]:
# Write each processed frame to its parquet file.
for frame, target in (
    (lis, 'data/processed/02_listings.parquet'),
    (cal, 'data/processed/02_calendar.parquet'),
    (rev, 'data/processed/02_reviews.parquet'),
):
    frame.to_parquet(target)

---

In [None]:
# Read the processed parquet files back into separate frames (lis2/cal2/rev2).
lis2, cal2, rev2 = (
    pd.read_parquet(path)
    for path in (
        'data/processed/02_listings.parquet',
        'data/processed/02_calendar.parquet',
        'data/processed/02_reviews.parquet',
    )
)

In [None]:
lis['amenities']

In [None]:
# Collect the distinct amenity tokens (whitespace-trimmed) across all listings.
all_amenities = set()
for tokens in lis['amenities'].str.strip('[]').str.split(','):
    all_amenities.update(token.strip() for token in tokens)

# Print them in sorted order for manual review.
sorted_amenities = sorted(all_amenities)

print(f"Total unique amenities: {len(sorted_amenities)}")
print("\nAll unique amenities:")
for name in sorted_amenities:
    print(f"- {name}")

In [None]:
# Collect the distinct amenity tokens after light normalisation.
all_amenities = set()

for tokens in lis['amenities'].str.strip('[]').str.split(','):
    for token in tokens:
        # Trim whitespace, then surrounding quotes, then dots/dashes/spaces;
        # finally lowercase.
        all_amenities.add(token.strip().strip('"\'').strip('.- ').lower())

# Print them in sorted order for manual review.
sorted_amenities = sorted(all_amenities)
print(f"Total unique amenities: {len(sorted_amenities)}")
print("\nAll unique amenities:")
for name in sorted_amenities:
    print(f"- {name}")

In [None]:
# lis['amenities'].str.strip('[]').str.split(',').apply(
#     lambda x: [all_amenities.add(
#         # Clean each amenity string:
#         item.strip().strip('"\'').strip('.- ').lower()  # Remove quotes, dashes, dots, and extra spaces
#     ) for item in x])

def clean_amenity(text):
    """Normalise one raw amenity token to lowercase ASCII words.

    Steps, in order: coerce to ``str``; trim surrounding whitespace, quotes,
    dots and dashes; decode literal backslash-u escape sequences; drop every
    non-ASCII character; remove anything that is not a letter, digit or
    whitespace; collapse whitespace runs to a single space; lowercase and
    trim.

    Args:
        text: Raw amenity value (any object; it is passed through ``str``).

    Returns:
        The cleaned string, which may be empty.
    """
    import re  # local import: the notebook's import cell only brings in pandas

    text = str(text).strip().strip('"\'').strip('.- ')
    # Decode literal backslash-u escapes (e.g. an escaped en dash) into the
    # characters they denote, so the ASCII filter below removes them instead
    # of leaving tokens such as "u2013" in the result.
    text = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), text)
    # Drop every character that has no ASCII representation.
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Keep only letters, digits and whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse whitespace last: removing punctuation can leave double spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

# Rewrite each listing's amenities as a sorted, comma-separated string of
# cleaned tokens (tokens that clean to an empty string are left out), stored
# with the pandas "string" dtype.
raw_amenity_lists = lis['amenities'].str.strip('[]').str.split(',')
lis['amenities'] = raw_amenity_lists.apply(
    lambda tokens: ','.join(
        sorted(cleaned for cleaned in map(clean_amenity, tokens) if cleaned)
    )
).astype('string')

# Sanity check: collect every distinct cleaned amenity. Each token is cleaned
# once and empty results are filtered out before they reach the set.
all_amenities = set()
for tokens in lis['amenities'].str.strip('[]').str.split(','):
    all_amenities.update(filter(None, map(clean_amenity, tokens)))

# Sort and print the unique amenities to review
sorted_amenities = sorted(all_amenities)
print(f"Total unique amenities: {len(sorted_amenities)}")
print("\nAll unique amenities:")
for amenity in sorted_amenities:
    print(f"- {amenity}")

# # Clean and convert amenities to comma-separated string
# lis['amenities'] = lis['amenities'].str.strip('[]').str.split(',').apply(
#     lambda x: ','.join(
#         sorted(  # Sort for consistency
#             filter(None, # Remove empty strings
#                 [clean_amenity(item) for item in x]
#             )
#         )
#     )
# )


# # Display a few examples of the cleaned amenities
# print("Sample of cleaned amenities:")
# print(lis['amenities'][:10])

In [None]:
lis['amenities']

In [None]:
len(all_amenities)

In [None]:
print(lis.amenities.str.strip('[]').str.split(',').str.len().max())

In [None]:
# NOTE(review): the cleaning cell renames cal's 'price' column to 'price_USD',
# so `cal.price` only resolves if this runs before that rename.
print(cal.price.unique())#.nunique())
# Number of columns per dtype in the listings frame.
lis.dtypes.value_counts()

## Checking the currency inconsistency between the calendar and listings CSVs

In [None]:
cal.loc[cal['listing_id'] == 262961]

In [None]:
lis.loc[lis['id'] == 7631726, ['id', 'price_DKK']]

In [None]:
lis[['id','listing_url', 'price_DKK']][:2]
# calendar_df[calendar_df['listing_id'] == 31094]

---

In [None]:
# Replace missing entries in `lis` with None: `where` keeps values where the
# notnull mask is True and substitutes None everywhere else.
# NOTE(review): whether None actually survives in numeric/datetime columns
# depends on pandas' dtype handling - verify the result if real None objects
# are required downstream.
lis = lis.where(pd.notnull(lis), None)

In [None]:
# Sanity check
# print("Calendar date range:", cal['datetime'].min(), "to", cal['datetime'].max())

In [None]:
# Count distinct non-null values once per column, then report the columns
# that have fewer than 3.
unique_counts = {col: lis[col].nunique() for col in lis.columns}
low_unique_cols = [col for col, count in unique_counts.items() if count < 3]
print("Columns with less than 3 unique values:")
for col in low_unique_cols:
    print(f"{col}: {unique_counts[col]}")

In [None]:
# print(rev.reviewer_name)#.nunique())
rev.describe()

---

# 01_init_EDA

In [7]:
import pandas as pd
import matplotlib.pyplot as plt

In [9]:
# Load the processed listings, calendar and reviews frames for the EDA.
lis_df, cal_df, rev_df = (
    pd.read_parquet(path)
    for path in (
        'data/processed/02_listings.parquet',
        'data/processed/02_calendar.parquet',
        'data/processed/02_reviews.parquet',
    )
)

### 2. Initial Data Preview

In [None]:
lis_df.head()

In [None]:
cal_df.head()

In [None]:
rev_df.head()

### 3. Data Structure Analysis

In [13]:
def explore_dataset(df, name):
    """Print an overview of ``df`` and return its missing-value table.

    The printed report contains, in order: shape, dtypes, ``describe()``
    statistics, every column with fewer than 3 distinct non-null values
    (with those values), and the columns that have missing entries.

    Args:
        df: DataFrame to summarise.
        name: Label printed in the report header.

    Returns:
        DataFrame indexed by column name with 'Missing Values' (count) and
        'Percentage' (count / number of rows * 100) for every column.
    """
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"Dataset: {name}")
    print(f"{banner}")

    print("\n1. Basic Information:")
    print(f"Shape: {df.shape}")

    print("\n2. Data Types:")
    print(df.dtypes)

    print("\n3. Summary Statistics:")
    print(df.describe())

    print("\n4. Unique Values:")
    for col in df.columns:
        column = df[col]
        n_unique = column.nunique()
        if n_unique >= 3:
            continue
        print(f"{col}: {n_unique} unique values")
        print(f"Values: {column.unique()}\n")

    print("\n5. Missing Values:")
    # TODO (Elias): add the unique-value counts to this table as well.
    missing = df.isnull().sum()
    missing_info = pd.DataFrame({
        'Missing Values': missing,
        'Percentage': (missing / len(df)) * 100
    })
    has_missing = missing_info['Missing Values'] > 0
    print(missing_info[has_missing])

    return missing_info

In [None]:
listings_missing = explore_dataset(lis_df, 'Listings')

In [None]:
calendar_missing = explore_dataset(cal_df, 'Calendar')

In [None]:
reviews_missing = explore_dataset(rev_df, 'Reviews')

### 4. Missing Values Visualization

In [None]:
def plot_missing_values(missing_info, title, min_pct=0.001):
    """Bar-plot the percentage of missing values per column.

    Args:
        missing_info: DataFrame indexed by column name with a 'Missing Values'
            count column and a 'Percentage' column (the layout returned by
            ``explore_dataset``).
        title: Dataset name inserted into the figure title.
        min_pct: Only columns whose 'Percentage' is strictly greater than this
            value are drawn. 'Percentage' is on a 0-100 scale, so the default
            of 0.001 means 0.001 %.

    Returns:
        None. The figure is displayed with ``plt.show()``. When no column
        passes the threshold, a message is printed and no figure is created.
    """
    missing_filtered = missing_info[missing_info['Percentage'] > min_pct]

    # Nothing to draw: report it instead of attempting an empty bar chart.
    if missing_filtered.empty:
        print(f"No columns in {title} exceed {min_pct}% missing values; nothing to plot.")
        return

    plt.figure(figsize=(18, 10))
    ax = missing_filtered['Percentage'].plot(kind='bar')
    plt.title(f'Missing Values in {title} Dataset')
    plt.xlabel('Columns')
    plt.ylabel('Percentage Missing')
    plt.xticks(rotation=45, ha='right')

    # Label each bar with its absolute number of missing values; bars sit at
    # integer x positions 0..n-1, and the label is anchored at the bar's top.
    bar_data = zip(missing_filtered['Missing Values'], missing_filtered['Percentage'])
    for position, (count, pct) in enumerate(bar_data):
        ax.text(position, pct, f'{int(count):,}',
                ha='left', va='bottom', fontsize=8, rotation=45)

    plt.tight_layout()
    plt.show()

# Plot missing values for each dataset
plot_missing_values(listings_missing, 'Listings')
# plot_missing_values(calendar_missing, 'Calendar')
# plot_missing_values(reviews_missing, 'Reviews')

### 5. Data Quality Assessment

In [19]:
def identify_data_quality_issues(df, name):
    """Print a data-quality report for ``df``.

    The report covers four checks and only prints a section when it finds
    something:
      1. fully duplicated rows;
      2. zeros and negative values in int64/float64 columns;
      3. empty and whitespace-only values in text columns - both ``object``
         columns and columns using the pandas ``string`` dtype;
      4. values more than 3 standard deviations from the column mean in
         int64/float64 columns.

    Args:
        df: DataFrame to inspect.
        name: Label printed in the report header.

    Returns:
        None. All findings are printed.
    """
    print(f"\n{'='*50}")
    print(f"Data Quality Report for {name}")
    print(f"{'='*50}")

    # 1. Check for duplicates
    n_duplicates = df.duplicated().sum()
    if n_duplicates > 0:
        print(f"\nDuplicate rows: {n_duplicates}")

    # 2. Check for unexpected values
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    has_unexpected = False
    for col in numeric_cols:
        n_zeros = (df[col] == 0).sum()
        n_negative = (df[col] < 0).sum()
        if n_zeros > 0 or n_negative > 0:
            if not has_unexpected:
                print("\nColumns with unexpected values:")
                has_unexpected = True
            print(f"\n{col}:")
            if n_zeros > 0:
                print(f"- Zeros: {n_zeros} ({(n_zeros/len(df))*100:.2f}%)")
            if n_negative > 0:
                print(f"- Negative values: {n_negative} ({(n_negative/len(df))*100:.2f}%)")

    # 3. Check text columns for data inconsistencies. Columns cast to the
    # pandas "string" dtype are not of dtype object, so both kinds are
    # selected explicitly.
    string_cols = [
        col for col, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
    ]
    has_inconsistencies = False
    for col in string_cols:
        n_empty = (df[col] == '').sum()
        n_whitespace = df[col].str.isspace().sum()
        if n_empty > 0 or n_whitespace > 0:
            if not has_inconsistencies:
                print("\nColumns with inconsistencies:")
                has_inconsistencies = True
            print(f"\n{col}:")
            if n_empty > 0:
                print(f"- Empty strings: {n_empty}")
            if n_whitespace > 0:
                print(f"- Whitespace only: {n_whitespace}")

    # 4. Check for extreme values in numeric columns
    has_outliers = False
    for col in numeric_cols:
        mean = df[col].mean()
        std = df[col].std()
        outliers = df[col][abs(df[col] - mean) > 3*std]
        if len(outliers) > 0:
            if not has_outliers:
                print("\nColumns with outliers (beyond 3 std devs):")
                has_outliers = True
            print(f"\n{col}:")
            print(f"- Number of outliers: {len(outliers)}")
            print(f"- Min outlier: {outliers.min()}")
            print(f"- Max outlier: {outliers.max()}")

In [None]:
# Check Listings dataset
identify_data_quality_issues(lis_df, 'Listings')

In [None]:
# Run the generic quality report on the calendar frame.
identify_data_quality_issues(cal_df, 'Calendar')

# Calendar-specific checks on the 'date' column: covered range, number of
# missing dates, and the span in days.
print("\nChecking calendar date patterns:")
cal_df['date'] = pd.to_datetime(cal_df['date'])
calendar_dates = cal_df['date']
first_date, last_date = calendar_dates.min(), calendar_dates.max()
print(f"Date range: {first_date} to {last_date}")
print(f"Missing dates: {calendar_dates.isnull().sum()}")
print(f"Days between min and max date: {(last_date - first_date).days}")

In [None]:
# Check Reviews dataset
# The report function prints its findings and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
identify_data_quality_issues(rev_df, 'Reviews')

### 6. Data Format Analysis

In [23]:
def check_data_formats(df):
    """Print up to two distinct non-null values per column with their types.

    Args:
        df: DataFrame to sample from.

    Returns:
        None. For every column, the column name is printed followed by one
        "Value: ..., Type: ..." line per sampled value.
    """
    for col in df.columns:
        # First two distinct values, in order of appearance, ignoring nulls.
        samples = df[col].dropna().unique()[:2]
        print(f"\n{col}:")
        for value in samples:
            print(f"Value: {value}, Type: {type(value)}")


In [None]:
check_data_formats(lis_df)

In [None]:
check_data_formats(cal_df)

In [None]:
check_data_formats(rev_df)

### 7. Special Characters Examination

In [27]:
def check_special_characters(df):
    """Report text columns that contain characters outside an allowed set.

    Allowed characters are ASCII letters and digits, whitespace, the
    punctuation ``- . , : / + &`` and the Danish letters æ, ø, å (both cases).
    Both ``object`` columns and pandas ``string`` dtype columns are checked.
    Missing values are skipped, so their text representation (for example
    "<NA>") is never reported as a special character.

    Args:
        df: DataFrame to inspect.

    Returns:
        None. For every offending column a header line and the first five
        offending values are printed.
    """
    pattern = r'[^a-zA-Z0-9\s\-.,:/+&æøåÆØÅ]'
    string_cols = [
        col for col, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
    ]
    for col in string_cols:
        present = df[col].dropna()
        # Compute the match mask once; astype(str) lets object columns that
        # hold non-string values be searched as text.
        flagged = present[present.astype(str).str.contains(pattern, regex=True)]
        if not flagged.empty:
            print(f"\n{col} contains special characters")
            # Show examples of values containing special characters
            print(flagged.head())

In [None]:
check_special_characters(lis_df)


In [29]:
check_special_characters(cal_df)

In [None]:
check_special_characters(rev_df)