# Advanced Data Analysis

In [None]:
import pandas as pd
# Read the processed listings and reviews tables (in that order) from parquet,
# then report the dimensions of each frame.
lis_df, rev_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'data/processed/03_listings.parquet',
        'data/processed/03_reviews.parquet',
    )
)
print("Listings shape:", lis_df.shape)
print("Reviews shape:", rev_df.shape)

In [None]:
# Preview the first five rows of the reviews table.
rev_df.head(n=5)

In [None]:
# Show the four location-related columns side by side for the first 20 listings.
lis_df.loc[
    :,
    ['neighbourhood', 'host_neighbourhood', 'neighbourhood_cleansed', 'host_location'],
].head(20)

In [64]:
# Drop the 'neighbourhood' and 'host_location' columns.
# errors='ignore' keeps this cell idempotent: re-running it once the columns are
# already gone is a no-op instead of raising KeyError.
lis_df = lis_df.drop(columns=['neighbourhood', 'host_location'], errors='ignore')


In [None]:
# List the distinct values found in neighbourhood_cleansed.
lis_df.loc[:, 'neighbourhood_cleansed'].unique()

In [None]:
# List the distinct values found in host_neighbourhood.
lis_df.loc[:, 'host_neighbourhood'].unique()

In [None]:
# Count the rows whose host_neighbourhood and neighbourhood_cleansed disagree.
# NOTE(review): a missing value on either side presumably compares as "different",
# so rows with a null host_neighbourhood may be part of this count — confirm that
# this is intended.
mismatch_count = len(
    lis_df.loc[lis_df['host_neighbourhood'].ne(lis_df['neighbourhood_cleansed'])]
)
print(f"Number of rows where host_neighbourhood differs from neighbourhood_cleansed: {mismatch_count}")

# Print the first few disagreeing rows, restricted to the two compared columns.
print("\nSample of mismatched rows:")
print(
    lis_df.loc[
        lis_df['host_neighbourhood'].ne(lis_df['neighbourhood_cleansed']),
        ['host_neighbourhood', 'neighbourhood_cleansed'],
    ]
    .head()
    .to_string()
)

In [None]:
# Show the neighbourhood columns for listings whose host_neighbourhood is 'Hackney'.
# 'neighbourhood' and 'host_location' were dropped from lis_df in an earlier cell,
# so selecting them here raised KeyError on a top-to-bottom run; only the two
# columns that still exist are selected.
print("Listings where host_neighbourhood is 'Hackney':")
print(
    lis_df.loc[
        lis_df['host_neighbourhood'] == 'Hackney',
        ['host_neighbourhood', 'neighbourhood_cleansed'],
    ].to_string()
)
      

In [None]:
# --- 1. Normalise neighbourhood_cleansed -----------------------------------
# Restore the Danish characters missing from some names and shorten the two
# compound district names. Series.replace with a dict only rewrites values that
# match a key exactly.
mapping = {
    'Nrrebro': 'Nørrebro',
    'sterbro': 'Østerbro',
    'Vanlse': 'Vanløse',
    'Brnshj-Husum': 'Brønshøj',
    'Vesterbro-Kongens Enghave': 'Vesterbro',
}
lis_df['neighbourhood_cleansed'] = lis_df['neighbourhood_cleansed'].replace(mapping)

# --- 2. Count host_neighbourhood values that are not Copenhagen districts ----
# The empty string is part of the list, so blank values are counted as well.
non_cph_neighborhoods = ['', 'Nyboder', 'Niagara','6th Arrondissement', 'Södermalm','Embajadores', 'Batignolles', 'Montmartre', 'Vesturbær', 'Passy', 'Almagro','Clinton Hill', 'Jakkur Layout', 'Jardim das Bandeiras', 'Williamsburg', 'Notting Hill', 'Belém', 'Campo Belo', 'El Madroñal', 'Alphabet City', 'Buzovna', 'Bastille']

# Build the membership mask once and reuse it for both the per-value counts and the total.
is_non_cph = lis_df['host_neighbourhood'].isin(non_cph_neighborhoods)
non_cph_counts = lis_df.loc[is_non_cph, 'host_neighbourhood'].value_counts()

print("Number of listings in each non-Copenhagen neighborhood:")
print(non_cph_counts)

print("\nTotal number of listings with non-Copenhagen neighborhoods:",
      int(is_non_cph.sum()))

# --- 3. Repair host_neighbourhood -------------------------------------------
# Remap Hackney to Nørrebro
lis_df.loc[lis_df['host_neighbourhood'] == 'Hackney', 'host_neighbourhood'] = 'Nørrebro'

# Replace empty host_neighbourhood values with the same row's neighbourhood_cleansed
is_empty_host = lis_df['host_neighbourhood'] == ''
lis_df.loc[is_empty_host, 'host_neighbourhood'] = lis_df.loc[is_empty_host, 'neighbourhood_cleansed']

# Non-Copenhagen values seen on fewer than this many listings are overwritten
# with the listing's own neighbourhood_cleansed; more frequent ones are left as they are.
SMALL_NON_CPH_THRESHOLD = 9
small_non_cph = non_cph_counts[non_cph_counts < SMALL_NON_CPH_THRESHOLD].index

# 'neighbourhood' was dropped from lis_df in an earlier cell, so the before/after
# views below select only the two columns that still exist (selecting the dropped
# column raised KeyError).
mask = lis_df['host_neighbourhood'].isin(small_non_cph)

print("Before changes:")
print(lis_df.loc[mask, ['host_neighbourhood', 'neighbourhood_cleansed']].to_string())

# Update host_neighbourhood to match neighbourhood_cleansed where host_neighbourhood is in small_non_cph
lis_df.loc[mask, 'host_neighbourhood'] = lis_df.loc[mask, 'neighbourhood_cleansed']

# The filter is re-evaluated after the update, so this prints an empty frame
# unless some neighbourhood_cleansed value is itself listed in small_non_cph.
print("\nAfter changes:")
print(
    lis_df.loc[
        lis_df['host_neighbourhood'].isin(small_non_cph),
        ['host_neighbourhood', 'neighbourhood_cleansed'],
    ].to_string()
)


In [None]:
# Check missing values in host_neighbourhood: absolute count and share of all rows.
print("Number of missing values in host_neighbourhood:", lis_df['host_neighbourhood'].isna().sum())
print("\nPercentage of missing values:", (lis_df['host_neighbourhood'].isna().sum() / len(lis_df) * 100).round(2), "%")

# Compare with neighbourhood_cleansed values where host_neighbourhood is missing.
# 'neighbourhood' was dropped from lis_df in an earlier cell, so it is no longer
# selected here (selecting it raised KeyError).
print("\nSample of rows with missing host_neighbourhood:")
print(
    lis_df.loc[
        lis_df['host_neighbourhood'].isna(),
        ['neighbourhood_cleansed', 'host_neighbourhood'],
    ].head()
)


___

In [None]:
# Print a summary of lis_df (columns, non-null counts, dtypes) after the neighbourhood cleanup.
lis_df.info()

In [None]:
# Peek at the first two calendar_last_scraped values.
lis_df['calendar_last_scraped'].iloc[:2]

In [None]:
# Drop the 'last_scraped' and 'source' columns.
# host_response_rate_pct is deliberately NOT dropped here: the host_response_time
# cleanup further down still reads it, and dropping it first made that cell fail
# with KeyError on a top-to-bottom run.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable once the columns are already gone.
lis_df = lis_df.drop(columns=['last_scraped', 'source'], errors='ignore')

___

In [None]:
# Collect every column that is named exactly 'host_total_listings_count' or whose
# name contains 'host_listings', keeping the frame's column order.
host_listing_cols = [
    name
    for name in lis_df.columns
    if name == 'host_total_listings_count' or 'host_listings' in name
]
print("Columns containing 'host_listings' or 'host_total_listings_count':")
print(host_listing_cols)

In [None]:
# Preview the first five rows of the host listing-count columns.
lis_df.loc[:, host_listing_cols].head()

In [None]:
# Put the three host listing-count columns side by side for comparison.
listing_counts = lis_df[
    ['host_listings_count', 'host_total_listings_count', 'calculated_host_listings_count']
]

# Row-wise flag: True where all three counts agree.
all_equal = (
    listing_counts['host_listings_count'].eq(listing_counts['host_total_listings_count'])
    & listing_counts['host_total_listings_count'].eq(listing_counts['calculated_host_listings_count'])
)

print("Cases where all counts are equal:", sum(all_equal))
print("\nCases where counts differ:")
print(listing_counts.loc[~all_equal].head())
print(f"\nTotal cases where counts differ: {sum(~all_equal)}")

# Row-wise flag: True where host_total_listings_count is at least as large as
# each of the other two counts.
is_largest = (
    listing_counts['host_total_listings_count'].ge(listing_counts['host_listings_count'])
    & listing_counts['host_total_listings_count'].ge(listing_counts['calculated_host_listings_count'])
)
print(f"\nIs host_total_listings_count always the largest? {is_largest.all()}")

# Only list counter-examples when at least one row breaks the rule.
if not is_largest.all():
    print("\nCases where host_total_listings_count is not the largest:")
    print(listing_counts.loc[~is_largest].head())


___

In [None]:
# Check whether every listing with an empty host_response_time also has a
# host_response_rate_pct of 0.
empty_response_time = lis_df.loc[lis_df['host_response_time'].eq('')]
print("\nHost response rates when response time is empty:")
print(empty_response_time['host_response_rate_pct'].value_counts())
print(f"\nAll zeros? {empty_response_time['host_response_rate_pct'].eq(0).all()}")

# Label those rows 'never': the response time is empty AND the response rate is 0.
lis_df.loc[
    lis_df['host_response_time'].eq('') & lis_df['host_response_rate_pct'].eq(0),
    'host_response_time',
] = 'never'

# Show the resulting set of response-time categories.
lis_df['host_response_time'].unique()

The cells below run several exploratory analyses to understand the data better and to prepare for the database design.

In [None]:
# Collect the columns whose names start with 'yearly'.
# NOTE(review): the printed label calls these "price-related", but the filter only
# checks the 'yearly' prefix — confirm the label matches the columns it returns.
price_cols = list(filter(lambda name: name.startswith('yearly'), lis_df.columns))
print("Price-related columns:", price_cols)

In [None]:
# Display the days_since_last_review column.
lis_df.loc[:, 'days_since_last_review']

In [None]:
# Review-related columns of the listings table, previewed on the first five rows.
review_cols = [
    'total_reviews',
    'number_of_reviews',
    'reviews_per_month',
    'number_of_reviews_l30d',
    'yearly_review',
    'number_of_reviews_ltm',
]
print("Review-related columns from listings:")
print(lis_df.loc[:, review_cols].head())