# Advanced Data Analysis

In [2]:
import pandas as pd
lis_df = pd.read_parquet('data/processed/03_listings.parquet') 
rev_df = pd.read_parquet('data/processed/03_reviews.parquet')
print("Listings shape:", lis_df.shape)
print("Reviews shape:", rev_df.shape)

Listings shape: (20905, 43)
Reviews shape: (366636, 5)


In [12]:
rev_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,comments
0,31094,79346,2010-08-16,171607,"We had a great stay. Conveniently located, qui..."
1,31094,166275,2011-01-05,306860,It was a very good stay. The appartment was re...
2,31094,1452299,2012-06-10,1321058,Really enjoyed my time at Ebbe's place. It is...
3,31094,6766430,2013-08-24,2182771,"The apartment was very well located, 10-15 min..."
4,31094,6827217,2013-08-26,8025926,"This is a great flat, very clean with everythi..."


In [4]:
lis_df[['neighbourhood','host_neighbourhood', 'neighbourhood_cleansed','host_location']][:20]
# lis_df['neighbourhood'].unique()
# lis_df['host_neighbourhood'].unique()
# lis_df['neighbourhood_cleansed'].unique()
# 

Unnamed: 0,neighbourhood,host_neighbourhood,neighbourhood_cleansed,host_location
0,"Copenhagen, Capital Region of Denmark, Denmark",Vesterbro,Vesterbro-Kongens Enghave,"Copenhagen, Denmark"
1,"Copenhagen, Capital Region of Denmark, Denmark",Nørrebro,Nrrebro,"Copenhagen, Denmark"
2,Unknown,Nørrebro,Nrrebro,"Copenhagen, Denmark"
3,"Copenhagen, V, Denmark",Vesterbro,Vesterbro-Kongens Enghave,"Copenhagen, Denmark"
4,Unknown,Vesterbro,Vesterbro-Kongens Enghave,"Copenhagen, Denmark"
5,Unknown,Amagerbro,Amager st,"Copenhagen, Denmark"
6,Unknown,Østerbro,sterbro,"Copenhagen, Denmark"
7,"Frederiksberg, Capital Region of Denmark, Denmark",Vesterbro,Vesterbro-Kongens Enghave,"Copenhagen, Denmark"
8,"Copenhagen, Capital Region of Denmark, Denmark",Nørrebro,Nrrebro,"Copenhagen, Denmark"
9,"Copenhagen, Denmark",Indre By,Indre By,"Copenhagen, Denmark"


In [9]:
lis_df['host_neighbourhood'].unique()

<StringArray>
[           'Vesterbro',             'Nørrebro',            'Amagerbro',
             'Østerbro',             'Indre By',         'København NV',
       'Islands Brygge',                'Valby',        'Frederiksberg',
          'Amager Vest',       'Christianshavn',          'Sluseholmen',
           'Amager Øst',                     '',              'Vanløse',
             'Brønshøj',              'Nyboder',              'Niagara',
           'Teglholmen',              'Hackney',              'Kødbyen',
   '6th Arrondissement',            'Södermalm',               'Holmen',
          'Embajadores',          'Batignolles',           'Montmartre',
               'Dragør',            'Vesturbær',              'Kastrup',
                'Passy',              'Almagro',         'Clinton Hill',
        'Jakkur Layout', 'Jardim das Bandeiras',         'Williamsburg',
         'Notting Hill',                'Belém',           'Campo Belo',
          'El Madroñal',        'Alph

In [11]:
non_cph_neighborhoods = ['', 'Nyboder', 'Niagara','6th Arrondissement', 'Södermalm','Embajadores', 'Batignolles', 'Montmartre', 'Vesturbær', 'Passy', 'Almagro','Clinton Hill', 'Jakkur Layout', 'Jardim das Bandeiras', 'Williamsburg', 'Notting Hill', 'Belém', 'Campo Belo', 'El Madroñal', 'Alphabet City', 'Buzovna', 'Bastille']

# Count listings for each non-Copenhagen neighborhood
non_cph_counts = lis_df[lis_df['host_neighbourhood'].isin(non_cph_neighborhoods)]['host_neighbourhood'].value_counts()

print("Number of listings in each non-Copenhagen neighborhood:")
print(non_cph_counts)

print("\nTotal number of listings with non-Copenhagen neighborhoods:", 
      len(lis_df[lis_df['host_neighbourhood'].isin(non_cph_neighborhoods)]))

Number of listings in each non-Copenhagen neighborhood:
host_neighbourhood
                        15489
Nyboder                     9
Jakkur Layout               8
El Madroñal                 6
Vesturbær                   2
Jardim das Bandeiras        2
Passy                       1
Niagara                     1
6th Arrondissement          1
Södermalm                   1
Embajadores                 1
Batignolles                 1
Montmartre                  1
Williamsburg                1
Almagro                     1
Clinton Hill                1
Notting Hill                1
Belém                       1
Campo Belo                  1
Alphabet City               1
Buzovna                     1
Bastille                    1
Name: count, dtype: Int64

Total number of listings with non-Copenhagen neighborhoods: 15532


In [8]:
# Check missing values in host_neighbourhood
print("Number of missing values in host_neighbourhood:", lis_df['host_neighbourhood'].isna().sum())
print("\nPercentage of missing values:", (lis_df['host_neighbourhood'].isna().sum() / len(lis_df) * 100).round(2), "%")

# Compare with neighbourhood_cleansed values where host_neighbourhood is missing
print("\nSample of rows with missing host_neighbourhood:")
print(lis_df[lis_df['host_neighbourhood'].isna()][['neighbourhood_cleansed', 'host_neighbourhood', 'neighbourhood']][:5])


Number of missing values in host_neighbourhood: 0

Percentage of missing values: 0.0 %

Sample of rows with missing host_neighbourhood:
Empty DataFrame
Columns: [neighbourhood_cleansed, host_neighbourhood, neighbourhood]
Index: []


In [None]:
lis_df['neighbourhood_cleansed'].unique()

In [None]:
lis_df.info()

In [None]:
lis_df['calendar_last_scraped'][:2]

In [None]:
lis_df.drop(['last_scraped', 'source', 'host_response_rate_pct', ], axis=1, inplace=True)

___

In [None]:
# Find all columns containing 'host_listings' or matching 'host_total_listings_count'
host_listing_cols = [col for col in lis_df.columns if 'host_listings' in col or col == 'host_total_listings_count']
print("Columns containing 'host_listings' or 'host_total_listings_count':")
print(host_listing_cols)

In [None]:
lis_df[host_listing_cols][:5]

In [None]:
# Compare the three different host listing count columns
listing_counts = lis_df[['host_listings_count', 'host_total_listings_count', 'calculated_host_listings_count']]

# Find cases where they are all equal
all_equal = (listing_counts['host_listings_count'] == listing_counts['host_total_listings_count']) & \
            (listing_counts['host_total_listings_count'] == listing_counts['calculated_host_listings_count'])

print("Cases where all counts are equal:", sum(all_equal))
print("\nCases where counts differ:")
print(listing_counts[~all_equal].head())
print(f"\nTotal cases where counts differ: {sum(~all_equal)}")

# Check if host_total_listings_count is always the largest
is_largest = (listing_counts['host_total_listings_count'] >= listing_counts['host_listings_count']) & \
             (listing_counts['host_total_listings_count'] >= listing_counts['calculated_host_listings_count'])
print(f"\nIs host_total_listings_count always the largest? {is_largest.all()}")

if not is_largest.all():
    print("\nCases where host_total_listings_count is not the largest:")
    print(listing_counts[~is_largest].head())


___

In [None]:
# Verify host_response_rate_pct is always 0 when host_response_time is empty
empty_response_time = lis_df[lis_df['host_response_time'] == '']
print("\nHost response rates when response time is empty:")
print(empty_response_time['host_response_rate_pct'].value_counts())
print(f"\nAll zeros? {(empty_response_time['host_response_rate_pct'] == 0).all()}")

# Remap empty strings to 'never' where host_response_rate_pct is 0
lis_df.loc[(lis_df['host_response_time'] == '') & (lis_df['host_response_rate_pct'] == 0), 'host_response_time'] = 'never'

lis_df['host_response_time'].unique()

Various types of analyses to understand the data better and to prepare for the database design.

In [None]:
price_cols = [col for col in lis_df.columns if col.startswith('yearly')]
print("Price-related columns:", price_cols)

In [None]:
lis_df['days_since_last_review']

In [None]:
review_cols = ['total_reviews', 'number_of_reviews', 'reviews_per_month', 'number_of_reviews_l30d', 'yearly_review', 'number_of_reviews_ltm']
print("Review-related columns from listings:")
print(lis_df[review_cols].head())