# Advanced Data Analysis

In [22]:
import pandas as pd
# Load the processed listings and reviews tables (paths are relative to the
# notebook's working directory) and report each frame's (rows, columns) shape.
listings_path = 'data/processed/03_listings.parquet'
reviews_path = 'data/processed/03_reviews.parquet'
lis_df = pd.read_parquet(listings_path)
rev_df = pd.read_parquet(reviews_path)
for label, frame in (("Listings shape:", lis_df), ("Reviews shape:", rev_df)):
    print(label, frame.shape)

Listings shape: (20905, 81)
Reviews shape: (366636, 5)


In [30]:
# Collect the host listing-count columns: every column whose name contains
# 'host_listings', plus 'host_total_listings_count', which lacks that substring
# and therefore has to be matched by its exact name.
host_listing_cols = [
    name
    for name in lis_df.columns
    if name == 'host_total_listings_count' or 'host_listings' in name
]
print("Columns containing 'host_listings' or 'host_total_listings_count':")
print(host_listing_cols)

Columns containing 'host_listings' or 'host_total_listings_count':
['host_listings_count', 'host_total_listings_count', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'host_listings_ratio']


In [31]:
# Preview the first five rows of the host listing-count columns side by side.
lis_df.loc[:, host_listing_cols].head(5)

Unnamed: 0,host_listings_count,host_total_listings_count,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,host_listings_ratio
0,1.0,1.0,1,1,0,0,0.000279
1,1.0,3.0,1,1,0,0,0.000803
2,1.0,1.0,1,1,0,0,0.000268
3,3.0,4.0,2,1,1,0,0.001115
4,1.0,1.0,1,1,0,0,0.000279


In [19]:
# Compare the three host listing-count columns row by row.
# NOTE(review): `==` and `>=` evaluate to False wherever a value is NaN, so a row
# with a missing count would be reported as "differs" / "not the largest".
# Confirm that the float-typed count columns contain no missing values.
listing_counts = lis_df[['host_listings_count', 'host_total_listings_count', 'calculated_host_listings_count']]

# Boolean mask: True where all three counts agree for the row.
all_equal = (listing_counts['host_listings_count'] == listing_counts['host_total_listings_count']) & \
            (listing_counts['host_total_listings_count'] == listing_counts['calculated_host_listings_count'])

# Series.sum() counts the True values in one vectorised pass; the builtin sum()
# would iterate over every element in Python.
print("Cases where all counts are equal:", all_equal.sum())
print("\nCases where counts differ:")
print(listing_counts[~all_equal].head())
print(f"\nTotal cases where counts differ: {(~all_equal).sum()}")

# Check whether host_total_listings_count is >= both other counts on every row.
is_largest = (listing_counts['host_total_listings_count'] >= listing_counts['host_listings_count']) & \
             (listing_counts['host_total_listings_count'] >= listing_counts['calculated_host_listings_count'])
total_is_always_largest = is_largest.all()  # evaluated once, reused below
print(f"\nIs host_total_listings_count always the largest? {total_is_always_largest}")

# Only show counter-examples when the claim does not hold.
if not total_is_always_largest:
    print("\nCases where host_total_listings_count is not the largest:")
    print(listing_counts[~is_largest].head())


Cases where all counts are equal: 13029

Cases where counts differ:
   host_listings_count  host_total_listings_count  \
1                  1.0                        3.0   
3                  3.0                        4.0   
5                  4.0                        8.0   
8                  1.0                        2.0   
9                  2.0                        5.0   

   calculated_host_listings_count  
1                               1  
3                               2  
5                               4  
8                               1  
9                               1  

Total cases where counts differ: 7876

Is host_total_listings_count always the largest? True


___

In [20]:
# Show the first two rows of the two review-count columns side by side.
# NOTE(review): the saved output under this cell lists many more than two rows,
# so it appears to predate the two-row slice; re-run the cell to refresh it.
lis_df.loc[:, ['total_reviews', 'number_of_reviews']].head(2)

Unnamed: 0,total_reviews,number_of_reviews
0,19,19
1,36,36
2,21,21
3,82,82
4,3,3
...,...,...
20900,0,0
20901,0,0
20902,0,0
20903,0,0


In [18]:
# Show the first two rows of the three host listing-count columns.
# NOTE(review): the saved output under this cell lists many more than two rows,
# so it appears to predate the two-row slice; re-run the cell to refresh it.
lis_df.loc[
    :,
    ['host_listings_count', 'host_total_listings_count', 'calculated_host_listings_count'],
].head(2)

Unnamed: 0,host_listings_count,host_total_listings_count,calculated_host_listings_count
0,1.0,1.0,1
1,1.0,3.0,1
2,1.0,1.0,1
3,3.0,4.0,2
4,1.0,1.0,1
...,...,...,...
20900,1.0,1.0,1
20901,1.0,1.0,1
20902,1.0,5.0,1
20903,1.0,4.0,1


In [None]:
# Sanity check: listings with an empty host_response_time should all have a
# host_response_rate_pct of 0.
response_time_is_empty = lis_df['host_response_time'] == ''
empty_response_time = lis_df[response_time_is_empty]
rates_when_empty = empty_response_time['host_response_rate_pct']
print("\nHost response rates when response time is empty:")
print(rates_when_empty.value_counts())
print(f"\nAll zeros? {(rates_when_empty == 0).all()}")

# Relabel the empty response time as 'never', but only on rows whose response
# rate is 0. This writes into lis_df in place.
rate_is_zero = lis_df['host_response_rate_pct'] == 0
lis_df.loc[response_time_is_empty & rate_is_zero, 'host_response_time'] = 'never'

# Display the distinct response-time labels after the remap.
lis_df['host_response_time'].unique()


Host response rates when response time is empty:
host_response_rate_pct
0.0    6466
Name: count, dtype: int64

All zeros? True


The following cells run various exploratory analyses to understand the data better and to prepare for the database design.

In [32]:
# Put host_experience_years next to host_since for a visual consistency check.
# NOTE(review): in the saved output the last rows show 0.0 years of experience
# for hosts whose host_since is years in the past, so host_experience_years does
# not look like a plain function of host_since; verify how it was derived.
lis_df.loc[:, ['host_experience_years', 'host_since']]

Unnamed: 0,host_experience_years,host_since
0,12.016427,2010-05-22
1,11.616701,2011-11-07
2,11.583847,2011-10-01
3,13.716632,2010-06-07
4,5.169062,2011-11-08
...,...,...
20900,0.000000,2022-06-28
20901,0.000000,2013-08-13
20902,0.000000,2012-05-08
20903,0.000000,2020-01-20


In [None]:
# Gather every listings column whose name begins with 'yearly'.
# NOTE(review): the variable name and printed label say "price", but the filter
# is the 'yearly' prefix, and another cell treats 'yearly_review' as a review
# column. The selection may include non-price columns; confirm the intended prefix.
price_cols = list(filter(lambda name: name.startswith('yearly'), lis_df.columns))
print("Price-related columns:", price_cols)

In [None]:
# Display the days_since_last_review column of the listings table.
lis_df.loc[:, 'days_since_last_review']

In [None]:
# Review-activity columns of the listings table, previewed on the first rows.
review_cols = [
    'total_reviews',
    'number_of_reviews',
    'reviews_per_month',
    'number_of_reviews_l30d',
    'yearly_review',
    'number_of_reviews_ltm',
]
print("Review-related columns from listings:")
print(lis_df.loc[:, review_cols].head())