# Advanced Data Analysis

In [None]:
import pandas as pd
# Load the processed listings table and report its dimensions.
lis_df = pd.read_parquet('data/processed/05_listings.parquet')
print("Listings shape:", lis_df.shape)

# Load the reviews table stored in the BERT sentiment output and report its dimensions.
rev_df = pd.read_parquet('data/processed/05_sentiment_bert.parquet')
print("Reviews shape:", rev_df.shape)

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
lis_df.info()
# First three rows, rendered as the cell output.
lis_df.head(3)

In [None]:

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
rev_df.info()
# First three rows, rendered as the cell output.
rev_df.head(3)

---

In [130]:
# Collapse the reviews to one row per listing_id before joining, so every
# listing matches at most one aggregate row.
reviews_agg = (
    rev_df
    .groupby('listing_id')
    .agg(
        # Span between the earliest and latest review date, in years
        active_period_years=('date', lambda dates: (dates.max() - dates.min()).days / 365.25),
        # Mean of sentiment_score_1_5
        avg_sentiment=('sentiment_score_1_5', 'mean'),
        # Number of distinct reviewer_id values
        n_bookings=('reviewer_id', 'nunique'),
    )
    .reset_index()
)

# Left-join the aggregates onto the listings; listings without any review
# keep NaN in the aggregated columns.
lis_1t1 = lis_df.merge(reviews_agg, how='left', left_on='id', right_on='listing_id')

### Inspect the merged listings

In [None]:
# Review rating next to the aggregated sentiment score, per listing id
lis_1t1.loc[:, ['id', 'review_scores_rating', 'avg_sentiment']]

In [None]:
# lis_1t1.drop(columns=[ 'host_identity_verified', 'instant_bookable'], inplace=True)
# lis_1t1.info()

In [None]:
# 'host_has_profile_pic', 'description'

In [None]:
#'host_acceptance_rate_pct',

In [None]:
# Remove the text/descriptive columns and the 'listing_id' key that the merge
# added next to 'id'.
lis_1t1.drop(
    columns=[
        'name',
        'description',
        'host_response_rate_pct',
        'neighbourhood_cleansed',
        'has_availability',
        'listing_id',
        'host_since',
    ],
    inplace=True,
)

# Remove the price comparison columns and the host listings ratio.
lis_1t1.drop(
    columns=[
        'price_vs_room_type',
        'price_per_person',
        'price_per_bedroom',
        'neighborhood_avg_price',
        'price_vs_neighborhood',
        'room_type_avg_price',
        'host_listings_ratio',
    ],
    inplace=True,
)

# Summarise what is left.
lis_1t1.info()

In [None]:
# Look at the host_identity_verified column of the listings table
lis_df.loc[:, 'host_identity_verified']

In [None]:
# Attach the individual reviews to their listing (one row per listing/review pair).
# A GroupBy object has no reset_index(), so the needed columns are selected
# straight from rev_df; 'listing_id' is included because it is the merge key.
reviews = rev_df[['listing_id', 'date', 'sentiment_score_1_5', 'comments', 'reviewer_id']]

# The one-to-many result gets its own name so that lis_df is left untouched for
# the cells that follow, and re-running this cell does not keep stacking
# suffixed review columns onto it.
lis_reviews = lis_df.merge(
    reviews,
    left_on='id',
    right_on='listing_id',
    how='left'
)

In [None]:
# Review comments of a single listing, selected in one .loc step
rev_df.loc[rev_df['listing_id'] == 1188302911099164911, 'comments']

In [None]:
# How many listings ended up without an aggregated sentiment score
total_rows = len(lis_1t1)
missing_sentiment = lis_1t1['avg_sentiment'].isnull().sum()
print(f"Missing sentiment scores: {missing_sentiment} out of {total_rows} rows ({missing_sentiment/total_rows:.1%})")

In [None]:
# How many listings have no review_scores_rating
total_rows = len(lis_1t1)
missing_ratings = lis_1t1['review_scores_rating'].isnull().sum()
print(f"Missing review scores: {missing_ratings} out of {total_rows} rows ({missing_ratings/total_rows:.1%})")


In [None]:
# Column names that mention "review", matched case-insensitively
review_columns = lis_df.columns[
    lis_df.columns.str.lower().str.contains('review', regex=False)
].tolist()
print("Columns containing 'review':")
for col in review_columns:
    print(f"- {col}")


In [None]:
#TODO in SQL
# Response Quality: encode ordinal rating of each possible response time category
# lis_df['host_response_time']

# How many listings a host owns (listings_count)

In [None]:
def analyze_missing_values(df, df_name):
    """Print a per-column report of missing values in ``df``.

    Only columns with at least one missing value are listed, ordered from the
    highest to the lowest missing percentage. The report is written to stdout;
    nothing is returned.

    Args:
        df: DataFrame to inspect.
        df_name: Label used in the report header.
    """
    n_rows = len(df)

    # Null count per column, and the same figure as a percentage of all rows
    null_counts = df.isnull().sum()
    null_share = ((null_counts / n_rows) * 100).round(2)

    summary = pd.DataFrame({
        'Missing Values': null_counts,
        'Missing Percentage': null_share,
    })

    # Keep only the affected columns, worst first
    affected = summary['Missing Values'] > 0
    summary = summary[affected].sort_values('Missing Percentage', ascending=False)

    print(f"\nMissing Values Analysis for {df_name}:")
    print("-" * 50)
    print(summary if len(summary) > 0 else "No missing values found!")
    print(f"\nTotal rows in dataset: {n_rows}")

# Report missing values for the listings table and for the reviews table
analyze_missing_values(df=lis_df, df_name="Listings")
analyze_missing_values(df=rev_df, df_name="Reviews")

In [None]:
# Check missing values in the listings' `beds` column.
# The labels below name the listings table, since that is the frame inspected here.
beds_missing = lis_df['beds'].isna()
print("\nMissing values in listings.beds:")
print(f"Number of missing beds: {beds_missing.sum()}")
print(f"Percentage missing: {(beds_missing.sum() / len(lis_df) * 100):.2f}%")

# Show a sample of listings with missing beds
print("\nSample of listings with missing beds:")
display(lis_df[beds_missing].head())
# Display beds and bathrooms for rows where beds are missing
print("\nBeds and bathrooms for listings with missing beds:")
display(lis_df.loc[beds_missing, ['beds', 'bathrooms', 'bathrooms_text']])



In [None]:
# All listing columns whose name contains 'reviews_'
lis_df.loc[:, lis_df.columns.str.contains('reviews_')]

In [None]:
# Turn `bathrooms_text` into a numeric bathroom count and use it to backfill
# missing `bathrooms` values.
#
# The text has to be parsed BEFORE any float conversion: casting labels such as
# 'Half-bath' with astype(float) raises ValueError.

# Half-bath labels carry no leading number, so flag them before parsing.
half_bath_mask = lis_df['bathrooms_text'].isin(['Half-bath', 'Shared half-bath', 'Private half-bath'])

# Extract the leading number from the text. The cast to str keeps the .str
# accessor usable when the column is already numeric (e.g. on a re-run);
# entries without a leading number (half-bath labels, missing values) become NaN.
bath_count = (
    lis_df['bathrooms_text']
    .astype(str)
    .str.extract(r'^(\d+\.?\d?)', expand=False)
    .astype(float)
)

# Half baths count as 0.5; whatever is still NaN is filled with 0.0.
lis_df['bathrooms_text'] = bath_count.mask(half_bath_mask, 0.5).fillna(0.0)

# Fill NaN bathrooms values with bathrooms_text values
missing_bathrooms = lis_df['bathrooms'].isna()
lis_df.loc[missing_bathrooms, 'bathrooms'] = lis_df.loc[missing_bathrooms, 'bathrooms_text']
print(len(lis_df['bathrooms_text'].unique()), lis_df['bathrooms_text'].unique())

In [None]:
# Inspect the bathrooms_text values and the dtypes of both bathroom columns.
# Only a cell's last expression is rendered automatically, so these
# intermediate results are shown explicitly with display().
display(lis_df['bathrooms_text'].unique())
display(lis_df[['bathrooms', 'bathrooms_text']].dtypes)

# Rows where bathrooms and bathrooms_text disagree, with their total count.
# NaN never compares equal, so rows missing a value in either column are counted too.
diff_bath = lis_df.loc[lis_df['bathrooms'] != lis_df['bathrooms_text'], ['bathrooms', 'bathrooms_text']]
print(f"Total rows with different values: {len(diff_bath)}")
print("\nSample of differences:")
display(diff_bath[:5])

In [None]:
# Where `bathrooms` is missing, take the value from `bathrooms_text`
missing_bathrooms = lis_df['bathrooms'].isna()
lis_df.loc[missing_bathrooms, 'bathrooms'] = lis_df.loc[missing_bathrooms, 'bathrooms_text']

___

In [2]:
import pandas as pd

In [None]:
# Load the 03_reviews output and the 04_sentiment_bert output.
# NOTE(review): this rebinds `rev_df`, which the first cell loads from
# '05_sentiment_bert.parquet'; cells that expect that frame will see this one
# if they are re-run afterwards. Confirm the reuse of the name is intended.
rev_df = pd.read_parquet('data/processed/03_reviews.parquet')
rev_s = pd.read_parquet('data/processed/04_sentiment_bert.parquet')

In [None]:
# Two rows with the smallest ids
rev_df.sort_values('id').head(2)

In [None]:
# Two rows with the smallest ids
rev_s.sort_values('id').head(2)

In [None]:

# Number of distinct sentiment_score_1_5 values
rev_s['sentiment_score_1_5'].unique().size

___

In [None]:
# Collect the host listing-count columns: every name containing
# 'host_listings', plus the exact name 'host_total_listings_count'.
host_listing_cols = [
    name
    for name in lis_df.columns
    if name == 'host_total_listings_count' or 'host_listings' in name
]
print("Columns containing 'host_listings' or 'host_total_listings_count':")
print(host_listing_cols)

In [None]:
# First five rows of the host listing-count columns
lis_df[host_listing_cols].head(5)

In [None]:
# Put the three host listing-count columns side by side
listing_counts = lis_df[['host_listings_count', 'host_total_listings_count', 'calculated_host_listings_count']]

# Rows where all three counts agree
all_equal = (
    listing_counts['host_listings_count'].eq(listing_counts['host_total_listings_count'])
    & listing_counts['host_total_listings_count'].eq(listing_counts['calculated_host_listings_count'])
)

print("Cases where all counts are equal:", all_equal.sum())
print("\nCases where counts differ:")
print(listing_counts[~all_equal].head())
print(f"\nTotal cases where counts differ: {(~all_equal).sum()}")

# Rows where host_total_listings_count is at least as large as the other two
is_largest = (
    listing_counts['host_total_listings_count'].ge(listing_counts['host_listings_count'])
    & listing_counts['host_total_listings_count'].ge(listing_counts['calculated_host_listings_count'])
)
print(f"\nIs host_total_listings_count always the largest? {is_largest.all()}")

# Show the counter-examples, if there are any
if not is_largest.all():
    print("\nCases where host_total_listings_count is not the largest:")
    print(listing_counts[~is_largest].head())


___

___

This section contains various analyses to understand the data better and to prepare for the database design.

In [None]:
# Columns whose name starts with 'yearly'
# NOTE(review): the printed label says "Price-related" while the filter is the
# 'yearly' prefix; confirm the two are meant to coincide.
price_cols = lis_df.columns[lis_df.columns.str.startswith('yearly')].tolist()
print("Price-related columns:", price_cols)

In [None]:
# Look at the days_since_last_review column
lis_df.loc[:, 'days_since_last_review']

In [None]:
# Review-count columns taken from the listings table
review_cols = [
    'total_reviews',
    'number_of_reviews',
    'reviews_per_month',
    'number_of_reviews_l30d',
    'yearly_review',
    'number_of_reviews_ltm',
]
print("Review-related columns from listings:")
print(lis_df.loc[:, review_cols].head())