# Advanced Data Analysis

In [None]:
import pandas as pd
# Read the stage-05 listings parquet file into `lis_df` and report its (rows, columns) shape.
lis_df = pd.read_parquet('data/processed/05_listings.parquet') 
# rev_df = pd.read_parquet('data/processed/05_sentiment_bert.parquet')
print("Listings shape:", lis_df.shape)
# print("Reviews shape:", rev_df.shape)

In [4]:
import pandas as pd
# Reload the stage-05 listings and cast identifier / label columns to 'category'.
lis_df = pd.read_parquet('data/processed/05_listings.parquet').astype(
    dict.fromkeys(
        ['id', 'host_id', 'property_type', 'room_type', 'host_neighbourhood'],
        'category',
    )
)

# Response time becomes an ordered categorical (order given by the list below);
# the acceptance-rate column is removed before the frame is written out.
lis_df = lis_df.assign(
    host_response_time=pd.Categorical(
        lis_df['host_response_time'],
        categories=['within an hour', 'within a few hours', 'within a day', 'a few days or more', 'never'],
        ordered=True,
    )
).drop(columns=['host_acceptance_rate_pct'])

# Persist the result as the stage-06 listings file.
lis_df.to_parquet('data/processed/06_listings.parquet')

In [7]:
# Write the current `lis_df` to the stage-06 listings parquet file.
lis_df.to_parquet('data/processed/06_listings.parquet')

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line.
lis_df.info()
# Only the last expression of a cell is rendered automatically; show the
# first three rows explicitly so they are not silently discarded.
display(lis_df[:3])
lis_df.host_response_time

In [None]:
# Cast the identifier and label columns to the pandas 'category' dtype
# (note: category, not string) to avoid showing numeric stats for them.
lis_df['id'] = lis_df['id'].astype('category')
lis_df['host_id'] = lis_df['host_id'].astype('category')
lis_df['property_type'] = lis_df['property_type'].astype('category')
lis_df['room_type'] = lis_df['room_type'].astype('category')
lis_df['host_neighbourhood'] = lis_df['host_neighbourhood'].astype('category')

# Ordered categorical: the `categories` list fixes the order of the levels.
lis_df['host_response_time'] = pd.Categorical(
    lis_df['host_response_time'],
    categories=['within an hour', 'within a few hours', 'within a day', 'a few days or more', 'never'],
    ordered=True
)

def create_summary_table(df):
    """Create a summary table with statistics for each column.

    Args:
        df: DataFrame to summarise.

    Returns:
        DataFrame with one row per column of ``df`` and the columns
        'Name', 'No. of Unique', 'No. of Null', 'Min. Value', 'Max. Value'
        and 'Mean'. A statistic that cannot be computed for a column is
        reported as '-'. An empty ``df`` yields an empty table with the same
        columns.
    """
    summary_columns = ['Name', 'No. of Unique', 'No. of Null', 'Min. Value', 'Max. Value', 'Mean']
    stats = []

    for col in df.columns:
        series = df[col]

        # Min / max are best-effort: some dtypes do not support them.
        # `Exception` (not a bare except) keeps KeyboardInterrupt/SystemExit alive.
        try:
            min_val = series.min()
            max_val = series.max()
        except Exception:
            min_val = '-'
            max_val = '-'

        # Mean is likewise best-effort (e.g. text and categorical columns).
        try:
            mean_val = series.mean()
        except Exception:
            mean_val = '-'

        stats.append({
            'Name': col,
            'No. of Unique': series.nunique(),
            'No. of Null': series.isnull().sum(),
            'Min. Value': min_val,
            'Max. Value': max_val,
            'Mean': mean_val
        })

    # Passing the column list keeps the schema stable even when `stats` is empty.
    summary_df = pd.DataFrame(stats, columns=summary_columns)

    # Make 'Mean' numeric when every entry is a number; if it contains '-'
    # placeholders the conversion fails and the column is left untouched
    # (what the deprecated errors='ignore' option used to do).
    try:
        summary_df['Mean'] = pd.to_numeric(summary_df['Mean'])
    except (TypeError, ValueError):
        pass

    # Round whichever summary columns ended up purely numeric.
    numeric_cols = summary_df.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_cols) > 0:
        summary_df[numeric_cols] = summary_df[numeric_cols].round(6)

    return summary_df

# Create and display summary table
summary_table = create_summary_table(lis_df)
# NOTE(review): the caption says "Excluding Description", but the whole of
# lis_df is passed to create_summary_table above — confirm what is excluded.
print("Table 1: Summary of Dataset Columns (Excluding Description)")

display(summary_table)


In [None]:
# Human-readable description for each listings column, keyed by column name.
# The mapping is written by hand; nothing in this cell checks it against lis_df.
column_descriptions = {
    'id': 'Unique identifier for each listing',
    'description': 'Free-text description provided by the host',
    'host_id': 'Unique identifier for each host',
    'host_response_time': 'How quickly the host typically responds to guests',
    'host_neighbourhood': 'Area where the hosting property is located',
    'host_total_listings_count': 'Total number of properties listed by the host',
    'host_has_profile_pic': 'Boolean (T/F); do host have a profile picture',
    'host_identity_verified': 'Boolean (T/F) indicating if host\'s identity is verified',
    'latitude': 'Geographic latitude of the property',
    'longitude': 'Geographic longitude of the property',
    'property_type': 'Type of property (e.g., Apartment, House)',
    'room_type': 'Type of room arrangement offered',
    'accommodates': 'Maximum guests that can be accommodated',
    'bathrooms': 'Number of bathrooms available',
    'bedrooms': 'Number of bedrooms available',
    'beds': 'Total number of beds available',
    'price_DKK': 'Nightly price in Danish Krone',
    'number_of_reviews': 'Total number of reviews received',
    'review_scores_rating': 'Overall rating from reviews (1-5)',
    'review_scores_accuracy': 'Rating for listing accuracy (1-5)',
    'review_scores_cleanliness': 'Rating for cleanliness (1-5)',
    'review_scores_checkin': 'Rating for check-in experience (1-5)',
    'review_scores_communication': 'Rating for host communication (1-5)',
    'review_scores_location': 'Rating for location (1-5)',
    'review_scores_value': 'Rating for value for money (1-5)',
    'instant_bookable': 'Boolean (T/F); is instant booking available',
    'reviews_per_month': 'Average number of reviews received per month',
    'superhost': 'Boolean (T/F); is the host a Superhost',
    'host_verifications_count': 'Number of verifications completed by the host',
    'listings_in_neighborhood': 'Total number of listings in the same neighborhood',
    'host_experience_years': 'Number of years the host has been active',
    'yearly_review': 'Average reviews a listing recieves per year',
    'active_period_years': 'Number of years the listing has been active',
    'avg_sentiment': 'Average sentiment score from review analysis',
    'n_bookings': 'Total number of bookings received',
    'amenity_category': 'Ordinally level of amenities (low/medium/high)',
    'location_category': 'Distance to center ordinal (4 categories)'
}

# Turn the mapping into a two-column table (Name, Description), number the
# rows from 0 under the index label '#', and render it with left-aligned text.
desc_df = pd.DataFrame(list(column_descriptions.items()), columns=['Name', 'Description'])
desc_df.index = range(len(desc_df))
desc_df.index.name = '#'
display(desc_df.style.set_properties(**{'text-align': 'left'}, subset=['Name', 'Description']))

In [None]:
import pandas as pd
# Hard-coded summary statistics shown as a small two-column table.
# NOTE(review): the values are literals, not computed in this notebook —
# presumably pasted from a describe() of the review sentiment scores; confirm the source.
sentiment_data = {
    'Metric': ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'],
    'Value': [366636, 0.581580, 0.257366, -0.972317, 0.351762, 0.639770, 0.786318, 0.995148]
}


# `df` is a generic name that a later cell rebinds to another frame.
df = pd.DataFrame(sentiment_data)
print("  Sentiment Distribution ")

display(df)

In [None]:
from rich.console import Console
from rich.table import Table
import pandas as pd

def _format_numeric_stat(series, stat_name):
    """Return ``series.<stat_name>()`` with six decimals, or '-' if not applicable.

    '-' is returned when the column is not numeric or the statistic is missing
    (NaN/NA). The reduction is evaluated exactly once.
    """
    if not pd.api.types.is_numeric_dtype(series):
        return '-'
    value = getattr(series, stat_name)()
    return '-' if pd.isna(value) else f"{value:.6f}"


def create_pretty_stats(df):
    """Print per-column summary statistics of ``df`` as a rich console table.

    One table row is emitted per column of ``df``: its position, name, number
    of unique values, number of nulls, and min / max / mean (numeric columns
    only; otherwise '-').

    Args:
        df: DataFrame to summarise.

    Returns:
        None. The table is printed through a ``rich`` Console.
    """
    console = Console()
    table = Table(show_header=True, header_style="bold")

    # Add columns
    table.add_column("#", style="dim")
    table.add_column("Name")
    for header in ("No. of Unique", "No. of Null", "Min. Value", "Max. Value", "Mean"):
        table.add_column(header, justify="right")

    for i, col in enumerate(df.columns):
        series = df[col]
        table.add_row(
            str(i),
            col,
            str(series.nunique()),
            str(series.isnull().sum()),
            _format_numeric_stat(series, 'min'),
            _format_numeric_stat(series, 'max'),
            _format_numeric_stat(series, 'mean'),
        )

    console.print(table)
# # Create DataFrame
df = pd.DataFrame(lis_df)
# The function defined in this cell is create_pretty_stats; the previous
# call to create_summary_stats referenced a name that is never defined.
create_pretty_stats(df)
# # Print table
# print(tabulate(df, headers='keys', tablefmt='simple', showindex=True))

In [7]:
# errors='ignore' makes the drop safe to re-run and tolerant of columns that
# are already gone (host_acceptance_rate_pct is dropped when the stage-06
# file is built earlier in the notebook).
lis_df = lis_df.drop(
    ['active_period_years', 'host_acceptance_rate_pct', 'listing_url'],
    axis=1,
    errors='ignore',
)

In [None]:
# Display the location_category column of the listings frame.
lis_df['location_category']

In [None]:

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line.
rev_df.info()
rev_df[:3]

---

In [4]:
# Collapse the reviews to one row per listing before joining.
reviews_agg = (
    rev_df.groupby('listing_id')
    .agg(
        # span between the earliest and latest review date, in years
        active_period_years=('date', lambda dates: (dates.max() - dates.min()).days / 365.25),
        # mean of the per-review sentiment score
        avg_sentiment=('sentiment_score_1_5', 'mean'),
        # number of distinct reviewers
        n_bookings=('reviewer_id', 'nunique'),
    )
    .reset_index()
)

# Left-join onto the listings: every listing is kept, and listings without
# reviews get NaN in the aggregated columns.
lis_1t1 = lis_df.merge(reviews_agg, left_on='id', right_on='listing_id', how='left')

### smth

In [None]:
# Show the listing id next to its review rating and aggregated sentiment.
lis_1t1[['id','review_scores_rating','avg_sentiment']]

In [None]:
# lis_1t1.drop(columns=[ 'host_identity_verified', 'instant_bookable'], inplace=True)
# lis_1t1.info()

In [None]:
# 'host_has_profile_pic', 'description'

In [None]:
#'host_acceptance_rate_pct',

In [None]:
# Remove text, helper and join-key columns from the merged frame, in place.
# Because the drops mutate lis_1t1 directly, running this cell a second time
# raises KeyError (the columns no longer exist).
lis_1t1.drop(columns=['name','description','host_response_rate_pct','neighbourhood_cleansed',  'has_availability', 'listing_id', 'host_since'], inplace=True) 

# Second in-place drop: price-derived and ratio columns.
lis_1t1.drop(columns=['price_vs_room_type', 'price_per_person', 'price_per_bedroom', 'neighborhood_avg_price','price_vs_neighborhood','room_type_avg_price', 'host_listings_ratio' ], inplace=True)
lis_1t1.info()

In [None]:
# Display the host_identity_verified column of the listings frame.
lis_df['host_identity_verified']

In [None]:
# Select the review columns to attach to each listing. A GroupBy object has
# no reset_index() and nothing is aggregated here, so the columns (including
# the join key) are taken directly from the reviews frame.
reviews = rev_df[['listing_id', 'date', 'sentiment_score_1_5', 'comments', 'reviewer_id']]

# Merge with listings
# NOTE(review): this rebinds lis_df to a frame with one row per
# (listing, review) pair; re-running the cell merges the reviews in again.
lis_df = lis_df.merge(
    reviews,
    left_on='id',
    right_on='listing_id',
    how='left'
)

In [None]:
# Show the review comments of a single listing, selected by a hard-coded listing_id.
rev_df[rev_df['listing_id'] == 1188302911099164911]['comments']

In [None]:
# Count missing avg_sentiment values and report them as a count and as a
# share of all rows of the merged frame.
missing_sentiment = lis_1t1['avg_sentiment'].isna().sum()
total_rows = len(lis_1t1)
print(f"Missing sentiment scores: {missing_sentiment} out of {total_rows} rows ({missing_sentiment/total_rows:.1%})")

In [None]:
# Count missing review_scores_rating values and report them as a count and
# as a share of all rows of the merged frame.
missing_ratings = lis_1t1['review_scores_rating'].isna().sum()
total_rows = len(lis_1t1)
print(f"Missing review scores: {missing_ratings} out of {total_rows} rows ({missing_ratings/total_rows:.1%})")


In [None]:
# Collect every listings column whose name contains 'review' (case-insensitive)
# and print the names as a bullet list.
review_columns = [col for col in lis_df.columns if 'review' in col.lower()]
print("Columns containing 'review':")
for col in review_columns:
    print(f"- {col}")


In [None]:
#TODO in SQL
# Response Quality: encode ordinal rating of each possible response time category
# lis_df['host_response_time']

# How many listings a host owns (listings_count)

In [None]:
def analyze_missing_values(df, df_name):
    """Print a per-column report of the missing values in ``df``.

    Columns without missing values are omitted; the remaining ones are listed
    with their null count and null percentage (two decimals), highest
    percentage first. The total row count is printed last.

    Args:
        df: DataFrame to inspect.
        df_name: Label used in the report heading.

    Returns:
        None. The report is written to stdout.
    """
    null_counts = df.isnull().sum()
    report = pd.DataFrame({
        'Missing Values': null_counts,
        'Missing Percentage': ((null_counts / len(df)) * 100).round(2)
    })

    # Keep only the affected columns, worst first.
    affected = report['Missing Values'] > 0
    report = report.loc[affected].sort_values('Missing Percentage', ascending=False)

    print(f"\nMissing Values Analysis for {df_name}:")
    print("-" * 50)
    print("No missing values found!" if report.empty else report)
    print(f"\nTotal rows in dataset: {len(df)}")

# Analyze both datasets: print the missing-value report for the listings
# frame and for the reviews frame.
analyze_missing_values(lis_df, "Listings")
analyze_missing_values(rev_df, "Reviews")

In [None]:
# Check missing values in reviews.comments
print("\nMissing values in reviews.beds:")
print(f"Number of missing beds: {lis_df['beds'].isna().sum()}")
print(f"Percentage missing: {(lis_df['beds'].isna().sum() / len(lis_df) * 100):.2f}%")

# Show sample of reviews with missing comments
print("\nSample of reviews with missing beds:")
display(lis_df[lis_df['beds'].isna()].head())
# Display beds and bathrooms for rows where beds are missing
print("\nBeds and bathrooms for listings with missing beds:")
display(lis_df[lis_df['beds'].isna()][['beds', 'bathrooms', 'bathrooms_text']])



In [None]:
# Display every listings column whose name contains 'reviews_'.
lis_df[lis_df.columns[lis_df.columns.str.contains('reviews_')]]

In [None]:
# Turn the free-text `bathrooms_text` column into a numeric bathroom count.
#
# The text cannot be cast to float directly (labels such as 'Half-bath' raise
# ValueError), so the leading number is extracted first and the special
# cases are applied afterwards:
#   * half-bath labels carry no leading number -> 0.5
#   * missing text                              -> 0.0
#   * any other text without a leading number stays NaN
bath_text = lis_df['bathrooms_text']
half_bath_mask = bath_text.isin(['Half-bath', 'Shared half-bath', 'Private half-bath'])
missing_text_mask = bath_text.isna()

# Extract numeric values from bathrooms_text
# Convert to string first before using str accessor
bath_count = (
    bath_text.astype(str)
    .str.extract(r'^(\d+\.?\d?)', expand=False)
    .astype(float)
)
bath_count = bath_count.mask(half_bath_mask, 0.5).mask(missing_text_mask, 0.0)
lis_df['bathrooms_text'] = bath_count

# Fill NaN bathrooms values with bathrooms_text values
lis_df.loc[lis_df['bathrooms'].isna(), 'bathrooms'] = lis_df.loc[lis_df['bathrooms'].isna(), 'bathrooms_text']
print(len(lis_df['bathrooms_text'].unique()), lis_df['bathrooms_text'].unique())

In [None]:
# Only the last expression of a cell is rendered automatically, so the
# intermediate checks are displayed explicitly.
display(lis_df['bathrooms_text'].unique())
display(lis_df[['bathrooms', 'bathrooms_text']].dtypes)

# Show sample where bathrooms and bathrooms_text values differ and get total count.
# NaN != NaN evaluates to True, so rows where both values are missing are
# excluded explicitly instead of being counted as differences.
both_missing = lis_df['bathrooms'].isna() & lis_df['bathrooms_text'].isna()
values_differ = (lis_df['bathrooms'] != lis_df['bathrooms_text']) & ~both_missing
diff_bath = lis_df.loc[values_differ, ['bathrooms', 'bathrooms_text']]
print(f"Total rows with different values: {len(diff_bath)}")
print("\nSample of differences:")
display(diff_bath[:5])

In [None]:
# Where `bathrooms` is missing, copy the value held in `bathrooms_text`.
bathrooms_missing = lis_df['bathrooms'].isna()
lis_df.loc[bathrooms_missing, 'bathrooms'] = lis_df.loc[bathrooms_missing, 'bathrooms_text']

___

In [2]:
import pandas as pd

In [None]:
# Load the stage-03 reviews file and the stage-04 sentiment file.
# NOTE(review): cells earlier in the notebook already use rev_df, so on a
# fresh kernel this load has to run before them.
rev_df = pd.read_parquet('data/processed/03_reviews.parquet')
rev_s = pd.read_parquet('data/processed/04_sentiment_bert.parquet')

In [None]:
# First two rows of the reviews frame when ordered by `id`.
rev_df.sort_values('id')[:2]

In [None]:
# First two rows of the sentiment frame when ordered by `id`.
rev_s.sort_values('id')[:2]

In [None]:

# Number of distinct values in sentiment_score_1_5 (unique() counts NaN as a value).
len(rev_s['sentiment_score_1_5'].unique())

___

In [None]:
# Find all columns whose name contains 'host_listings' or is exactly
# 'host_total_listings_count', then print the resulting list.
host_listing_cols = [col for col in lis_df.columns if 'host_listings' in col or col == 'host_total_listings_count']
print("Columns containing 'host_listings' or 'host_total_listings_count':")
print(host_listing_cols)

In [None]:
# First five rows of the host listing-count columns collected in host_listing_cols.
lis_df[host_listing_cols][:5]

In [None]:
# Side-by-side view of the three host listing-count columns.
listing_counts = lis_df[['host_listings_count', 'host_total_listings_count', 'calculated_host_listings_count']]
total_count = listing_counts['host_total_listings_count']

# Rows where all three counts agree.
all_equal = (
    listing_counts['host_listings_count'].eq(total_count)
    & total_count.eq(listing_counts['calculated_host_listings_count'])
)

print("Cases where all counts are equal:", all_equal.sum())
print("\nCases where counts differ:")
print(listing_counts[~all_equal].head())
print(f"\nTotal cases where counts differ: {sum(~all_equal)}")

# Rows where host_total_listings_count is at least as large as both other counts.
is_largest = (
    total_count.ge(listing_counts['host_listings_count'])
    & total_count.ge(listing_counts['calculated_host_listings_count'])
)
print(f"\nIs host_total_listings_count always the largest? {is_largest.all()}")

# List the counter-examples, if there are any.
if not is_largest.all():
    print("\nCases where host_total_listings_count is not the largest:")
    print(listing_counts[~is_largest].head())


___

___

Various types of analyses to understand the data better and to prepare for the database design.

In [None]:
# Collect the listings columns whose name starts with 'yearly'.
# NOTE(review): the variable name and the printed label say "price", but the
# filter matches the 'yearly' prefix — confirm which one is intended.
price_cols = [col for col in lis_df.columns if col.startswith('yearly')]
print("Price-related columns:", price_cols)

In [None]:
# Display the days_since_last_review column of the listings frame.
lis_df['days_since_last_review']

In [None]:
# Preview the first rows of a hand-picked set of review-count columns.
# Selecting with this list raises KeyError if any listed column is absent
# from lis_df.
review_cols = ['total_reviews', 'number_of_reviews', 'reviews_per_month', 'number_of_reviews_l30d', 'yearly_review', 'number_of_reviews_ltm']
print("Review-related columns from listings:")
print(lis_df[review_cols].head())