<center><b>Group D2: Exploratory Data Analysis on New York City Airbnb Listings in 2019</b></center>

## Data Cleaning

#### IMPORT LIBRARIES AND READ THE DATASET

In [None]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import plotly.express as px
import plotly.graph_objs as go
import statsmodels.api as sm

In [None]:
# Read the AB_NYC_2019 listings CSV straight from the project's GitHub repository
DATA_URL = 'https://raw.githubusercontent.com/Exploratory-Data-Analysis/New-York-City-Airbnb-Investment-Recommendation/main/AB_NYC_2019.csv'

data = pd.read_csv(DATA_URL)
data.head()

#### Data Types

In [None]:
# inspect datatypes
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only appended a stray "None" to the cell output.
data.info()

In [None]:
# Parse the last_review column into datetime values and confirm the new dtype
data['last_review'] = pd.to_datetime(data['last_review'])
print(data['last_review'].dtype)

#### Missing Values

In [None]:
# check count of missing values
# Show only the columns with at least one missing value, largest count first.
# A boolean mask (instead of mapping zeros to None and dropping them) keeps the
# counts as integers rather than upcasting them to floats.
missing_counts = data.isnull().sum().sort_values(ascending=False)
missing_counts[missing_counts > 0]

In [None]:
# Share of missing values per column, largest first; fully populated columns are hidden
missing_share = data.isnull().mean().sort_values(ascending=False)
missing_share[missing_share > 0]

In [None]:
# Keep only the rows in which both 'name' and 'host_name' are present
has_names = data[['name', 'host_name']].notna().all(axis=1)
data = data[has_names]

Replace `NaN` values with `0` on the `reviews_per_month` column and leave the `last_review` the same.

In [None]:
# Replace missing reviews_per_month with 0; every other column
# (including last_review) is left as it is
data = data.assign(reviews_per_month=data['reviews_per_month'].fillna(0))

Validate accuracy of reviews_per_month by comparing the count of 0 reviews.

In [None]:
# validate accuracy
# Compare how many listings have zero reviews with how many have a zero monthly
# review rate. The masks must be summed: len() of a boolean Series is just the
# number of rows, which made the previous check True no matter what.
zero_review_count = (data['number_of_reviews'] == 0).sum()
zero_rate_count = (data['reviews_per_month'] == 0).sum()
zero_review_count == zero_rate_count

In [None]:
# Flag each listing: 1 when reviews_per_month is greater than 0, otherwise 0
with_review = data['reviews_per_month'].gt(0).astype('int64')

# Place the flag immediately before number_of_reviews. DataFrame.insert raises
# ValueError if the column already exists, so guard it to keep this cell
# re-runnable without restarting the kernel.
if 'with_review' not in data.columns:
    loc = data.columns.get_loc('number_of_reviews')
    data.insert(loc, 'with_review', with_review)
data.head()

#### Further Cleaning

In [None]:
# Summary statistics for every column whose dtype is not object
data.describe(exclude=[object])

Based on the result, there are listings with zero prices which is not right so we need to filter and drop these listings.

In [None]:
# Locate the listings whose price is recorded as 0
zero_price = data['price'].eq(0)
zero_price_rows = data[zero_price]

print(f"No. of listing with no price: {zero_price_rows.shape[0]}")
display(zero_price_rows.head())

In [None]:
# Discard the zero-price listings: only rows with price > 0 are kept
data = data[data['price'] > 0]

In [None]:
# Numeric columns that are summarised in the analysis section
numeric = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'calculated_host_listings_count',
    'reviews_per_month',
    'availability_365',
]

## Data Analysis

Visualize statistical values in the dataframe for easier understanding.

In [None]:
# Reshape to long format: one row per (listing id, numeric variable) pair
data_melt = data.melt(id_vars='id', value_vars=numeric)

# Summarise each variable, then order the table by mean (largest first)
summary_stats = ['min', 'max', 'mean', 'median', 'std']
data_group = (
    data_melt
    .groupby('variable')
    .agg({'value': summary_stats})
    .sort_values(('value', 'mean'), ascending=False)
)
data_group

In [None]:
# Mean price and number of listings for every (neighbourhood group, room type)
# pair, ordered from the highest to the lowest average price
price_analysis = (
    data
    .groupby(['neighbourhood_group', 'room_type'])
    .agg(
        avg_price=pd.NamedAgg(column='price', aggfunc='mean'),
        count=pd.NamedAgg(column='price', aggfunc='size'),
    )
    .reset_index()
    .sort_values(by='avg_price', ascending=False)
)

# Plain-text rendering of the table
print("Price Analysis by Room Type and Neighborhood Group:")
print(price_analysis)

## 1. Proportions of Listings by Neighborhood Groups


In [None]:
# Number of listings in each neighbourhood group
neighborhood_proportion = (
    data
    .groupby('neighbourhood_group')
    .size()
    .reset_index(name='count')
)

# One fixed colour per neighbourhood group
custom_colors = dict(zip(
    ['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'Staten Island'],
    ['red', 'blue', 'green', 'orange', 'purple'],
))

# Donut chart (hole=0.4) sized by listing count
pie_options = dict(
    names='neighbourhood_group',
    values='count',
    color='neighbourhood_group',
    color_discrete_map=custom_colors,
    title='Proportions of Listings by Neighborhood Groups',
    hole=0.4,
)
neighborhood_pie = px.pie(neighborhood_proportion, **pie_options)

neighborhood_pie.show()

## 2. Price Distribution by Room Type

In [None]:
# Box plot of price per room type, drawn on an explicit Axes with a log-scaled y axis
fig_room, ax_room = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='room_type', y='price', hue='room_type', palette='Set3', dodge=False, ax=ax_room)
ax_room.legend([], [], frameon=False)  # empty legend: hue only duplicates the x axis
ax_room.set_title('Price Distribution by Room Type')
ax_room.set_xlabel('Room Type')
ax_room.set_ylabel('Price')
ax_room.set_yscale('log')
plt.show()

## 3. Average Price by Neighborhood Group and Room Type

In [None]:
# Grouped bars: average price per neighbourhood group, one bar per room type
avg_price_labels = {
    'avg_price': 'Average Price ($)',
    'neighbourhood_group': 'Neighborhood Group',
}
avg_price_plot = px.bar(
    price_analysis,
    x='neighbourhood_group',
    y='avg_price',
    color='room_type',
    barmode='group',
    title='Average Price by Neighborhood Group and Room Type',
    labels=avg_price_labels,
)

# No grid lines, y axis fixed to 0-300, title centred
avg_price_plot.update_layout(
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    yaxis_range=[0, 300],
    title_x=0.5,
)
avg_price_plot.show()

## 4. Scatter Mapbox: Price and Minimum Nights Across Neighborhoods

In [None]:
# Map of all listings: colour encodes price, marker size encodes minimum_nights
fig2 = px.scatter_mapbox(
    data,
    lat='latitude',
    lon='longitude',
    color='price',
    size='minimum_nights',
    size_max=10,
    opacity=0.6,
    hover_name='name',
    hover_data=dict(price=True, room_type=True, neighbourhood_group=True),
    color_continuous_scale="Viridis",
    range_color=[50, 700],  # colour scale spans prices 50 to 700
    zoom=10,
    height=800,
    mapbox_style="carto-positron",
    title="Scatter Mapbox: Price and Minimum Nights",
)

fig2.show()

## 5. Count of Listings by Room Type Across Neighborhood Groups

In [None]:
# Grouped bars: number of listings per neighbourhood group and room type,
# with the count printed above every bar
count_labels = {
    'count': 'Number of Listings',
    'neighbourhood_group': 'Neighborhood Group',
}
count_plot = px.bar(
    price_analysis,
    x='neighbourhood_group',
    y='count',
    color='room_type',
    barmode='group',
    text='count',
    title='Count of Listings by Neighborhood Group and Room Type',
    labels=count_labels,
)

# No grid lines, y axis fixed to 0-15000, title centred
count_plot.update_layout(
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    yaxis_range=[0, 15000],
    title_x=0.5,
)
count_plot.update_traces(textposition='outside', texttemplate='%{text}')
count_plot.show()

## 6. Price Variation Across Neighborhoods

In [None]:
# Box plot of price per neighbourhood group on a log-scaled y axis
price_box_labels = {
    'price': 'Price ($)',
    'neighbourhood_group': 'Neighborhood Group',
}
neighborhood_price_plot = px.box(
    data,
    x='neighbourhood_group',
    y='price',
    color='neighbourhood_group',
    log_y=True,
    title='Price Variation Across Neighborhood Groups',
    labels=price_box_labels,
)

# No grid lines, title centred
neighborhood_price_plot.update_layout(
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    title_x=0.5,
)

neighborhood_price_plot.show()

## 7. Correlation Heatmap of Numerical Features

In [None]:
# Pairwise correlations between the numeric listing attributes, drawn as an annotated heatmap
numerical_columns = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'calculated_host_listings_count',
    'reviews_per_month',
    'availability_365',
]
correlation_matrix = data.loc[:, numerical_columns].corr()

fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax_corr)
ax_corr.set_title('Correlation Heatmap of Numerical Features')
plt.show()

## 8. Scatter Plot: Price vs Availability for Different Room Types

In [None]:
# Price (log scale) against yearly availability; colour = room type,
# marker size = number of reviews
availability_labels = {
    'availability_365': 'Availability (Days)',
    'price': 'Price ($)',
}
scatter_plot = px.scatter(
    data,
    x='availability_365',
    y='price',
    color='room_type',
    size='number_of_reviews',
    log_y=True,
    title='Price vs Availability for Different Room Types',
    labels=availability_labels,
)
scatter_plot.show()

## 9. Regression Analysis

### 9.1 Regression Analysis in all neighbourhood

In [None]:
# OLS regression of price on neighbourhood group, room type and three numeric predictors.

# Treat prices above 1000 as outliers and drop them.
# NOTE: this reassigns the notebook-wide `data`, so every later cell also works
# on the filtered frame.
data = data[data['price'] <= 1000]

predictors = ['neighbourhood_group', 'room_type', 'reviews_per_month', 'availability_365', 'calculated_host_listings_count']

# One-hot encode the categorical predictors (first level dropped as baseline),
# cast everything to float, add the intercept column and drop incomplete rows
X = pd.get_dummies(data[predictors], columns=['neighbourhood_group', 'room_type'], drop_first=True)
X = X.astype(float)
X = sm.add_constant(X).dropna()

# Align the response with the rows kept in X and cast it once, here. Previously
# the float-cast y was immediately overwritten by a fresh integer slice.
y = data['price'].loc[X.index].astype(float)

model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Overview figure for the regression: the first row shows each numeric predictor
# against price with a fitted line, the second row shows price by each
# categorical predictor.
numerical_vars = ['reviews_per_month', 'availability_365', 'calculated_host_listings_count']
dummy_vars = ['neighbourhood_group', 'room_type']

# color setting
color_mapping_neighbourhood = {
    'Brooklyn': '#636EFA',
    'Manhattan': '#EF553B',
    'Queens': '#00CC96',
    'Staten Island': '#AB63FA',
    'Bronx': '#FFA15A',
}

color_mapping_room_type = {'Entire home/apt': '#636EFA', 'Private room': '#EF553B', 'Shared room': '#00CC96'}

# Palette to use for each categorical predictor
palette_by_var = {
    'neighbourhood_group': color_mapping_neighbourhood,
    'room_type': color_mapping_room_type,
}

# Grid with one column per variable of the longer row; squeeze=False keeps
# `axes` two-dimensional whatever the number of columns
n_cols = max(len(numerical_vars), len(dummy_vars))
fig, axes = plt.subplots(2, n_cols, figsize=(5 * n_cols, 10), squeeze=False)

# 1st row: numerical predictors
for i, var in enumerate(numerical_vars):
    ax = axes[0, i]
    sns.regplot(x=data[var], y=data['price'], scatter_kws={'s': 10}, line_kws={'color': 'red'}, ax=ax)
    ax.set_title(f'{var} vs Price')
    ax.set_xlabel(var)
    ax.set_ylabel('Price')

# 2nd row: categorical (dummy) predictors
for i, var in enumerate(dummy_vars):
    ax = axes[1, i]
    # Pass hue together with palette: palette without hue is deprecated in
    # seaborn 0.13, and the per-neighbourhood cells below already pass hue.
    sns.boxplot(
        x=data[var],
        y=data['price'],
        hue=data[var],
        palette=palette_by_var.get(var),
        dodge=False,
        ax=ax,
    )
    # hue repeats the x axis, so drop the legend if seaborn drew one
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    ax.set_title(f'{var} (Dummy) vs Price')
    ax.set_xlabel(var)
    ax.set_ylabel('Price')

# hide the panels left unused in the shorter row
for j in range(len(numerical_vars), n_cols):
    fig.delaxes(axes[0, j])
for j in range(len(dummy_vars), n_cols):
    fig.delaxes(axes[1, j])

plt.tight_layout()
plt.show()

### 9.2 Regression Analysis in different neighbourhood

#### A. Regression in Brooklyn

In [None]:
# OLS regression of price on room type and three numeric predictors, Brooklyn listings only
Brooklyn = data[data['neighbourhood_group'] == 'Brooklyn']

# One-hot encode room_type (first level dropped as baseline), cast to float,
# add the intercept column and drop incomplete rows
X = Brooklyn[['room_type', 'reviews_per_month', 'availability_365', 'calculated_host_listings_count']]
X = pd.get_dummies(X, columns=['room_type'], drop_first=True).astype(float)
X = sm.add_constant(X).dropna()

# Take the response from the Brooklyn subset itself (not the notebook-wide frame),
# aligned to the rows kept in X. The float cast is applied here because the
# earlier cast was discarded when y was re-sliced from data['price'].
y = Brooklyn['price'].loc[X.index].astype(float)

model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Brooklyn panels: each numeric predictor against price with a fitted line, plus
# price by room type. X and y are the design matrix and response built in the
# Brooklyn regression cell.
numerical_vars = ['reviews_per_month', 'availability_365', 'calculated_host_listings_count']
dummy_var = 'room_type'
all_vars = numerical_vars + [dummy_var]

color_mapping_room_type = {'Entire home/apt': '#636EFA', 'Private room': '#EF553B', 'Shared room': '#00CC96'}
fig, axes = plt.subplots(1, len(all_vars), figsize=(4 * len(all_vars), 4))

# One panel per variable
for ax, var in zip(axes, all_vars):
    if var == dummy_var:
        # categorical predictor: box plot coloured by room type
        sns.boxplot(
            x=Brooklyn[var],
            y=Brooklyn['price'],
            hue=Brooklyn[var],
            palette=color_mapping_room_type,
            ax=ax,
        )
        ax.set_title('Room_Type vs Price')
    elif var in numerical_vars:
        # numeric predictor: scatter with regression line
        sns.regplot(x=X[var], y=y, scatter_kws={'s': 10}, line_kws={'color': 'red'}, ax=ax)
        ax.set_title(f'{var} vs Price')
    ax.set_xlabel(var)
    ax.set_ylabel('Price')

plt.tight_layout()
plt.show()

#### B. Regression in Manhattan

In [None]:
# OLS regression of price on room type and three numeric predictors, Manhattan listings only
Manhattan = data[data['neighbourhood_group'] == 'Manhattan']

# One-hot encode room_type (first level dropped as baseline), cast to float,
# add the intercept column and drop incomplete rows
X = Manhattan[['room_type', 'reviews_per_month', 'availability_365', 'calculated_host_listings_count']]
X = pd.get_dummies(X, columns=['room_type'], drop_first=True).astype(float)
X = sm.add_constant(X).dropna()

# Take the response from the Manhattan subset itself (not the notebook-wide frame),
# aligned to the rows kept in X. The float cast is applied here because the
# earlier cast was discarded when y was re-sliced from data['price'].
y = Manhattan['price'].loc[X.index].astype(float)

model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Manhattan panels: each numeric predictor against price with a fitted line, plus
# price by room type. X and y are the design matrix and response built in the
# Manhattan regression cell.
numerical_vars = ['reviews_per_month', 'availability_365', 'calculated_host_listings_count']
dummy_var = 'room_type'
all_vars = numerical_vars + [dummy_var]

color_mapping_room_type = {'Entire home/apt': '#636EFA', 'Private room': '#EF553B', 'Shared room': '#00CC96'}
fig, axes = plt.subplots(1, len(all_vars), figsize=(4 * len(all_vars), 4))

# One panel per variable
for ax, var in zip(axes, all_vars):
    if var == dummy_var:
        # categorical predictor: box plot coloured by room type
        sns.boxplot(
            x=Manhattan[var],
            y=Manhattan['price'],
            hue=Manhattan[var],
            palette=color_mapping_room_type,
            ax=ax,
        )
        ax.set_title('Room_Type vs Price')
    elif var in numerical_vars:
        # numeric predictor: scatter with regression line
        sns.regplot(x=X[var], y=y, scatter_kws={'s': 10}, line_kws={'color': 'red'}, ax=ax)
        ax.set_title(f'{var} vs Price')
    ax.set_xlabel(var)
    ax.set_ylabel('Price')

plt.tight_layout()
plt.show()

#### C. Regression in Queens

In [None]:
# OLS regression of price on room type and three numeric predictors, Queens listings only
Queens = data[data['neighbourhood_group'] == 'Queens']

# One-hot encode room_type (first level dropped as baseline), cast to float,
# add the intercept column and drop incomplete rows
X = Queens[['room_type', 'reviews_per_month', 'availability_365', 'calculated_host_listings_count']]
X = pd.get_dummies(X, columns=['room_type'], drop_first=True).astype(float)
X = sm.add_constant(X).dropna()

# Take the response from the Queens subset itself (not the notebook-wide frame),
# aligned to the rows kept in X. The float cast is applied here because the
# earlier cast was discarded when y was re-sliced from data['price'].
y = Queens['price'].loc[X.index].astype(float)

model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Queens panels: each numeric predictor against price with a fitted line, plus
# price by room type. X and y are the design matrix and response built in the
# Queens regression cell.
numerical_vars = ['reviews_per_month', 'availability_365', 'calculated_host_listings_count']
dummy_var = 'room_type'
all_vars = numerical_vars + [dummy_var]

color_mapping_room_type = {'Entire home/apt': '#636EFA', 'Private room': '#EF553B', 'Shared room': '#00CC96'}
fig, axes = plt.subplots(1, len(all_vars), figsize=(4 * len(all_vars), 4))

# One panel per variable
for ax, var in zip(axes, all_vars):
    if var == dummy_var:
        # categorical predictor: box plot coloured by room type
        sns.boxplot(
            x=Queens[var],
            y=Queens['price'],
            hue=Queens[var],
            palette=color_mapping_room_type,
            ax=ax,
        )
        ax.set_title('Room_Type vs Price')
    elif var in numerical_vars:
        # numeric predictor: scatter with regression line
        sns.regplot(x=X[var], y=y, scatter_kws={'s': 10}, line_kws={'color': 'red'}, ax=ax)
        ax.set_title(f'{var} vs Price')
    ax.set_xlabel(var)
    ax.set_ylabel('Price')

plt.tight_layout()
plt.show()

#### D. Regression in Staten Island

In [None]:
# OLS regression of price on room type and three numeric predictors, Staten Island listings only
Staten_Island = data[data['neighbourhood_group'] == 'Staten Island']

# One-hot encode room_type (first level dropped as baseline), cast to float,
# add the intercept column and drop incomplete rows
X = Staten_Island[['room_type', 'reviews_per_month', 'availability_365', 'calculated_host_listings_count']]
X = pd.get_dummies(X, columns=['room_type'], drop_first=True).astype(float)
X = sm.add_constant(X).dropna()

# Take the response from the Staten Island subset itself (not the notebook-wide
# frame), aligned to the rows kept in X. The float cast is applied here because
# the earlier cast was discarded when y was re-sliced from data['price'].
y = Staten_Island['price'].loc[X.index].astype(float)

model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Staten Island panels: each numeric predictor against price with a fitted line,
# plus price by room type. X and y are the design matrix and response built in
# the Staten Island regression cell.
numerical_vars = ['reviews_per_month', 'availability_365', 'calculated_host_listings_count']
dummy_var = 'room_type'
all_vars = numerical_vars + [dummy_var]

color_mapping_room_type = {'Entire home/apt': '#636EFA', 'Private room': '#EF553B', 'Shared room': '#00CC96'}
fig, axes = plt.subplots(1, len(all_vars), figsize=(4 * len(all_vars), 4))

# One panel per variable
for ax, var in zip(axes, all_vars):
    if var == dummy_var:
        # categorical predictor: box plot coloured by room type
        sns.boxplot(
            x=Staten_Island[var],
            y=Staten_Island['price'],
            hue=Staten_Island[var],
            palette=color_mapping_room_type,
            ax=ax,
        )
        ax.set_title('Room_Type vs Price')
    elif var in numerical_vars:
        # numeric predictor: scatter with regression line
        sns.regplot(x=X[var], y=y, scatter_kws={'s': 10}, line_kws={'color': 'red'}, ax=ax)
        ax.set_title(f'{var} vs Price')
    ax.set_xlabel(var)
    ax.set_ylabel('Price')

plt.tight_layout()
plt.show()

#### E. Regression in Bronx

In [None]:
# OLS regression of price on room type and three numeric predictors, Bronx listings only
Bronx = data[data['neighbourhood_group'] == 'Bronx']

# One-hot encode room_type (first level dropped as baseline), cast to float,
# add the intercept column and drop incomplete rows
X = Bronx[['room_type', 'reviews_per_month', 'availability_365', 'calculated_host_listings_count']]
X = pd.get_dummies(X, columns=['room_type'], drop_first=True).astype(float)
X = sm.add_constant(X).dropna()

# Take the response from the Bronx subset itself (not the notebook-wide frame),
# aligned to the rows kept in X. The float cast is applied here because the
# earlier cast was discarded when y was re-sliced from data['price'].
y = Bronx['price'].loc[X.index].astype(float)

model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Bronx panels: each numeric predictor against price with a fitted line, plus
# price by room type. X and y are the design matrix and response built in the
# Bronx regression cell.
numerical_vars = ['reviews_per_month', 'availability_365', 'calculated_host_listings_count']
dummy_var = 'room_type'
all_vars = numerical_vars + [dummy_var]

color_mapping_room_type = {'Entire home/apt': '#636EFA', 'Private room': '#EF553B', 'Shared room': '#00CC96'}
fig, axes = plt.subplots(1, len(all_vars), figsize=(4 * len(all_vars), 4))

# One panel per variable
for ax, var in zip(axes, all_vars):
    if var == dummy_var:
        # categorical predictor: box plot coloured by room type
        sns.boxplot(
            x=Bronx[var],
            y=Bronx['price'],
            hue=Bronx[var],
            palette=color_mapping_room_type,
            ax=ax,
        )
        ax.set_title('Room_Type vs Price')
    elif var in numerical_vars:
        # numeric predictor: scatter with regression line
        sns.regplot(x=X[var], y=y, scatter_kws={'s': 10}, line_kws={'color': 'red'}, ax=ax)
        ax.set_title(f'{var} vs Price')
    ax.set_xlabel(var)
    ax.set_ylabel('Price')

plt.tight_layout()
plt.show()

### 9.3 Compare regression in different neighbourhood

#### A. reviews_per_month vs Price in different neighbourhood

In [None]:
# One panel per neighbourhood group (shared y axis): reviews_per_month against
# price with a fitted line; points use the group's colour
neighborhoods = data['neighbourhood_group'].unique()
n_panels = len(neighborhoods)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), sharey=True)

for idx, (ax, neighborhood) in enumerate(zip(axes, neighborhoods)):
    subset = data[data['neighbourhood_group'] == neighborhood]
    point_style = {'s': 10, 'color': color_mapping_neighbourhood[neighborhood]}
    sns.regplot(
        x=subset['reviews_per_month'],
        y=subset['price'],
        scatter_kws=point_style,
        line_kws={'color': 'red'},
        ax=ax,
    )
    ax.set_title(f'{neighborhood}: reviews_per_month vs Price')
    ax.set_xlabel('Reviews per Month')
    # only the leftmost panel carries the y label
    ax.set_ylabel('Price' if idx == 0 else "")

plt.tight_layout()
plt.show()

In [None]:
# Reviews per month against price, coloured by neighbourhood group, with one OLS
# trend line per group. plotly.express is already imported as `px` in the import
# cell at the top of the notebook, so the redundant mid-notebook re-import has
# been removed.
fig = px.scatter(
    data,
    x='reviews_per_month',
    y='price',
    color='neighbourhood_group',
    color_discrete_map=color_mapping_neighbourhood,
    title='Reviews per Month vs Price by Neighborhood',
    trendline='ols',  # add the regression line
    labels={'reviews_per_month': 'Reviews per Month', 'price': 'Price', 'neighbourhood_group': 'Neighborhood Group'},
    opacity=0.1
)

# Resize the scatter markers only (traces whose mode is 'markers')
fig.update_traces(marker=dict(size=6), selector=dict(mode='markers'))

fig.update_layout(
    xaxis_title="Reviews per Month",
    yaxis_title="Price",
    legend_title="Neighborhood Group"
)

fig.show()

#### B. availability_365 vs Price in different neighbourhood

In [None]:
# One panel per neighbourhood group (shared y axis): availability_365 against
# price with a fitted line; points use the group's colour
n_panels = len(neighborhoods)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), sharey=True)

for idx, (ax, neighborhood) in enumerate(zip(axes, neighborhoods)):
    subset = data[data['neighbourhood_group'] == neighborhood]
    point_style = {'s': 10, 'color': color_mapping_neighbourhood[neighborhood]}
    sns.regplot(
        x=subset['availability_365'],
        y=subset['price'],
        scatter_kws=point_style,
        line_kws={'color': 'red'},
        ax=ax,
    )
    ax.set_title(f'{neighborhood}: availability_365 vs Price')
    ax.set_xlabel('Availability (365)')
    # only the leftmost panel carries the y label
    ax.set_ylabel('Price' if idx == 0 else "")

plt.tight_layout()
plt.show()

In [None]:
# Availability against price, coloured by neighbourhood group, with one OLS
# trend line per group
availability_axis_labels = {
    'availability_365': 'Availability (365)',
    'price': 'Price',
    'neighbourhood_group': 'Neighborhood Group',
}
fig = px.scatter(
    data,
    x='availability_365',
    y='price',
    color='neighbourhood_group',
    color_discrete_map=color_mapping_neighbourhood,
    trendline='ols',
    opacity=0.1,
    title='Availability (365) vs Price by Neighborhood',
    labels=availability_axis_labels,
)

# Resize the scatter markers only (traces whose mode is 'markers')
fig.update_traces(marker_size=6, selector={'mode': 'markers'})

fig.update_layout(
    legend_title="Neighborhood Group",
    xaxis_title="Availability (365)",
    yaxis_title="Price",
)

fig.show()

#### C. calculated_host_listings_count vs Price in different neighbourhood

In [None]:
# One panel per neighbourhood group (shared y axis): calculated_host_listings_count
# against price with a fitted line; points use the group's colour
n_panels = len(neighborhoods)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), sharey=True)

for idx, (ax, neighborhood) in enumerate(zip(axes, neighborhoods)):
    subset = data[data['neighbourhood_group'] == neighborhood]
    point_style = {'s': 10, 'color': color_mapping_neighbourhood[neighborhood]}
    sns.regplot(
        x=subset['calculated_host_listings_count'],
        y=subset['price'],
        scatter_kws=point_style,
        line_kws={'color': 'red'},
        ax=ax,
    )
    ax.set_title(f'{neighborhood}: calculated_host_listings_count vs Price')
    ax.set_xlabel('Host Listings Count')
    # only the leftmost panel carries the y label
    ax.set_ylabel('Price' if idx == 0 else "")

plt.tight_layout()
plt.show()

In [None]:
# Host listings count against price, coloured by neighbourhood group, with one
# OLS trend line per group
host_axis_labels = {
    'calculated_host_listings_count': 'Host Listings Count',
    'price': 'Price',
    'neighbourhood_group': 'Neighborhood Group',
}
fig = px.scatter(
    data,
    x='calculated_host_listings_count',
    y='price',
    color='neighbourhood_group',
    color_discrete_map=color_mapping_neighbourhood,
    trendline='ols',
    opacity=0.1,
    title='Calculated Host Listings Count vs Price by Neighborhood',
    labels=host_axis_labels,
)

# Resize the scatter markers only (traces whose mode is 'markers')
fig.update_traces(marker_size=6, selector={'mode': 'markers'})

fig.update_layout(
    legend_title="Neighborhood Group",
    xaxis_title="Host Listings Count",
    yaxis_title="Price",
)

fig.show()

#### D. room_type vs Price in different neighbourhood

In [None]:
# Price by room type, one box per neighbourhood group. plotly.express is already
# imported as `px` in the import cell at the top of the notebook, so the
# redundant mid-notebook re-import has been removed.
fig = px.box(
    data,
    x='room_type',
    y='price',
    color='neighbourhood_group',
    color_discrete_map=color_mapping_neighbourhood,
    title='Room Type vs Price by Neighborhood',
    labels={'room_type': 'Room Type', 'price': 'Price', 'neighbourhood_group': 'Neighborhood Group'}
)

fig.update_layout(
    xaxis_title="Room Type",
    yaxis_title="Price",
    legend_title="Neighborhood Group"
)

fig.show()

## 10. Hypothesis Tests

### A. Is there a difference in prices between homes with and without reviews?

In [None]:
# Two-sample t-test on price: listings with reviews vs. listings without.
# NOTE(review): equal_var=True assumes both groups share the same variance — confirm.
wreview = data.loc[data['with_review'] == 1, 'price']
woreview = data.loc[data['with_review'] == 0, 'price']
stats.ttest_ind(a=wreview, b=woreview, equal_var=True)

With a p-value much smaller than 0.05, we can confidently say that there is a difference in prices between homes with and without reviews.

### B. Is there a difference in prices between entire homes/apts and private rooms?

In [None]:
# Two-sample t-test on price: entire homes/apartments vs. private rooms.
# NOTE(review): equal_var=True assumes both groups share the same variance — confirm.
eh = data.loc[data['room_type'] == 'Entire home/apt', 'price']
pr = data.loc[data['room_type'] == 'Private room', 'price']
stats.ttest_ind(a=eh, b=pr, equal_var=True)

With a p-value much smaller than 0.05, we can confidently say that there is a difference in prices between entire homes/apts and private rooms.

### C. Is there a difference in prices between private rooms and shared rooms?

In [None]:
# Two-sample t-test on price: shared rooms vs. private rooms.
# `pr` is defined here as well so that this cell no longer depends on a
# variable left behind by the previous test cell.
sr = data[data['room_type']=='Shared room']['price']
pr = data[data['room_type']=='Private room']['price']
stats.ttest_ind(a=sr, b=pr, equal_var=True)

Similarly, there is a difference in prices between private rooms and shared rooms.

### D. Is there a difference in prices between homes with minimum nights below and above 10?

In [None]:
# Two-sample t-test on price: listings with minimum_nights below 10 vs. 10 or more.
# The two filters were previously swapped (below10 held the >= 10 rows and
# above10 the < 10 rows); each name now matches its contents. The p-value is
# unaffected, but the sign of the t statistic now reads as "below minus above".
below10 = data[data['minimum_nights']<10]['price']
above10 = data[data['minimum_nights']>=10]['price']
stats.ttest_ind(a=below10, b=above10, equal_var=True)

With a p-value much smaller than 0.05, we can confidently say that there is a difference in prices between homes with minimum nights below and above 10.

### E. Based on the boxplot, Brooklyn and Manhattan have the highest prices. Is there a difference in prices between homes in Brooklyn and Manhattan?

In [None]:
# Two-sample t-test on price: Brooklyn listings vs. Manhattan listings.
# NOTE(review): equal_var=True assumes both groups share the same variance — confirm.
brooklyn = data.loc[data['neighbourhood_group'] == 'Brooklyn', 'price']
manhattan = data.loc[data['neighbourhood_group'] == 'Manhattan', 'price']
stats.ttest_ind(a=brooklyn, b=manhattan, equal_var=True)

With a p-value much smaller than 0.05, we can confidently say that there is a difference in prices between homes in Brooklyn and Manhattan.

### F. Are neighborhood group and room type independent?

In [None]:
# Contingency table of listing counts: neighbourhood group (rows) by room type (columns)
table = pd.crosstab(index=data['neighbourhood_group'], columns=data['room_type'])
table

In [None]:
# Chi-square test of independence on the contingency table (missing cells counted as 0)
chi2, p, dof, expected = stats.chi2_contingency(table.fillna(0))

# Report the test statistic, p-value and degrees of freedom, then the expected counts
for label, value in (
    ("Chi-square Statistic", chi2),
    ("p-value", p),
    ("Degrees of Freedom", dof),
):
    print(f"{label}: {value}")
print("Expected Frequencies:")
print(expected)

With a p-value much smaller than 0.05, we can confidently say that neighborhood group and room type are dependent.