In [None]:
import piplite
await piplite.install(['numpy'],['pandas'])
await piplite.install(['seaborn'])

In [None]:
import pandas as pd
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Import necessary libraries
import numpy as np

In [None]:
from js import fetch
import io

URL = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ST0151EN-SkillsNetwork/labs/boston_housing.csv'
resp = await fetch(URL)
boston_url = io.BytesIO((await resp.arrayBuffer()).to_py())

In [None]:
# Parse the downloaded CSV bytes (the BytesIO buffer built in the fetch cell) into a DataFrame.
boston_df = pd.read_csv(filepath_or_buffer=boston_url)

In [None]:
# Print pandas' built-in summary of the loaded DataFrame as a quick sanity check.
boston_df.info()

In [None]:
# Preview only the first rows with the notebook's rich display; print() on the
# whole frame produced a plain-text dump that duplicated the next cell.
boston_df.head()

In [None]:
# Bare expression: the notebook renders boston_df as this cell's output.
boston_df

In [None]:
# Task 1: Boxplot for Median Value of Owner-Occupied Homes (MEDV)
plt.figure(figsize=(8, 6))
# The frame must be passed through seaborn's `data=` keyword; there is no
# `boston_df=` keyword, and without `data` the column name 'MEDV' cannot be resolved.
sns.boxplot(data=boston_df, y='MEDV', color='skyblue')
plt.title('Boxplot of Median Value of Owner-Occupied Homes (MEDV)', fontsize=14)
plt.ylabel('Median Value ($1000\'s)', fontsize=12)
plt.show()

# Task 2: Bar Plot for Charles River Variable (CHAS)
plt.figure(figsize=(8, 6))
# Pass the frame via `data=` (seaborn has no `boston_df=` keyword) so that
# x='CHAS' is looked up as a column; countplot then shows the row count per value.
sns.countplot(data=boston_df, x='CHAS', palette='viridis')
plt.title('Bar Plot for Charles River Variable', fontsize=14)
plt.xlabel('Charles River (1 = Yes, 0 = No)', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.show()

# Task 3: Boxplot of MEDV vs Discretized AGE
# Discretize AGE into 3 groups using the bin edges 0 / 35 / 70 / 100.
bins = [0, 35, 70, 100]
labels = ['35 years and younger', 'Between 35 and 70 years', '70 years and older']
# The original wrote to an undefined name `data` (NameError); the column has to
# live on boston_df so seaborn can find 'AGE_Group' by name below.
boston_df['AGE_Group'] = pd.cut(boston_df['AGE'], bins=bins, labels=labels)

plt.figure(figsize=(8, 6))
# The frame is passed via `data=`; seaborn has no `boston_df=` keyword.
sns.boxplot(data=boston_df, x='AGE_Group', y='MEDV', palette='coolwarm')
plt.title('Boxplot of MEDV by Age Groups', fontsize=14)
plt.xlabel('Age Group', fontsize=12)
plt.ylabel('Median Value ($1000\'s)', fontsize=12)
plt.show()

# Task 4: Scatter Plot of NOX vs INDUS
plt.figure(figsize=(8, 6))
# Pass the frame via `data=` (seaborn has no `boston_df=` keyword) so the
# 'INDUS' and 'NOX' strings are resolved as columns.
sns.scatterplot(data=boston_df, x='INDUS', y='NOX', color='purple')
plt.title('Scatter Plot of NOX vs INDUS', fontsize=14)
plt.xlabel('Proportion of Non-Retail Business Acres (INDUS)', fontsize=12)
plt.ylabel('Nitric Oxide Concentrations (NOX)', fontsize=12)
plt.show()

# Task 5: Histogram for Pupil-to-Teacher Ratio (PTRATIO)
plt.figure(figsize=(8, 6))
# Pass the frame via `data=` (seaborn has no `boston_df=` keyword); kde=True
# overlays a density estimate on the 15-bin histogram.
sns.histplot(data=boston_df, x='PTRATIO', kde=True, color='green', bins=15)
plt.title('Histogram of Pupil-to-Teacher Ratio', fontsize=14)
plt.xlabel('Pupil-to-Teacher Ratio', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.show()

In [None]:
# Question: Do houses bounded by the Charles River differ significantly in
# median value (MEDV) from houses that are not?
# 1. Independent two-sample t-test on MEDV, split by CHAS:
#    H0: no significant difference in MEDV between CHAS == 0 and CHAS == 1.
#    H1: a significant difference in MEDV between the two groups.

from scipy.stats import ttest_ind

# MEDV values for tracts away from the river (CHAS == 0) and on it (CHAS == 1).
chas_0 = boston_df.loc[boston_df['CHAS'] == 0, 'MEDV']
chas_1 = boston_df.loc[boston_df['CHAS'] == 1, 'MEDV']

# Run the test once and read both numbers off the result object.
ttest_result = ttest_ind(chas_0, chas_1)
t_stat = ttest_result.statistic
p_value = ttest_result.pvalue

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Decision at the 5% significance level.
verdict = (
    "Reject the null hypothesis: There is a significant difference in median value of houses."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: No significant difference in median value of houses."
)
print(verdict)

In [None]:
# Question: Is there a difference in Median values of houses (MEDV) for each proportion of owner-occupied units built prior to 1940 (AGE)?
# 2. ANOVA Test:
# Null Hypothesis (H₀): The mean MEDV is the same across all age groups.
# Alternative Hypothesis (H₁): At least one group has a different mean MEDV.

from scipy.stats import f_oneway

# Bucket AGE into three groups. The original assigned to an undefined name
# `data` (NameError). The grouping is kept in a cell-local Series so this cell
# does not depend on, or overwrite, any AGE_Group column created for plotting.
age_group = pd.cut(boston_df['AGE'], bins=[0, 35, 70, 100], labels=['0-35', '35-70', '70+'])

# MEDV values for each age bucket
age_group_1 = boston_df.loc[age_group == '0-35', 'MEDV']
age_group_2 = boston_df.loc[age_group == '35-70', 'MEDV']
age_group_3 = boston_df.loc[age_group == '70+', 'MEDV']

# Perform one-way ANOVA across the three buckets
f_stat, p_value = f_oneway(age_group_1, age_group_2, age_group_3)

print(f"F-statistic: {f_stat}, P-value: {p_value}")

# Conclusion at the 5% significance level
if p_value < 0.05:
    print("Reject the null hypothesis: There is a difference in median values of houses across age groups.")
else:
    print("Fail to reject the null hypothesis: No significant difference in median values across age groups.")

In [None]:
# Question: Can we conclude that there is no relationship between Nitric oxide concentrations and the proportion of non-retail business acres per town?
# 3. Pearson Correlation:
# Null Hypothesis (H₀): There is no correlation between NOX (Nitric oxide concentration) and INDUS (proportion of non-retail business acres).
# Alternative Hypothesis (H₁): There is a correlation between NOX and INDUS.

from scipy.stats import pearsonr

# Both columns must come from boston_df; the original read INDUS from an
# undefined name `data`, which raised NameError.
corr, p_value = pearsonr(boston_df['NOX'], boston_df['INDUS'])

print(f"Correlation coefficient: {corr}, P-value: {p_value}")

# Conclusion at the 5% significance level
if p_value < 0.05:
    print("Reject the null hypothesis: There is a significant correlation between NOX and INDUS.")
else:
    print("Fail to reject the null hypothesis: No significant correlation between NOX and INDUS.")

In [None]:
# Question: What is the impact of an additional weighted distance to the five
# Boston employment centres (DIS) on the median value of owner-occupied homes?
# 4. Regression analysis (MEDV regressed on DIS):
#    H0: the weighted distance to employment centres has no impact on MEDV.
#    H1: the weighted distance to employment centres has an impact on MEDV.

import statsmodels.api as sm

# Response (MEDV) and predictor (DIS); the predictor gets a constant column
# added so the fitted model includes an intercept term.
y = boston_df['MEDV']
X = sm.add_constant(boston_df['DIS'])

# Fit ordinary least squares and show the full regression table.
model = sm.OLS(y, X).fit()
print(model.summary())

# Slope estimate for DIS and its p-value.
coef = model.params['DIS']
p_value = model.pvalues['DIS']

# Decision at the 5% significance level.
print(
    f"Reject the null hypothesis: DIS has a significant impact on MEDV. Coefficient: {coef}"
    if p_value < 0.05
    else f"Fail to reject the null hypothesis: DIS has no significant impact on MEDV. Coefficient: {coef}"
)


In [None]:
#Expected Output:
#T-test: Displays whether houses bounded by the Charles River have significantly different median values.
#ANOVA Test: Indicates if median values differ significantly across age groups.
#Pearson Correlation: Shows the correlation coefficient and whether there is a significant relationship between NOX and INDUS.
#Regression Analysis: Provides the regression coefficient, p-value, and whether DIS significantly affects MEDV.