<a href="https://colab.research.google.com/github/diyizhilunba/Data-Science-Boot-Camp/blob/main/Tandon_DS_Bootcamp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'new-york-city-airbnb-open-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F268833%2F611395%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240911%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240911T185149Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D56f8ff1cb2c78dcdd8e3ffbfd4ec8e132263e8d28c1ce94511f134d78957ba71ff2f5f382ef536280bec480f67f763e4dbf14b7b5090e5673c61b3323d771bd35e910576c580fe6804b10248f9930c4ca1ac9329530a943fd64587e173a96ca991bc02c410f2b40a0cd10d21b0e7631b3abc1d37670de1fedf022742b23c9464253043a1f4eac6abcfed6674e96a57ab2ce9debf1452b5e349fbfe46927d085a572a11cc7fd246a6c5f440d6e14be3d571c7d606fdd5138b5ce106a7ae137f26d436249cdfa907c6e7fab44e2dd256455296844adc8d1c328d4b049d791616135994af580de0f86d231aa66eee418935f2e4a8e5c14594a3654b31358e2ac66f'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from a clean input directory, then (re)create both Kaggle-style folders.
shutil.rmtree(KAGGLE_INPUT_PATH, ignore_errors=True)
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(kaggle_dir, 0o777, exist_ok=True)

# Expose the folders as ../input and ../working; an already existing link is left alone.
for link_target, link_name in ((KAGGLE_INPUT_PATH, 'input'), (KAGGLE_WORKING_PATH, 'working')):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack it into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so a stray ':' in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the progress bar simply stays empty
            # instead of crashing on int(None).
            total_length = fileres.headers['content-length']
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes must reach disk first.
            tfile.flush()
            # NOTE(review): extractall() trusts the member paths inside a downloaded archive;
            # only use this with sources you trust.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would replace the
                # imported module and break the next non-zip download.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must come before OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import numpy as np

# Read Data and Create a Data Frame

In [None]:
# Absolute path where the data-import cell is expected to have extracted the CSV
# (KAGGLE_INPUT_PATH/<data source directory>/...).
url = '/kaggle/input/new-york-city-airbnb-open-data/AB_NYC_2019.csv'
df = pd.read_csv(url)
df.head(5)  # preview the first five rows

In [None]:
df.tail(5)

# Understand Data

In [None]:
df.info()  # check the number of entries, the index range, the number of columns, and each column's dtype and non-null count

In [None]:
df.isnull().sum() # check missing values for each column

# Fill Missing Data

In [None]:
# Fill the missing reviews_per_month values with the column mean.
# NOTE(review): if a value is missing because the listing has no reviews at all, 0 may be a
# more faithful fill than the mean — confirm against number_of_reviews.
df['reviews_per_month'] = df['reviews_per_month'].fillna(df['reviews_per_month'].mean())
df.isnull().sum()  # re-check the missing-value count of every column after the fill

In [None]:
df['reviews_per_month'].head()  # the entry at index 2 was missing and should now show the mean value

# Drop Duplicate Data

In [None]:
# Remove fully duplicated rows and renumber the index from 0 without keeping the old one.
df = df.drop_duplicates().reset_index(drop=True)

# Drop Specified Columns

In [None]:
# Drop columns that are not useful for the analysis.
# errors='ignore' keeps the cell re-runnable: once the columns are gone, a second run is a
# no-op instead of raising KeyError.
df = df.drop(columns=['name', 'host_name', 'last_review', 'id', 'host_id'], errors='ignore')

In [None]:
df.isnull().sum()

# Check Correlation

In [None]:
# Pairwise correlation of the numeric listing columns
numeric_columns = ['price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month',
                   'availability_365', 'longitude', 'latitude']
correlation_matrix = df[numeric_columns].corr()

# Plot the correlation matrix as an annotated heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
# The previous title described an unrelated dataset; name what is actually plotted.
plt.title('Correlation Matrix of Numeric Airbnb Listing Features')
plt.tight_layout()
plt.show()

In [None]:
# "reviews_per_month" and "number_of_reviews" are correlated, so keep only one of them.
# errors='ignore' makes the cell safe to re-run after the column has already been removed.
df = df.drop(columns=['number_of_reviews'], errors='ignore')
df.isnull().sum()

# Validating expectations

In [None]:
df.describe() # check the basic statistics of the data

# Problem statement

**Goal**

Build a Machine Learning Model to Predict Airbnb Price
- Price

**Feature Selection Hypothesis**

- neighbourhood_group               
- neighbourhood                                 
- room_type                                                 
- minimum_nights                                   
- reviews_per_month                   
- availability_365    
- longitude
- latitude

# Data Visualization

In [None]:
# Sub-dataframe without extreme values: keep only listings priced below 500
sub_6 = df[df.price < 500]
# Violin plot showing the density and distribution of prices per neighbourhood_group
viz_2 = sns.violinplot(data=sub_6, x='neighbourhood_group', y='price')
# Title spelling fixed so it matches the column name shown on the x axis.
viz_2.set_title('Density and distribution of prices for each neighbourhood_group')

In [None]:
# Combine borough and room type in a single figure, restricted to ten hand-picked
# neighbourhoods (listed by the author as the top 10).
top_neighbourhoods = [
    'Williamsburg', 'Bedford-Stuyvesant', 'Harlem', 'Bushwick', 'Upper West Side',
    'Hell\'s Kitchen', 'East Village', 'Upper East Side', 'Crown Heights', 'Midtown',
]
in_top_neighbourhoods = df['neighbourhood'].isin(top_neighbourhoods)
sub_7 = df.loc[in_top_neighbourhoods]

# One count panel per room type, bars coloured by borough
viz_3 = sns.catplot(data=sub_7, kind='count', x='neighbourhood',
                    hue='neighbourhood_group', col='room_type')
viz_3.set_xticklabels(rotation=90)

In [None]:
# Distribution of availability_365 for each neighbourhood_group on a 10x10 figure
_, box_axes = plt.subplots(figsize=(10, 10))
ax = sns.boxplot(x='neighbourhood_group', y='availability_365', data=df, ax=box_axes)

In [None]:
# Word cloud of the neighbourhood column, shown inline and saved as neighbourhood.png
# in the current working directory.
from wordcloud import WordCloud
plt.subplots(figsize=(25,15))  # opens the figure the image is drawn into; the return value is not needed
# All neighbourhood values are joined into one space-separated string.
# NOTE(review): multi-word names are presumably split into separate words by the tokenizer — confirm.
wordcloud = WordCloud(
                          background_color='white',
                          width=1920,
                          height=1080
                         ).generate(" ".join(df.neighbourhood))
plt.imshow(wordcloud)
plt.axis('off')  # hide the axes around the image
plt.savefig('neighbourhood.png')
plt.show()

In [None]:
# Draw a pair plot of the numeric columns to visualize their pairwise relationships.
# NOTE(review): 'size' in plot_kws is seaborn's size-semantic argument, not the marker size;
# if small markers were intended, the key is probably 's' — confirm.
sns.pairplot(df, vars=['price', 'minimum_nights','reviews_per_month','availability_365','longitude','latitude'], plot_kws={'alpha':0.5, 'size': 0.1})


In [None]:
# Density of price per borough. seaborn builds the hue legend itself, so colours and
# labels cannot drift apart the way a hand-written label list matched by draw order can.
ax = sns.kdeplot(df, x='price', hue='neighbourhood_group', clip=(-10, 1000))
sns.move_legend(ax, 'upper right')
hue_legend = ax.get_legend()

# Mark the 95% and 99% price quantiles: each line is drawn exactly once, with its own
# colour and a label that names the right quantile.
q95 = np.quantile(df['price'], 0.95)
q99 = np.quantile(df['price'], 0.99)
q95_line = ax.axvline(q95, color='red', linestyle='--', label=f'95% Quantile: {q95:.2f}')
q99_line = ax.axvline(q99, color='violet', linestyle='--', label=f'99% Quantile: {q99:.2f}')

# ax.legend() replaces the Axes' current legend, so create the quantile legend first and
# then re-attach the hue legend as an extra artist.
ax.legend(handles=[q95_line, q99_line], loc='center right')
ax.add_artist(hue_legend)
plt.show()

In [None]:
# Density of price per room type. seaborn builds the hue legend itself, so colours and
# labels cannot drift apart the way a hand-written label list matched by draw order can.
ax = sns.kdeplot(df, x='price', hue='room_type', clip=(-10, 1000))
sns.move_legend(ax, 'upper right')
hue_legend = ax.get_legend()

# Mark the 95% and 99% price quantiles: each line is drawn exactly once, with its own
# colour and a label that names the right quantile.
q95 = np.quantile(df['price'], 0.95)
q99 = np.quantile(df['price'], 0.99)
q95_line = ax.axvline(q95, color='red', linestyle='--', label=f'95% Quantile: {q95:.2f}')
q99_line = ax.axvline(q99, color='violet', linestyle='--', label=f'99% Quantile: {q99:.2f}')

# ax.legend() replaces the Axes' current legend, so create the quantile legend first and
# then re-attach the hue legend as an extra artist.
ax.legend(handles=[q95_line, q99_line], loc='center right')
ax.add_artist(hue_legend)
plt.show()

In [None]:
# Price per borough, with one bar per room type inside each borough group
price_bars = sns.barplot(data=df, x='neighbourhood_group', y='price', hue='room_type')

# Give the hue legend an explicit title
price_bars.legend(title='Room Types:')

In [None]:
# Plot every listing at its coordinates, coloured by borough
sns.scatterplot(data=df, x='longitude', y='latitude', hue='neighbourhood_group')

## Feature Engineering

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
df.head(10)

In [None]:
categorical_columns = ['neighbourhood_group', 'room_type']

# One-hot encode the categorical columns, dropping the first level of each feature.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old name,
# so try the current keyword first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:  # scikit-learn < 1.2
    onehot_encoder = OneHotEncoder(sparse=False, drop='first')

# Fit and transform. Reusing df's index guarantees that a later column-wise concat with
# df lines up row by row, whatever index df happens to carry.
encoded_df = pd.DataFrame(
    onehot_encoder.fit_transform(df[categorical_columns]),
    columns=onehot_encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

In [None]:
categorical_columns_drop = ['neighbourhood_group','neighbourhood','room_type']

# Replace the raw categorical columns with their one-hot encoded counterparts
df_featureed = pd.concat(
    [df.drop(columns=categorical_columns_drop), encoded_df],
    axis=1,
)

df_featureed.head()

In [None]:
df_featureed.info()

In [None]:
df['room_type'].unique()

In [None]:
# Correlation heatmap of the engineered feature frame (cells are not annotated)
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df_featureed.corr(), annot=False, fmt=".2f", cmap='viridis', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap')
heatmap_fig.tight_layout()
plt.show()

## Feature Engineering (Optional)

In [None]:
categorical_columns_1 = ['neighbourhood_group', 'neighbourhood', 'room_type']

# One-hot encode the categorical columns (this time including 'neighbourhood'),
# dropping the first level of each feature.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old name,
# so try the current keyword first and fall back for older installations.
try:
    onehot_encoder_1 = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:  # scikit-learn < 1.2
    onehot_encoder_1 = OneHotEncoder(sparse=False, drop='first')

# Fit and transform. Reusing df's index guarantees that a later column-wise concat with
# df lines up row by row, whatever index df happens to carry.
encoded_df_1 = pd.DataFrame(
    onehot_encoder_1.fit_transform(df[categorical_columns_1]),
    columns=onehot_encoder_1.get_feature_names_out(categorical_columns_1),
    index=df.index,
)

In [None]:
categorical_columns_drop_1 = ['neighbourhood_group','neighbourhood','room_type']

# Replace the raw categorical columns with the wider one-hot encoding
df_featureed_1 = pd.concat(
    [df.drop(columns=categorical_columns_drop_1), encoded_df_1],
    axis=1,
)

In [None]:
# Correlation heatmap of df_featureed_1, the frame that also one-hot encodes 'neighbourhood'.
# NOTE(review): a 120x80 figure is extremely large and can be slow or memory-hungry to
# render — presumably chosen so the many dummy columns stay legible; confirm it is needed.
plt.figure(figsize=(120, 80))
sns.heatmap(df_featureed_1.corr(), annot=False, fmt=".2f", cmap='viridis')
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

## Data Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# The target is the price column; every other column of df_featureed is a feature.
y = df_featureed['price']
X = df_featureed.drop(columns=['price'])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
# Collect the shape of each split part under a readable label
split_shapes = dict(zip(
    ("Train X size", "Train y size", "Test X size", "Test y size"),
    (part.shape for part in (X_train, y_train, X_test, y_test)),
))

split_shapes

## Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Linear regression baseline, fitted on the training split only.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [None]:
y_pred = linear_model.predict(X_test)

In [None]:
# Evaluation
# Evaluate the predictions against the held-out test targets: mean squared error and R^2.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [None]:
mse

In [None]:
r2

## Hypothesis Testing

In [None]:
import statsmodels.api as sm

In [None]:
# Add an explicit constant (intercept) column before fitting with statsmodels.
# Note: this rebinds the notebook-level X, and the OLS model is fitted on the full X / y,
# not on the train split used for the scikit-learn model.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

In [None]:
model.summary()

## New Modeling

In [None]:
# Same target as before, but the features now come from df_featureed_1, which also
# contains the one-hot encoded 'neighbourhood' columns.
X = df_featureed_1.drop('price', axis=1)
y = df_featureed_1['price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
# Collect the shape of each split part under a readable label
split_shapes = dict(zip(
    ("Train X size", "Train y size", "Test X size", "Test y size"),
    (part.shape for part in (X_train, y_train, X_test, y_test)),
))

split_shapes

In [None]:
# Refit the linear regression on the current training split; this rebinds linear_model
# and y_pred, replacing the values from the earlier fit.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predict on the held-out test split
y_pred = linear_model.predict(X_test)

In [None]:
# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [None]:
mse

In [None]:
r2

In [None]:
# Add an explicit constant (intercept) column before fitting with statsmodels.
# Note: this rebinds the notebook-level X, and the OLS model is fitted on the full X / y,
# not on the train split.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

In [None]:
model.summary()

## Extra Modeling

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Back to df_featureed (the frame without per-neighbourhood dummy columns) for the decision tree.
X = df_featureed.drop('price', axis=1)
y = df_featureed['price']

# NOTE(review): test_size is 0.3 here but 0.2 in the linear-model splits, so the tree is
# scored on a different test set — confirm this is intended before comparing the metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Shapes of the resulting split parts, keyed by a readable label
split_shapes = {
    "Train X size": X_train.shape,
    "Train y size": y_train.shape,
    "Test X size": X_test.shape,
    "Test y size": y_test.shape
}

split_shapes

In [None]:
# Decision tree regressor with default hyper-parameters; random_state is fixed so repeated
# runs use the same seed. Fitted on the training split only.
decision_tree_reg = DecisionTreeRegressor(random_state=101)
decision_tree_reg.fit(X_train, y_train)

In [None]:
y_pred_tree = decision_tree_reg.predict(X_test)

In [None]:
# Evaluate the tree's predictions against the held-out test targets: mean squared error and R^2.
mse_tree = mean_squared_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

In [None]:
mse_tree

In [None]:
r2_tree