In [None]:
# Data Preparation class 

# Explore the data set 

def analyze_data(file_path):
    """Load a CSV file and print a quick exploratory summary of it.

    The summary consists of the first and last rows, transposed descriptive
    statistics, the ``DataFrame.info()`` report, per-column missing-value
    counts, a missingno bar chart and the number of fully duplicated rows.

    Parameters:
    file_path: path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
    DataFrame: the loaded data, unmodified.
    """
    df = pd.read_csv(file_path)

    # First and last rows
    print(df.head())
    print(df.tail())

    # Descriptive statistics, one row per column
    print(df.describe().transpose())

    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would add a stray "None" line to the output.
    df.info()

    # Missing values per column, as numbers and as a bar chart
    print(df.isnull().sum())
    msno.bar(df)

    # Rows that are exact duplicates of an earlier row
    print(f"Number of duplicate rows = {df.duplicated().sum()}")

    return df

# Load 'kc_house_data.csv', print its exploratory summary and keep the
# returned DataFrame as `df`; the statements below all operate on `df`.
df = analyze_data('kc_house_data.csv')


# Function to inspect each column for unique values
def inspect_columns(df):
    """Print each column name followed by the distinct values found in it."""
    for name in df.columns:
        distinct_values = df[name].unique()
        print(f"{name}: \n{distinct_values}\n")

# Print the distinct values of every column of `df`
inspect_columns(df)


In [None]:

# Data preprocessing step (TODO: wrap in a class inheriting from the Data Preparation class)

def convert_date_column_and_sqft_basement(df, date_column):
    """Return a copy of ``df`` with a parsed date column and numeric 'sqft_basement'.

    Parameters:
    df (DataFrame): frame holding ``date_column`` and a 'sqft_basement' column.
    date_column (str): name of the column whose values are month/day/year
        strings (format ``%m/%d/%Y``).

    Returns:
    DataFrame: a new frame; the frame passed in is left untouched, so calling
    the function again with the same input gives the same result.

    Raises:
    ValueError: if a value in ``date_column`` does not match ``%m/%d/%Y``.
    KeyError: if ``date_column`` or 'sqft_basement' is missing.
    """
    converted = df.copy()
    converted[date_column] = pd.to_datetime(converted[date_column], format='%m/%d/%Y')
    # errors='coerce' turns entries that are not numbers into NaN instead of raising.
    converted['sqft_basement'] = pd.to_numeric(converted['sqft_basement'], errors='coerce')
    return converted

# Parse the 'date' column as datetimes and make 'sqft_basement' numeric
# (entries that are not numbers become NaN)
df = convert_date_column_and_sqft_basement(df, 'date')


def clean_data(df):
    """Return a cleaned copy of ``df`` with no missing values in the key columns.

    Two steps, in this order:

    1. 'yr_renovated' values that are missing or 0 are replaced by the row's
       'yr_built' value.
    2. Rows still missing a value in any of the required columns are dropped.

    The fill has to happen before the drop: 'yr_renovated' is itself one of
    the required columns, so dropping first would discard exactly the rows
    whose missing renovation year is supposed to be filled.

    Parameters:
    df (DataFrame): frame containing every column listed in ``required_columns``.

    Returns:
    DataFrame: a new frame; the frame passed in is not modified.

    Raises:
    KeyError: if one of the required columns is absent.
    """
    required_columns = ['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
                        'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement',
                        'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']

    # Step 1: missing or 0 renovation year -> construction year.
    renovated_year = df['yr_renovated']
    needs_fill = renovated_year.isna() | (renovated_year == 0)
    filled_year = renovated_year.mask(needs_fill, df['yr_built'])

    # Step 2: assign() builds a new frame, so the caller's data is left alone
    # and nothing is written into a slice of another frame.
    cleaned = df.assign(yr_renovated=filled_year).dropna(subset=required_columns)

    return cleaned

# Use the function to clean the data
df = clean_data(df)

# Check for missing values in the dataframe.
# This statement is not the last one in its cell, so a bare expression would
# not be displayed; print the counts explicitly.
print(df.isna().sum())

def add_new_columns(df):
    """Return a copy of ``df`` extended with sale-date and age features.

    Columns added:
    - 'year_sold', 'month_sold': year and month taken from 'date'.
    - 'house_age': 'year_sold' minus 'yr_built'.
    - 'renovation_age': 'year_sold' minus 'yr_renovated'; a missing
      'yr_renovated' counts as the sale year, giving 0.
    - 'renovated': 'Yes' when 'yr_renovated' is later than 'yr_built',
      otherwise 'No' (including when 'yr_renovated' is missing).
    - 'season': 'Winter' (Dec-Feb), 'Spring' (Mar-May), 'Summer' (Jun-Aug)
      or 'Fall' (Sep-Nov), derived from 'month_sold'.

    Parameters:
    df (DataFrame): frame with 'date', 'yr_built' and 'yr_renovated' columns.

    Returns:
    DataFrame: a new frame; the frame passed in is not modified.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
    }

    enriched = df.copy()

    # to_datetime is a no-op for a column that is already datetime and parses
    # it otherwise, so both kinds of input are accepted.
    sold_on = pd.to_datetime(enriched['date'])
    enriched['year_sold'] = sold_on.dt.year
    enriched['month_sold'] = sold_on.dt.month

    # Age of the house when sold
    enriched['house_age'] = enriched['year_sold'] - enriched['yr_built']

    # Years between the renovation and the sale
    enriched['renovation_age'] = enriched['year_sold'] - enriched['yr_renovated'].fillna(enriched['year_sold'])

    # A comparison against NaN is False, so a missing renovation year maps to 'No'.
    was_renovated = (enriched['yr_renovated'] - enriched['yr_built']) > 0
    enriched['renovated'] = was_renovated.map({True: 'Yes', False: 'No'})

    enriched['season'] = enriched['month_sold'].map(season_by_month)

    return enriched

# Add the derived columns (year_sold, month_sold, house_age, renovation_age,
# renovated, season) to `df`
df = add_new_columns(df)



def drop_columns(df):
    """Return ``df`` without its 'id', 'date' and 'zipcode' columns."""
    unwanted = ['id', 'date', 'zipcode']
    return df.drop(columns=unwanted)

# Drop columns
df = drop_columns(df)

# Preview the result. This statement is not the last one in its cell, so a
# bare `df` expression would not be displayed; print the first rows instead.
print(df.head())


def create_box_plots(df):
    """Draw one vertical box plot per numeric column on a 3x4 grid of axes."""
    numeric_columns = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']
    fig, axs = plt.subplots(3, 4, figsize=(15, 10))

    # Row-major flat view of the grid: panel k sits at row k // 4, column k % 4.
    panels = axs.flatten()

    for panel, column in zip(panels, numeric_columns):
        sns.boxplot(y=df[column], ax=panel)
        panel.set_title(column)

    # The grid has 12 slots; delete those left without a plot.
    for spare_panel in panels[len(numeric_columns):]:
        fig.delaxes(spare_panel)

    plt.tight_layout()

# Draw the box plots for the current `df`
create_box_plots(df)







In [None]:
# EDA (exploratory data analysis) section

# Univariate distributions

# Categorical features
def plot_features(df):
    """Plot the categorical features of ``df`` on a 2x3 grid.

    Top row: count plots for 'view', 'condition' and 'grade'.
    Bottom row: pie charts for 'waterfront' and 'renovated'; the sixth
    slot of the grid is removed.
    """
    count_features = ['view', 'condition', 'grade']
    share_features = ['waterfront', 'renovated']

    fig, axs = plt.subplots(2, 3, figsize=(20, 10))
    top_row, bottom_row = axs

    # Count plots with vertical tick labels
    for ax, feature in zip(top_row, count_features):
        sns.countplot(x=feature, data=df, ax=ax)
        ax.set_title(feature)
        ax.tick_params(axis='x', rotation=90)

    # Pie charts showing each value's share as a percentage
    for ax, feature in zip(bottom_row, share_features):
        shares = df[feature].value_counts()
        shares.plot(kind='pie', ax=ax, autopct='%1.1f%%')
        ax.set_title(feature)
        ax.set_ylabel('')

    # Only two pies are drawn, so the last slot of the bottom row is unused.
    fig.delaxes(bottom_row[2])

    plt.tight_layout()
    plt.show()

# Plot the categorical features of `df` (count plots and pie charts)
plot_features(df)

# discrete numerical features


# Discrete features to plot, one panel each
features = ['bedrooms', 'bathrooms', 'floors', 'year_sold', 'month_sold', 'season']

# 2x3 grid, flattened so the panels can be paired with the features in order
fig, axs = plt.subplots(2, 3, figsize=(20, 15))
axs = axs.flatten()

for ax, feature in zip(axs, features):
    if feature != 'year_sold':
        # Count plot with the categories in ascending order
        order = sorted(df[feature].unique())
        sns.countplot(data=df, x=feature, order=order, ax=ax)
        ax.set_title(f'Distribution of {feature}')
        # Vertical tick labels for readability
        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(90)
        # 'bathrooms' gets only every other label: even positions shown, odd hidden
        if feature == 'bathrooms':
            for position, tick_label in enumerate(ax.get_xticklabels()):
                tick_label.set_visible(position % 2 == 0)
    else:
        # 'year_sold' is drawn as a pie chart of the per-year counts
        df[feature].value_counts().sort_index().plot(kind='pie', ax=ax, autopct='%1.1f%%')
        ax.set_title(f'Distribution of {feature}')
        ax.set_ylabel('')

# Adjust the layout
plt.tight_layout()
plt.show()


# continuous numerical features



# Continuous features to plot, one histogram each
features = ['price', 'sqft_living', 'sqft_lot', 'long', 'sqft_living15', 'sqft_lot15', 'house_age', 'renovation_age']

# 2x4 grid, flattened so the panels can be paired with the features in order
fig, axs = plt.subplots(2, 4, figsize=(20, 10))
axs = axs.flatten()

for ax, feature in zip(axs, features):
    # 10-bin histogram of the feature
    sns.histplot(data=df, x=feature, bins=10, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    # Slanted tick labels for readability
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)

# Adjust the layout
plt.tight_layout()
plt.show()


# Bivariate analysis 


def plot_scatter(df, y_var):
    """Scatter-plot ``y_var`` against every other feature in a 5-column grid.

    Parameters:
    df (DataFrame): frame containing ``y_var`` and the columns listed below.
    y_var (str): column drawn on the y axis of every panel; it is skipped
        as an x variable.
    """
    features = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
                'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15',
                'house_age', 'renovation_age', 'renovated', 'year_sold', 'month_sold', 'season']

    # Rows needed for len(features) - 1 panels (y_var is not plotted against
    # itself), rounded up to whole rows of 5.
    full_rows, leftover = divmod(len(features) - 1, 5)
    num_rows = full_rows + 1 if leftover else full_rows

    fig, axs = plt.subplots(num_rows, 5, figsize=(20, 4 * num_rows))
    axs = axs.flatten()

    # Every feature except the y variable gets the next free panel.
    x_features = [name for name in features if name != y_var]
    for ax, feature in zip(axs, x_features):
        sns.scatterplot(data=df, x=feature, y=y_var, ax=ax)
        ax.set_title(f'{y_var} vs {feature}')
        # Vertical tick labels for readability
        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(90)

    # Delete the panels left over at the end of the grid.
    for unused_ax in axs[len(x_features):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()


# dependent vs independent variables: price against every other feature
# (this call previously plotted 'sqft_living', duplicating the call below)

plot_scatter(df, 'price')

# sqft_living vs all variables

plot_scatter(df, 'sqft_living')

# house age vs all variables

plot_scatter(df, 'house_age')

# bedrooms vs all variables

plot_scatter(df, 'bedrooms')


#  multivariate analysis


def plot_price_map(df, max_price=2000000):
    """
    Scatter-plot house locations (longitude vs latitude) coloured by sale price.

    Parameters:
    df (DataFrame): The DataFrame containing the real estate data. It should have 'price', 'lat', and 'long' columns.
    max_price (float): Exclusive upper bound on price; only rows with a price strictly below it are plotted.
        Defaults to 2,000,000.

    Raises:
    ValueError: If no row has a price below ``max_price``.
    """
    # Keep only the houses sold for less than the cap
    filtered_data = df[df["price"] < max_price]
    if filtered_data.empty:
        # Fail before a figure is created, with a message that names the cause.
        raise ValueError(f"No rows with price below {max_price:,}; nothing to plot.")

    # Price extremes of the plotted houses, used to place the colorbar ticks
    lowest_price = filtered_data["price"].min()
    highest_price = filtered_data["price"].max()
    price_span = highest_price - lowest_price

    # Large figure to accommodate the detailed plot
    plt.figure(figsize=(50, 35))

    # One point per house. 'RdYlGn_r' is the reversed red-yellow-green map,
    # so low prices are drawn green and high prices red.
    plt.scatter(filtered_data["long"], filtered_data["lat"], c=filtered_data["price"], cmap='RdYlGn_r')

    # Colorbar showing the mapping from colors to prices
    cbar = plt.colorbar()
    cbar.set_label("Price", rotation = 90, size = 50)

    # Five ticks at 0%, 25%, 50%, 75% and 100% of the price range. These are
    # evenly spaced positions on the scale, not percentiles of the data.
    # The explicit dollar labels replace the default numeric tick formatting.
    tick_values = [lowest_price,
                   *(lowest_price + price_span * fraction for fraction in (0.25, 0.5, 0.75)),
                   highest_price]
    cbar.set_ticks(tick_values)
    cbar.set_ticklabels(['${:,.0f}'.format(value) for value in tick_values])

    # Fixed axis ticks every 0.1 degree: longitude -122.5 to -121.5, latitude 47.2 to 47.8
    plt.xticks(np.arange(-122.5, -121.5 + 0.1, 0.1), rotation = 50, size = 30)
    plt.yticks(np.arange(47.2, 47.8 + 0.1, 0.1), rotation = 30, size = 30)

    # Title and axis labels
    plt.title('Real Estate Prices by Location', size = 75)
    plt.xlabel('Longitude', size = 50)
    plt.ylabel('Latitude', size = 50 )

    # Show the plot
    plt.show()

# Draw the price-by-location map for `df`
plot_price_map(df) 
