In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the dataset through kagglehub and keep the returned location in a
# module-level variable; the data-loading function later joins this value with
# "Housing.csv" when it is defined.
harishkumardatalab_housing_price_prediction_path = kagglehub.dataset_download('harishkumardatalab/housing-price-prediction')

# Confirmation message so the reader knows the download step finished.
print('Data source import complete.')


# Housing Price Prediction
The data set used in this notebook is housing-price-prediction.
It covers a wide range of features one should look into when buying a house.
The data set consists of 13 columns namely:
1. Price: A numerical value which consists of price of the house
2. Area: A numerical value of the total carpet area of the house
3. Bedrooms : A numerical value which consists of the total number of bedrooms in the house
4. Bathrooms : A numerical value which consists of the total number of bathrooms in the house
5. Stories : A numerical value which tells the total no of floors of the building
6. Mainroad : A boolean value which tells whether the house is near the main road
7. Guestroom : A boolean value which tells whether the house consists of a guest room or not
8. Basement: A boolean value which tells whether the house has a basement or not
9. Hot Water Heating: A boolean value which tells about the hot water supply in the house
10. Air conditioning : A boolean value which tells whether the house is provided with air conditioning or not
11. Parking: A numerical value which tells the number of parking spaces available for the house
12. Prefarea: A boolean value which tells whether the house is located in a preferred area or not
13. furnishingstatus: A string value (furnished/semi-furnished/unfurnished) to know about the furnishing status of the house

# Libraries required
1. numpy for numerical operations
2. pandas for data storage and data manipulation
3. seaborn as an extension for matplotlib for plotting graphs
4. matplotlib for plotting graphs

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 📁 Carga Robusta de Datos

Convertimos el archivo CSV a un DataFrame de pandas usando una función que busca en múltiples ubicaciones posibles para mayor flexibilidad y manejo de errores.

**📝 Nota importante**: Si planeas usar la descarga de Kaggle, ejecuta primero la celda 1 para descargar el dataset y definir la variable de ruta.

In [7]:
import pandas as pd
import os

# Función para cargar datos desde diferentes ubicaciones posibles
def cargar_datos_housing(rutas_adicionales=None):
    """Load the housing CSV into a DataFrame, trying several locations in order.

    Search order:
      1. Any paths given in ``rutas_adicionales`` (checked first).
      2. The two known local Windows paths and ``Housing.csv`` in the
         current working directory.
      3. ``Housing.csv`` inside the Kaggle download directory, when the global
         ``harishkumardatalab_housing_price_prediction_path`` is defined and
         non-empty.
      4. A fresh download through ``kagglehub`` as a last resort.

    Args:
        rutas_adicionales: Optional iterable of extra CSV paths (``str`` or
            ``os.PathLike``) to try before the built-in locations. Defaults to
            ``None``, which keeps the original search behavior.

    Returns:
        The ``pandas.DataFrame`` read from the first location that exists and
        parses successfully, or ``None`` when every location fails.

    Side effects:
        Prints progress/error messages. After a successful Kaggle download the
        download directory is stored in the global
        ``harishkumardatalab_housing_price_prediction_path``.
    """
    nombre_archivo = "Housing.csv"

    # Caller-supplied locations take precedence over the built-in ones.
    rutas_posibles = [os.fspath(ruta) for ruta in (rutas_adicionales or [])]
    rutas_posibles += [
        r"C:\Users\Dell\PyhtonIA\Pandas_Python_IA\kagggle\Housing.csv",
        r"C:\Users\Dell\PyhtonIA\Pandas_Python_IA\kaggle\Housing.csv",
        nombre_archivo,
    ]

    # Add the Kaggle download directory when an earlier cell defined it.
    # .get() avoids a NameError, and the truthiness check avoids passing None
    # (or an empty string) to os.path.join.
    kaggle_dir = globals().get('harishkumardatalab_housing_price_prediction_path')
    if kaggle_dir:
        rutas_posibles.append(os.path.join(kaggle_dir, nombre_archivo))

    # Try every known location; a failing read is reported and skipped so the
    # remaining candidates still get a chance.
    for ruta in rutas_posibles:
        if not os.path.exists(ruta):
            continue
        try:
            data = pd.read_csv(ruta)
        except Exception as e:
            print(f"⚠️ Error al cargar desde {ruta}: {e}")
            continue
        print(f"✅ Datos cargados desde: {ruta}")
        return data

    # Nothing found locally: fall back to downloading from Kaggle.
    print("🔄 Intentando descargar dataset desde Kaggle...")
    try:
        # Imported lazily so the function still works from local files when
        # kagglehub is not installed.
        import kagglehub
        kaggle_path = kagglehub.dataset_download('harishkumardatalab/housing-price-prediction')
        csv_path = os.path.join(kaggle_path, nombre_archivo)

        if os.path.exists(csv_path):
            data = pd.read_csv(csv_path)
            print(f"✅ Datos descargados y cargados desde Kaggle: {csv_path}")
            # Remember the download directory for later calls.
            globals()['harishkumardatalab_housing_price_prediction_path'] = kaggle_path
            return data

        # The download succeeded but the expected file is missing; report it
        # instead of falling through silently.
        print(f"⚠️ Descarga completada, pero no se encontró {nombre_archivo} en: {kaggle_path}")
    except Exception as e:
        print(f"❌ Error al descargar desde Kaggle: {e}")

    print("❌ No se pudo cargar el archivo desde ninguna ubicación")
    print("💡 Asegúrate de que el archivo Housing.csv esté en alguna de estas ubicaciones:")
    for ruta in rutas_posibles:
        print(f"   • {ruta}")
    return None

# Load the dataset
data = cargar_datos_housing()

# Show a short summary and the first rows when loading succeeded
if data is not None:
    print(f"\n📊 Dimensiones del dataset: {data.shape}")
    print(f"📋 Columnas: {list(data.columns)}")
    # A bare `data.head()` inside an `if` block is not the cell's last
    # expression, so Jupyter never renders it; display() shows it explicitly.
    display(data.head())
else:
    print("❌ Error: No se pudieron cargar los datos")

✅ Datos cargados desde: C:\Users\Dell\PyhtonIA\Pandas_Python_IA\kagggle\Housing.csv

📊 Dimensiones del dataset: (545, 13)
📋 Columnas: ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']


# Basic Information

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
data.info()

## 🔎 Insight:
* The dataset has 545 entries (rows)
* There are 13 columns in total

In [None]:
# Summary statistics restricted to the object-dtype columns.
data.describe(include = "object")

In [None]:
# Missing-value count per column, largest first.
data.isnull().sum().sort_values(ascending=False)

# Univariate analysis

Analysing one variable at a time

In [None]:
# Column groups for the univariate plots. The (misspelled) name `feautures`
# is kept as-is because it is a notebook-level variable.
feautures = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
    'furnishingstatus',
]
numerical = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

# One count plot per categorical column, each shown as its own figure.
for categorical_col in feautures:
    ax = sns.countplot(data=data, x=categorical_col, palette='pastel')
    ax.set_title(f'Distribution of {categorical_col}')
    plt.xticks(rotation=45)
    plt.show()

Insight:

* Maximum houses are connected to main road.
* Maximum houses do not have a guestroom.
* Maximum houses do not have a basement.
* Maximum houses do not have hot water heating facility.
* Maximum houses do not have air conditioning facility.
* Maximum houses are not situated in preferred area.
* Maximum houses are semi-furnished.

In [None]:
# Histogram of the area column (missing values dropped) with a KDE overlay.
area_values = data['area'].dropna()
ax = sns.histplot(area_values, kde=True)
ax.set_title('Area distribution')
plt.show()

In [None]:
# Count plots for every numeric column except 'area', which is plotted
# separately as a histogram.
discrete_cols = [name for name in numerical if name != 'area']
for discrete_col in discrete_cols:
    ax = sns.countplot(data=data, x=discrete_col, palette='pastel')
    ax.set_title(f'Distribution of {discrete_col}')
    plt.show()

# Insights
* Most of the houses lie in the range of 3000-7000 sq.ft
* Most of the houses have 3 bedrooms
* Most of the houses have 2 stories
* Most of the houses have 1 bathroom
* Most of the houses have no parking spaces

# Bivariate analysis

Analysing the data using the relationship between two variables

In [None]:
# Histogram of price (missing values dropped), 30 bins, with a KDE overlay.
price_values = data['price'].dropna()
ax = sns.histplot(price_values, bins=30, kde=True)
ax.set(title='Price Distribution', xlabel='Price', ylabel='Frequency')
plt.show()

In [None]:
# Scatter plot of price against area.
ax = sns.scatterplot(data=data, x='area', y='price')
ax.set(title='Area vs Price', xlabel='Area (sq.ft)', ylabel='Price')
plt.show()

# Insights
* Many houses are clustered in the cheap-to-moderate price range
* Beyond that range, the prices of the houses increase rapidly because they depend on multiple features

In [None]:
# Parking counts, split by the mainroad column via hue.
ax = sns.countplot(data=data, x='parking', hue='mainroad', palette='pastel')
ax.set_title('Parking vs main road')
plt.show()

# Insights
* Most of the houses, irrespective of their proximity to the main road, do not have parking spaces.
* The graph clearly shows that houses nearer to the main road have more parking spaces available.

# Outlier Detection

Using boxplots to detect Outliers

In [None]:
# Box plot of price to visualise outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=data['price'], ax=ax)
ax.set(title='Price Distribution - Outlier Detection', xlabel='Price')
plt.show()

# Outlier statistics: rows strictly outside the 1.5 * IQR fences.
price_col = data['price']
Q1, Q3 = price_col.quantile(0.25), price_col.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
outside_fences = (price_col < lower_bound) | (price_col > upper_bound)
outliers = data[outside_fences]
outlier_pct = len(outliers) / len(data) * 100
print(f"📊 Outliers en Price: {len(outliers)} de {len(data)} observaciones ({outlier_pct:.1f}%)")

In [None]:
# Box plot of area to visualise outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=data['area'], ax=ax)
ax.set(title='Area Distribution - Outlier Detection', xlabel='Area (sq.ft)')
plt.show()

# Outlier statistics: rows strictly outside the 1.5 * IQR fences.
area_col = data['area']
Q1, Q3 = area_col.quantile(0.25), area_col.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
outside_fences = (area_col < lower_bound) | (area_col > upper_bound)
outliers = data[outside_fences]
outlier_pct = len(outliers) / len(data) * 100
print(f"📊 Outliers en Area: {len(outliers)} de {len(data)} observaciones ({outlier_pct:.1f}%)")

# Correlation Analysis

Analysing the bond between two features

In [None]:
# Work on a copy so the original frame keeps its string categories.
copy = data.copy()

# Encode the yes/no columns as 1/0.
binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
yes_no_codes = {'yes': 1, 'no': 0}
copy = copy.assign(**{name: copy[name].map(yes_no_codes) for name in binary_cols})

# Encode furnishing status as an ordinal 0-2 scale.
furnishing_codes = {'furnished': 2, 'semi-furnished': 1, 'unfurnished': 0}
copy['furnishingstatus'] = copy['furnishingstatus'].map(furnishing_codes)

# Annotated heatmap of the pairwise correlations between numeric columns.
corr_matrix = copy.corr(numeric_only=True)
ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
ax.set_title('Correlation Heatmap')
plt.show()


# Insights
* Price shows a strong positive correlation with area (0.54), bathrooms (0.52), and air conditioning (0.45), while it has a noticeable negative correlation with the basement feature (-0.19).

* Area is moderately linked to parking (0.35) and price (0.54), whereas the remaining variables exhibit generally weak interrelationships.

* There are no strong negative correlations in the dataset—most associations are either weak or positively inclined, with the exception of the negative link between basement and price.


# Multivariate Analysis

Analysing one variable against multiple variables at the same time

In [None]:
# Pairwise scatter/histogram grid over the numeric columns.
features = [
    "price",
    "area",
    "bedrooms",
    "bathrooms",
    "stories",
    "parking",
]
numeric_view = data[features]
sns.pairplot(numeric_view)
plt.show()

# Conclusion
* House prices are primarily influenced by the area, with secondary contributions from factors such as bathrooms, air conditioning, and parking.
* The data reveals noticeable outliers in both price and area, while the majority of homes feature modest attributes like 2–4 bedrooms, 1–2 bathrooms, and limited parking.
* Categorical variables exhibit weak correlations with price, emphasizing the greater impact of numeric features in determining a property's value.

