In [3]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.impute import KNNImputer

def explore_data_statistics(df):
    """Print the descriptive statistics of ``df``.

    Writes a "Summary Statistics:" header followed by the text rendering of
    ``df.describe()`` to standard output. Nothing is returned and ``df`` is
    not modified.
    """
    summary = df.describe()
    # One print call with a newline separator emits the same two lines
    # (header, then table) as two consecutive prints would.
    print("Summary Statistics:", summary, sep="\n")

def correlation_analysis(df):
    """Print the pairwise correlation matrix of the numeric columns of ``df``.

    Only columns whose dtype is a NumPy number are considered. The matrix is
    written to standard output under a "Correlation Matrix:" header (preceded
    by a blank line). Nothing is returned.
    """
    pairwise_corr = df.select_dtypes(include=[np.number]).corr()
    print("\nCorrelation Matrix:", pairwise_corr, sep="\n")

def handle_missing_values(df):
    """Report the number of missing values in each column of ``df``.

    This function only prints the per-column null counts under a
    "Missing Values:" header (preceded by a blank line); it does not fill or
    drop anything, does not modify ``df``, and returns nothing.
    """
    null_counts = df.isna().sum()
    print("\nMissing Values:", null_counts, sep="\n")

def detect_outliers(df, threshold=3):
    """Drop every row that holds a Z-score outlier in any numeric column.

    Z-scores are computed column by column over the ``int64``/``float64``
    columns of ``df``. Missing values are left out of each column's mean and
    standard deviation (``nan_policy='omit'``); with the default policy a
    single NaN turns the whole column's scores into NaN, so no outlier in that
    column would ever be flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    threshold : float, default 3
        A row is removed when ``abs(z) > threshold`` for at least one of its
        numeric cells.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the surviving rows with their original index
        labels. Non-numeric columns are carried along unchanged.
    """
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    if len(numerical_cols) == 0:
        # Nothing to score; hand back an independent copy of the input.
        return df.copy()

    values = df[numerical_cols].to_numpy(dtype=float)
    z_scores = np.asarray(
        stats.zscore(values, axis=0, nan_policy='omit'), dtype=float
    )

    # NaN scores (missing cells, constant columns) compare False and are
    # therefore never treated as outliers.
    is_outlier_row = (np.abs(z_scores) > threshold).any(axis=1)

    # Select by boolean position rather than dropping by label: the row
    # numbers found above are positions, which only coincide with the index
    # labels when the frame has a default RangeIndex.
    return df.loc[~is_outlier_row].copy()

def impute_missing_values(df, n_neighbors=5):
    """Impute missing numeric values with a k-nearest-neighbours imputer.

    Only the ``int64``/``float64`` columns of ``df`` are processed and
    returned; columns of any other dtype are not part of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    n_neighbors : int, default 5
        Number of neighbours passed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        Float-valued frame with the numeric columns of ``df`` in their
        original order and with the same index as ``df``, so rows stay
        aligned with the input (e.g. after rows were dropped upstream).
        Columns that contain no observed value at all cannot be imputed and
        are returned as all-NaN.
    """
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    numeric = df[numerical_cols]

    # KNNImputer leaves out features that have no observed values, which would
    # make its output narrower than `numerical_cols`; impute only the columns
    # that have at least one value and restore the rest afterwards.
    has_observed = numeric.notna().any(axis=0).to_numpy()
    observed_cols = numerical_cols[has_observed]
    if len(observed_cols) == 0:
        # Covers frames with no rows, no numeric columns, or only empty ones.
        return numeric.astype(float)

    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed = pd.DataFrame(
        imputer.fit_transform(numeric[observed_cols]),
        columns=observed_cols,
        index=df.index,
    )
    # Re-insert the all-NaN columns (as NaN) in their original positions.
    return df_imputed.reindex(columns=numerical_cols)

def feature_engineering(df):
    """Return a copy of ``df`` with ``log1p`` applied to its numeric columns.

    The ``int64``/``float64`` columns are replaced by ``log(1 + x)``; all
    other columns are carried over unchanged. The input frame is left
    untouched, so calling this function twice on the same input gives the
    same result instead of transforming the data twice.

    Note that ``log1p`` yields ``-inf`` for ``-1`` and NaN for values below
    ``-1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    df_transformed = df.copy()
    if len(numerical_cols) > 0:
        df_transformed[numerical_cols] = np.log1p(df[numerical_cols])
    return df_transformed

# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv("HousePrices-train.csv")

# Perform preprocessing steps
# The first three calls only print reports; they do not change `df`.
explore_data_statistics(df)
correlation_analysis(df)
handle_missing_values(df)
# From here on `df` is rebound at every step, so the raw frame is no longer
# available afterwards and the order of the calls matters.
# Drop the rows flagged as Z-score outliers.
df = detect_outliers(df)
# Keeps only the int64/float64 columns, with missing values imputed;
# non-numeric columns are not present in `df` after this line.
df = impute_missing_values(df)
# Apply log1p to the numeric columns.
df = feature_engineering(df)

# Save the preprocessed data to a CSV file (index not written)
# NOTE(review): the next cell (In [4]) writes to this same file name and will
# overwrite this output - confirm that is intended.
df.to_csv('preprocessed_dataregression.csv', index=False)


Summary Statistics:
                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000  ...   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726  ...   
std       1.112799    30.202904     20.645407   181.066

In [4]:
# This cell re-imports what it needs and reloads the raw CSV so that it can be
# run on its own; numpy is imported here because np.log1p is used below.
import numpy as np
import pandas as pd

# Load the dataset
df = pd.read_csv("HousePrices-train.csv")

# Handle missing values
# For numerical columns, impute missing values with the column median
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# For categorical columns, impute missing values with the column mode
# (.mode() can return several rows on ties; .iloc[0] takes the first one)
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Encode categorical variables using one-hot encoding
df = pd.get_dummies(df, columns=categorical_cols)

# Feature engineering: apply log1p to every originally-numerical column in a
# single vectorized assignment (the one-hot columns are left as they are)
df[numerical_cols] = np.log1p(df[numerical_cols])

# Save the preprocessed data to a CSV file (index not written)
df.to_csv('preprocessed_dataregression.csv', index=False)
