In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

#import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the training set, the test set and the sample submission from the Kaggle input directory.
train_data, test_data, submission_csv = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/home-data-for-ml-course/train.csv',
        '/kaggle/input/home-data-for-ml-course/test.csv',
        '/kaggle/input/home-data-for-ml-course/sample_submission.csv',
    )
)

In [None]:
# Report (rows, columns) for the training frame, then for the test frame.
for frame in (train_data, test_data):
    print(frame.shape)

In [None]:
# Preview the first five training rows (head() returns five rows by default).
print(train_data.head())

In [None]:
# List every column label of the training frame.
print(train_data.columns)

In [None]:
# Collect the labels of every training column whose dtype is neither 'int' nor 'float'.
non_numeric_frame = train_data.select_dtypes(exclude=['int', 'float'])
non_numerical = non_numeric_frame.columns
print(non_numerical)

In [None]:
# Your 1D array
array = np.array(non_numerical)

# Reshape to 2D array with shape (n_samples, 1)
array_reshaped = array.reshape(-1, 1)

print(array_reshaped)

In [None]:
encoder = OneHotEncoder()
encoder.fit(array_reshaped)

In [None]:
encoded_data = encoder.transform(array_reshaped).toarray()

In [None]:
df_encoded = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out
                          (['feature_name']))
print(df_encoded.head(5))

In [None]:
# Remove the prefix 'feature_name_' from the column names
df_encoded.columns = df_encoded.columns.str.replace('feature_name_', '')

# Display the updated DataFrame
print(df_encoded.head())

In [None]:
# Per-column count of missing entries in the encoded frame.
# The one-hot output is expected to contain no missing values.
print(df_encoded.isnull().sum())

In [None]:
# Number of encoded rows that exactly repeat an earlier row.
duplicate_row_count = df_encoded.duplicated().sum()
print(duplicate_row_count)

In [None]:
# Numeric part of the training data: every 'int' or 'float' column.
numerical_columns = train_data.select_dtypes(include=['int', 'float'])
numericals = numerical_columns.columns
print(numerical_columns.head())

In [None]:
# Per-column count of missing entries among the numeric features.
print(numerical_columns.isnull().sum())

In [None]:
# Treat missing numeric entries as 0, then confirm that nothing is left to fill.
numerical_columns = numerical_columns.fillna(value=0)
remaining_missing = numerical_columns.isna().sum()
print(remaining_missing)

In [None]:
# Put the numeric features and the one-hot features side by side, aligned on the row index.
frames_to_join = [numerical_columns, df_encoded]
result = pd.concat(frames_to_join, axis='columns')
print(result.head())

In [None]:
# Map +inf / -inf to NaN first, then fill every NaN with 0.
infinite_values = [np.inf, -np.inf]
result = result.replace(infinite_values, np.nan)
result = result.fillna(0)
print(result)


In [None]:
# Sanity check: warn if any missing or infinite value survived the clean-up above.
has_missing = result.isnull().any().any()
has_infinite = np.isinf(result.select_dtypes(include=[np.number])).any().any()
if has_missing or has_infinite:
    print("Warning: There are still missing or infinite values in the DataFrame.")

print(result.head())

In [None]:
# Per-column count of missing entries in the combined frame.
missing_per_column = result.isna().sum()
print(missing_per_column)

In [None]:
# Pairwise Pearson correlation (the default method) between all columns of the combined frame.
correlation_matrix = result.corr(method='pearson')

In [None]:
# Keep the columns whose correlation with SalePrice is at least 0.3.
saleprice_corr = correlation_matrix['SalePrice']
high_corr_values = saleprice_corr[saleprice_corr >= 0.3]
print(high_corr_values)

In [None]:
# Target is the SalePrice column; the features are every other column of the combined frame.
y = result['SalePrice']
X = result.drop(columns='SalePrice')

In [None]:
# Print the feature matrix X (every column of result except 'SalePrice').
print(X)

In [None]:
# Print the target vector y (the 'SalePrice' column of result).
print(y)

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
pca = PCA()  # By default, PCA keeps all components
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Explained variance ratio
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance)

# Cumulative explained variance
cumulative_explained_variance = np.cumsum(explained_variance)
print("Cumulative Explained Variance:", cumulative_explained_variance)

# Plot explained variance
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by PCA Components')
plt.grid()
plt.show()

In [None]:
# Retain components that explain 95% of the variance
n_components = np.argmax(cumulative_explained_variance >= 0.95) + 1
print(f"Number of components to retain: {n_components}")

# Re-run PCA with the selected number of components
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Example: Split data into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

In [None]:
# Build a 100-tree random forest with a fixed seed and fit it on the training split.
forest_params = {'n_estimators': 100, 'random_state': 42}
rf_regressor = RandomForestRegressor(**forest_params)
rf_regressor.fit(X_train, y_train)

In [None]:
# Predict SalePrice for the held-out rows (X_test) with the fitted forest.
y_pred = rf_regressor.predict(X_test)

In [None]:
# Score the hold-out predictions against the true hold-out prices.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}', f'R^2 Score: {r2}', sep='\n')

In [None]:
# Build the submission from the competition test set, not from the local hold-out
# split: the test rows go through the same feature layout, scaler, PCA and model
# as the training rows, and each prediction is paired with that row's real 'Id'.
test_categorical = test_data.select_dtypes(exclude=['int', 'float']).columns
test_features = (
    # Make missing categories explicit, then one-hot encode the non-numeric columns.
    pd.get_dummies(test_data.fillna({col: 'Missing' for col in test_categorical}))
    # Keep exactly the training feature columns, in training order; training
    # columns absent from the encoded test frame are filled with 0.
    .reindex(columns=X.columns, fill_value=0)
    # Missing numeric entries are treated as 0, as for the training features.
    .fillna(0)
    .astype(float)
)
test_pred = rf_regressor.predict(pca.transform(scaler.transform(test_features)))

output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_pred})
assert len(output) == len(test_data), "submission must contain one row per test row"
output.to_csv('submission.csv', index=False)
