In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [3]:
# Read the training and test CSV files (in that order) from the working directory
train, test = map(pd.read_csv, ('train_house_kaggle.csv', 'test_house_kaggle.csv'))

In [4]:
# Show the first rows of the training set followed by its summary statistics
print(train.head(), train.describe(), sep='\n')

# Count missing values per column and list only the columns that have any
missing_values = train.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [5]:
# Split the training frame into features and target; 'Id' is dropped together
# with the target so it is not fed to the model
X = train.drop(columns=['SalePrice', 'Id'])
y = train['SalePrice']

# Keep the test ids for the submission file and strip the column from the test
# features. Guarded so that re-running this cell after 'Id' has already been
# removed keeps the previously saved ids instead of raising a KeyError.
if 'Id' in test.columns:
    test_ids = test['Id']
    test = test.drop(columns='Id')

# Numeric columns: fill gaps with the median, then standardise.
# 'number' matches every numeric dtype, not only int64/float64.
numeric_features = X.select_dtypes(include='number').columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Taking everything that is not numeric guarantees that no column is
# silently left out of both transformers, whatever dtype the text columns use.
categorical_features = X.select_dtypes(exclude='number').columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories that appear only at prediction time are ignored instead of raising
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns through its own preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Full model: preprocessing followed by a random forest with a fixed seed
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])

In [6]:
# Hold out 20% of the rows for validation (seeded so the split is repeatable)
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = split

# Fit the pipeline on the training part and predict the held-out part
y_pred = model.fit(X_train, y_train).predict(X_valid)

# Report the root mean squared error on the validation set
mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

RMSE: 28432.11190271164


In [7]:
# Run the fitted pipeline on the test features
predictions = model.predict(test)

# Pair every test id with its predicted price and write the submission CSV
submission_columns = {'Id': test_ids, 'SalePrice': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Submission file created!")

Submission file created!


In [10]:
# Destination for an extra copy of the submission file. The path is derived
# from the current user's home directory instead of a hardcoded drive and user
# name, so the cell also runs on other machines.
file_path = Path.home() / 'Downloads' / 'submission.csv'

# Create the target folder if it does not exist yet; to_csv would fail otherwise
file_path.parent.mkdir(parents=True, exist_ok=True)

# Write the CSV without the DataFrame index
output.to_csv(file_path, index=False)

# Confirm where the file was written
print(f"Submission file created at {file_path}")

Submission file created at D:\Users\pc\Downloads\submission.csv
