<a href="https://colab.research.google.com/github/galitneu/auto-eda-tool/blob/main/%D7%A9%D7%90%D7%A8%D7%99%D7%95%D7%AA_%D7%97%D7%99%D7%A9%D7%95%D7%91_%D7%A2%D7%9D_%D7%90%D7%99%D7%A0%D7%A4%D7%9C%D7%A6%D7%99%D7%94_%D7%A2%D7%95%D7%A0%D7%AA%D7%99%D7%95%D7%AA_2_%D7%9E%D7%95%D7%93%D7%9C%D7%99%D7%9D_%D7%A9%D7%9C_%D7%A8%D7%92%D7%A8%D7%A1%D7%99%D7%94_%D7%9E%D7%93%D7%99%D7%A0%D7%94_%D7%9E%D7%95%D7%93%D7%9C_%D7%9C%D7%95%D7%9E%D7%93_%D7%98%D7%A2%D7%95%D7%99%D7%95%D7%AA_9019.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- שלב 0: ייבוא כל הספריות הנדרשות ---
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from google.colab import drive

# --- שלב 1: הגדרות וטעינת נתונים ---
print("--- שלב 1: טוען נתונים ---")
drive.mount('/content/drive', force_remount=True)
DRIVE_PATH = '/content/drive/MyDrive/KaggleProject/'

try:
    df_train_raw = pd.read_csv(f'{DRIVE_PATH}Train.csv', low_memory=False, parse_dates=['saledate'])
    df_valid_raw = pd.read_csv(f'{DRIVE_PATH}Valid.csv', low_memory=False, parse_dates=['saledate'])
    print("נתונים נטענו בהצלחה.")
except FileNotFoundError:
    print(f"שגיאה: ודא שהקבצים 'Train.csv' ו-'Valid.csv' נמצאים בתיקייה: {DRIVE_PATH}")
    exit()

# --- שלב 2: עיבוד נתונים והנדסת מאפיינים (מקוצר - מכיל את כל השיפורים) ---
print("\n--- שלב 2: מבצע עיבוד נתונים והנדסת מאפיינים ---")
# (הערה: זהו אותו קוד עיבוד מהפעם הקודמת, הוא רק מוצג כאן בצורה מקוצרת)
df_train_raw = df_train_raw[df_train_raw['saledate'].dt.year >= 2000].copy()
df_valid_raw = df_valid_raw[df_valid_raw['saledate'].dt.year >= 2000].copy()
cpi_data_real = {2000: 172.2, 2001: 177.1, 2002: 179.9, 2003: 184.0, 2004: 188.9, 2005: 195.3, 2006: 201.6, 2007: 207.3, 2008: 215.3, 2009: 214.5, 2010: 218.1, 2011: 224.9, 2012: 229.6}
ADJUSTMENT_YEAR = 2012
inflation_multiplier = {year: cpi_data_real[ADJUSTMENT_YEAR] / cpi for year, cpi in cpi_data_real.items()}
df_train_raw['SalePrice_adj'] = df_train_raw.apply(lambda row: row['SalePrice'] * inflation_multiplier.get(row['saledate'].year, 1), axis=1)
train_original_prices = df_train_raw[['SalePrice', 'SalePrice_adj']].copy()
df_train_raw['source'] = 'train'
df_valid_raw['source'] = 'valid'
df_combined_temp = pd.concat([df_train_raw.drop(['SalePrice', 'SalePrice_adj'], axis=1), df_valid_raw], ignore_index=True)
min_date = df_combined_temp['saledate'].min()
for df in [df_combined_temp]:
    df['saleYear'] = df['saledate'].dt.year
    df['saleMonth'] = df['saledate'].dt.month
    df['DayOfYear'] = df['saledate'].dt.dayofyear
    df['DaysFromStart'] = (df['saledate'] - min_date).dt.days
df_train_raw = df_combined_temp[df_combined_temp['source'] == 'train'].drop('source', axis=1).copy()
df_valid_raw = df_combined_temp[df_combined_temp['source'] == 'valid'].drop('source', axis=1).copy()
df_train_raw = pd.concat([df_train_raw.reset_index(drop=True), train_original_prices.reset_index(drop=True)], axis=1)
early_season_train_df = df_train_raw[df_train_raw['saleMonth'].isin([1, 2, 3, 4])].copy()
rest_of_year_train_df = df_train_raw[~df_train_raw['saleMonth'].isin([1, 2, 3, 4])].copy()
trend_model_early = LinearRegression()
trend_model_early.fit(early_season_train_df[['DaysFromStart']], early_season_train_df['SalePrice_adj'])
trend_model_rest = LinearRegression()
trend_model_rest.fit(rest_of_year_train_df[['DaysFromStart']], rest_of_year_train_df['SalePrice_adj'])
def predict_seasonal_trend(df):
    early_mask = df['saleMonth'].isin([1, 2, 3, 4])
    predictions = pd.Series(index=df.index, dtype=float)
    if early_mask.sum() > 0:
        predictions.loc[early_mask] = trend_model_early.predict(df.loc[early_mask, ['DaysFromStart']])
    if (~early_mask).sum() > 0:
        predictions.loc[~early_mask] = trend_model_rest.predict(df.loc[~early_mask, ['DaysFromStart']])
    return predictions
df_train_raw['SalePrice_Trend'] = predict_seasonal_trend(df_train_raw)
df_valid_raw['SalePrice_Trend'] = predict_seasonal_trend(df_valid_raw)
df_train_raw['SalePrice_Residual'] = df_train_raw['SalePrice_adj'] - df_train_raw['SalePrice_Trend']
train_labels_residual = df_train_raw['SalePrice_Residual'].copy()
df_train_proc = df_train_raw.drop(['SalePrice', 'SalePrice_adj', 'SalePrice_Trend', 'SalePrice_Residual', 'saledate', 'SalesID', 'MachineID'], axis=1)
df_train_proc['source'] = 'train'
df_valid_proc = df_valid_raw.drop(['SalePrice_Trend', 'saledate', 'SalesID', 'MachineID'], axis=1)
df_valid_proc['source'] = 'valid'
df_combined = pd.concat([df_train_proc, df_valid_proc], ignore_index=True, sort=False)
year_made_by_model = df_combined.groupby('fiModelDesc')['YearMade'].median().astype(int)
rows_to_fix_idx = df_combined[df_combined['YearMade'] == 1000].index
imputed_years = df_combined.loc[rows_to_fix_idx, 'fiModelDesc'].map(year_made_by_model)
global_median_year = df_combined.loc[df_combined['YearMade'] != 1000, 'YearMade'].median()
imputed_years = imputed_years.fillna(global_median_year)
df_combined.loc[rows_to_fix_idx, 'YearMade'] = imputed_years.values
df_combined['machineAge'] = df_combined['saleYear'] - df_combined['YearMade']
df_combined['MachineHoursCurrentMeter_is_missing'] = df_combined['MachineHoursCurrentMeter'].isnull() | (df_combined['MachineHoursCurrentMeter'] == 0)
df_combined['MachineHoursCurrentMeter'].replace(0, np.nan, inplace=True)
imputed_hours = df_combined.groupby('fiModelDesc')['MachineHoursCurrentMeter'].transform(lambda x: x.fillna(x.median()))
df_combined['MachineHoursCurrentMeter'] = imputed_hours
if df_combined['MachineHoursCurrentMeter'].isnull().sum() > 0:
    df_combined['MachineHoursCurrentMeter'].fillna(df_combined['MachineHoursCurrentMeter'].median(), inplace=True)
if df_combined['auctioneerID'].isnull().sum() > 0:
    df_combined['auctioneerID_is_missing'] = df_combined['auctioneerID'].isnull()
    df_combined['auctioneerID'] = df_combined['auctioneerID'].fillna(df_combined['auctioneerID'].median())
source_col = df_combined['source']
df_combined = df_combined.drop('source', axis=1)
for col_name in df_combined.columns:
    if pd.api.types.is_object_dtype(df_combined[col_name]):
        df_combined[col_name] = df_combined[col_name].fillna('missing').astype('category').cat.codes
df_combined['source'] = source_col
print("עיבוד הנתונים הושלם.")

# --- שלב 3: אימון מודל ראשי ומודל תיקון שגיאות ---
print("\n" + "="*50)
print("--- שלב 3: אימון מודל ראשי ומודל מתקן שגיאות ---")

# 3.1 הכנת נתונים ופיצול עונתי
df_train_processed = df_combined[df_combined['source'] == 'train'].drop('source', axis=1).copy()
df_train_processed['SalePrice_Residual'] = train_labels_residual.values
df_train_processed['SalePrice_adj'] = df_train_raw['SalePrice_adj'].values
df_train_processed['saledate'] = df_train_raw['saledate']
train_time_split = df_train_processed[df_train_processed['saledate'] < '2011-01-01'].copy()
validation_mask = (df_train_processed['saledate'] >= '2011-01-01') & (df_train_processed['saledate'] < '2011-05-01')
val_time_split = df_train_processed[validation_mask].copy()
X_train_time = train_time_split.drop(['SalePrice_Residual', 'SalePrice_adj', 'saledate'], axis=1)
y_train_time_residual = train_time_split['SalePrice_Residual']
X_val_time = val_time_split.drop(['SalePrice_Residual', 'SalePrice_adj', 'saledate'], axis=1)
y_val_actual_price = val_time_split['SalePrice_adj']
val_time_trend = df_train_raw.loc[val_time_split.index, 'SalePrice_Trend'].values
X_val_time = X_val_time[X_train_time.columns]

# 3.2 אימון המודל הראשי (initial_model)
print("\nמאמן מודל ראשי על נתוני 2000-2010...")
user_params = {'n_estimators': 150, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 4, 'max_features': 0.5, 'n_jobs': -1, 'random_state': 42}
initial_model = RandomForestRegressor(**user_params)
initial_model.fit(X_train_time, y_train_time_residual)

# 3.3 חישוב השגיאות של המודל הראשי על סט הוולידציה
initial_val_preds_residual = initial_model.predict(X_val_time)
initial_val_preds_full_price = val_time_trend + initial_val_preds_residual
errors_on_validation = y_val_actual_price - initial_val_preds_full_price

# 3.4 אימון מודל לתיקון שגיאות
print("\nמאמן מודל לתיקון שגיאות...")
error_correction_model = RandomForestRegressor(n_estimators=50, min_samples_leaf=5, random_state=42, n_jobs=-1, max_depth=10)
X_val_for_correction = X_val_time.copy()
X_val_for_correction['initial_prediction'] = initial_val_preds_full_price
error_correction_model.fit(X_val_for_correction, errors_on_validation)

# 3.5 הערכת הביצועים של המודל המשולב
predicted_error_correction_val = error_correction_model.predict(X_val_for_correction)
final_corrected_price_val = initial_val_preds_full_price + predicted_error_correction_val

def rmse(y_true, y_pred): return np.sqrt(mean_squared_error(y_true, y_pred))
def rmsle(y_true, y_pred): return np.sqrt(mean_squared_log_error(y_true, np.maximum(y_pred, 1)))

final_val_rmse = rmse(y_val_actual_price, final_corrected_price_val)
final_val_rmsle = rmsle(y_val_actual_price, final_corrected_price_val)
print("\n--- ביצועי המודל המשולב על סט הוולידציה העונתי ---")
print(f"סט וולידציה (Validation Set - ינואר-אפריל 2011):")
print(f"\tRMSE:  ${final_val_rmse:,.2f}")
print(f"\tRMSLE: {final_val_rmsle:.4f}\n")


# --- שלב 4: אימון המודלים הסופיים על כל הנתונים ---
print("\n" + "="*50)
print("--- שלב 4: מאמן את המודלים הסופיים על כל הנתונים ---")

# 4.1 אימון המודל הראשי הסופי
X_full_train = df_train_processed.drop(['SalePrice_Residual', 'SalePrice_adj', 'saledate'], axis=1)
y_full_train_residual = df_train_processed['SalePrice_Residual']
final_model = RandomForestRegressor(**user_params)
final_model.fit(X_full_train, y_full_train_residual)
print("אימון המודל הראשי הסופי הושלם.")

# 4.2 אימון מודל תיקון שגיאות סופי
# (כאן נאמן אותו על כל סט האימון, כי אין לנו סט וולידציה נפרד)
print("\nמאמן מודל תיקון שגיאות סופי...")
full_train_preds_residual = final_model.predict(X_full_train)
full_train_trend = df_train_raw['SalePrice_Trend'].values
full_train_preds_price = full_train_trend + full_train_preds_residual
full_train_errors = df_train_raw['SalePrice_adj'].values - full_train_preds_price
X_full_for_correction = X_full_train.copy()
X_full_for_correction['initial_prediction'] = full_train_preds_price
final_error_correction_model = RandomForestRegressor(n_estimators=50, min_samples_leaf=5, random_state=42, n_jobs=-1, max_depth=10)
final_error_correction_model.fit(X_full_for_correction, full_train_errors)
print("אימון מודל התיקון הסופי הושלם.")

# --- שלב 5: יצירת קובץ הגשה סופי ---
print("\n--- שלב 5: יוצר קובץ הגשה סופי ---")
df_valid_processed = df_combined[df_combined['source'] == 'valid'].drop('source', axis=1).copy()
df_valid_processed_aligned = df_valid_processed[X_full_train.columns]

# 5.1 חיזוי בסיסי
final_base_residuals = final_model.predict(df_valid_processed_aligned)
final_base_price = df_valid_raw['SalePrice_Trend'].values + final_base_residuals

# 5.2 חיזוי התיקון
X_submission_for_correction = df_valid_processed_aligned.copy()
X_submission_for_correction['initial_prediction'] = final_base_price
predicted_error_correction = final_error_correction_model.predict(X_submission_for_correction)

# 5.3 חישוב החיזוי הסופי
final_corrected_price = final_base_price + predicted_error_correction
final_price_pred_floored = np.maximum(final_corrected_price, 1000)

df_submission = pd.DataFrame({'SalesID': df_valid_raw['SalesID'], 'SalePrice': final_price_pred_floored})
submission_filename = f'{DRIVE_PATH}submission_ErrorCorrected_Final.csv'
df_submission.to_csv(submission_filename, index=False)
print(f"\nקובץ ההגשה '{submission_filename}' נשמר בהצלחה!")
print("5 השורות הראשונות בקובץ ההגשה:")
print(df_submission.head())

--- שלב 1: טוען נתונים ---
Mounted at /content/drive
נתונים נטענו בהצלחה.

--- שלב 2: מבצע עיבוד נתונים והנדסת מאפיינים ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_combined['MachineHoursCurrentMeter'].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_combined['MachineHoursCurrentMeter'].fillna(df_combined['MachineHoursCurrentMeter'].median(), inplace=True)


עיבוד הנתונים הושלם.

--- שלב 3: אימון מודל ראשי ומודל מתקן שגיאות ---

מאמן מודל ראשי על נתוני 2000-2010...

מאמן מודל לתיקון שגיאות...

--- ביצועי המודל המשולב על סט הוולידציה העונתי ---
סט וולידציה (Validation Set - ינואר-אפריל 2011):
	RMSE:  $6,363.82
	RMSLE: 0.1835


--- שלב 4: מאמן את המודלים הסופיים על כל הנתונים ---
אימון המודל הראשי הסופי הושלם.

מאמן מודל תיקון שגיאות סופי...
אימון מודל התיקון הסופי הושלם.

--- שלב 5: יוצר קובץ הגשה סופי ---

קובץ ההגשה '/content/drive/MyDrive/KaggleProject/submission_ErrorCorrected_Final.csv' נשמר בהצלחה!
5 השורות הראשונות בקובץ ההגשה:
        SalesID     SalePrice
313947  1222837  59997.080349
313948  1222839  68022.675621
313949  1222841  33913.956601
313950  1222843  16420.173583
313951  1222845  46604.514872
