In [38]:
# train_model.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import joblib

# 1. Load data
def load_data(path='House_Rent_Dataset.csv'):
    """Read the house-rent dataset from a CSV source and report its size.

    Args:
        path: CSV location or file-like object, passed straight to
            ``pandas.read_csv``. Defaults to ``'House_Rent_Dataset.csv'``,
            resolved relative to the current working directory.

    Returns:
        pandas.DataFrame: The parsed table, one row per CSV record.
    """
    df = pd.read_csv(path)
    print("Data loaded successfully!")
    print(f"Total records: {len(df)}")
    return df

# 2. Prepare data
def prepare_data(df):
    """Build the feature matrix and target from the raw rent DataFrame.

    The 'Floor' text column is split into numeric 'Current Floor' and
    'Total Floors' columns, the categorical columns are label-encoded, and
    the model's feature columns are selected.

    Args:
        df: Raw DataFrame. It must contain 'Floor', 'Rent', the six
            categorical columns listed below, and the numeric columns
            'BHK', 'Size' and 'Bathroom'. It is not modified.

    Returns:
        tuple: ``(X, y, label_encoders, features)`` where ``X`` is the
        feature DataFrame, ``y`` is the 'Rent' Series, ``label_encoders``
        maps each categorical column name to its fitted ``LabelEncoder``,
        and ``features`` is the ordered list of column names in ``X``.
    """
    # Work on a copy: the caller's frame stays intact, so the function can be
    # called again on the same input without failing on a missing 'Floor'.
    df = df.copy()

    # 'Floor' is expected to look like "<current> out of <total>".
    # n=1 caps the split at two pieces and reindex guarantees both columns
    # exist even when no row contains the separator.
    floor_parts = (
        df['Floor']
        .str.split(' out of ', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    # Anything that is not a number (text labels, missing pieces) becomes 0.
    df['Current Floor'] = pd.to_numeric(floor_parts[0], errors='coerce').fillna(0)
    df['Total Floors'] = pd.to_numeric(floor_parts[1], errors='coerce').fillna(0)

    # The raw text column is no longer needed once it has been parsed.
    df = df.drop(columns=['Floor'])

    # Convert categorical columns to integer codes. Every column here gets an
    # encoder in the returned dict, including ones that are not model features.
    categorical_columns = ['Area Type', 'Area Locality', 'City',
                           'Furnishing Status', 'Tenant Preferred',
                           'Point of Contact']

    label_encoders = {}
    for column in categorical_columns:
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
        label_encoders[column] = encoder

    # Columns fed to the model, in the order the model will expect them.
    features = ['BHK', 'Size', 'Current Floor', 'Total Floors',
                'Area Type', 'City', 'Furnishing Status', 'Bathroom']
    X = df[features]
    y = df['Rent']  # Target variable

    return X, y, label_encoders, features


# 3. Train model
def train_model(X, y, *, test_size=0.2, random_state=42, n_estimators=100):
    """Fit a random-forest regressor on a train/test split and print its scores.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Held-out share of the data, passed to
            ``train_test_split``. Defaults to 0.2.
        random_state: Seed used for both the split and the forest, so runs
            are reproducible. Defaults to 42.
        n_estimators: Number of trees in the forest. Defaults to 100.

    Returns:
        RandomForestRegressor: The model fitted on the training portion.
    """
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Create and train model
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  random_state=random_state)
    model.fit(X_train, y_train)

    # For a regressor, score() is R^2 (coefficient of determination), not a
    # classification accuracy. A large train/test gap suggests overfitting.
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    print("\nModel Performance:")
    print(f"Training Score: {train_score:.2f}")
    print(f"Testing Score: {test_score:.2f}")

    return model

if __name__ == "__main__":
    # Run the pipeline end to end: read the CSV, build features, fit the model.
    df = load_data()
    X, y, label_encoders, features = prepare_data(df)
    model = train_model(X, y)

    # Persist the model, the fitted encoders and the feature order, each to
    # its own joblib file in the working directory.
    artifacts = (
        (model, 'house_price_model.joblib'),
        (label_encoders, 'label_encoders.joblib'),
        (features, 'features.joblib'),
    )
    for artifact, filename in artifacts:
        joblib.dump(artifact, filename)
    print("\nModel and encoders saved sucessfully!")
    

Data loaded successfully!
Total records: 4746

Model Performance:
Training Score: 0.87
Testing Score: 0.58

Model and encoders saved sucessfully!
