In [1]:
import pandas as pd
import requests
from tqdm import tqdm

# Function to check if image URL loads successfully with retries and user-agent
def check_image_loadability(image_url, retries=3, timeout=10, fetch=None):
    """Report whether ``image_url`` answers an HTTP GET with status 200.

    Every attempt is guarded on its own, so an exception raised by one
    attempt (timeout, connection reset, ...) no longer cancels the remaining
    retries.

    Args:
        image_url: URL to probe.
        retries: Maximum number of GET attempts.
        timeout: Per-attempt timeout in seconds handed to the GET call.
        fetch: Optional callable used instead of ``requests.get``. It is
            called as ``fetch(image_url, headers=..., timeout=...)`` and must
            return an object exposing ``status_code``.

    Returns:
        True as soon as one attempt returns status 200, otherwise False.
    """
    if fetch is None:
        fetch = requests.get

    last_error = None
    for _ in range(retries):
        try:
            response = fetch(image_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
        except Exception as e:
            # Remember the failure and move on to the next attempt
            last_error = e
            continue
        if response.status_code == 200:
            return True
        # A response arrived, so the most recent attempt did not error out
        last_error = None

    # Report once per URL, and only when the final attempt raised
    if last_error is not None:
        print(f"Error loading image at {image_url}: {last_error}")
    return False

# Load the dataset from CSV
file_path = 'world_real_estate_data.csv'
df = pd.read_csv(file_path)

# Assumes the dataset has an 'image' column holding image URLs
subset_size = 1000  # Adjust as needed
# Take the first subset_size rows. The explicit .copy() makes subset_df an
# independent frame, so the column assignment further down does not write to
# a slice of df (which triggers pandas' SettingWithCopyWarning).
subset_df = df.head(subset_size).copy()

image_urls = subset_df['image'].tolist()

# One boolean per URL, in the same order as image_urls
image_loadable = []

# Probe each image URL; an explicit loop keeps the partial results in
# image_loadable if the run is interrupted
for image_url in tqdm(image_urls):
    loadable = check_image_loadability(image_url)
    image_loadable.append(loadable)

# Add the 'image_loadable' column to the subset DataFrame
subset_df['image_loadable'] = image_loadable

# Show the first rows including the new column
print(subset_df.head())

# Save the subset DataFrame to a new CSV file
subset_df.to_csv('subset_dataset.csv', index=False)


  8%|███████▎                                                                                     | 78/1000 [01:00<12:00,  1.28it/s]

KeyboardInterrupt



In [3]:
import re
from sklearn.preprocessing import LabelEncoder

# Label-encode the text columns. Each column gets its own fitted encoder,
# stored in label_encoders, so the integer codes can still be mapped back to
# the original values later (a single shared encoder would only remember the
# last column it was fitted on).
label_encoders = {}
for col in ['title', 'country', 'location']:
    label_encoder = LabelEncoder()
    subset_df[col] = label_encoder.fit_transform(subset_df[col])
    label_encoders[col] = label_encoder
print(subset_df.head())

# First run of digits, with an optional decimal part (e.g. "65.5" in "65.5 m²")
_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')


def _leading_number(value):
    """Return the first number found in ``value`` as a float.

    Null inputs are returned unchanged; text containing no digits yields NaN
    instead of raising IndexError.
    """
    if pd.isnull(value):
        return value
    match = _NUMBER_RE.search(str(value))
    return float(match.group()) if match else float('nan')


# Area columns hold text such as "120 m²"; keep only the numeric part
numeric_columns = ['apartment_total_area','apartment_living_area']
for col in numeric_columns:
    subset_df[col] = subset_df[col].apply(_leading_number)

   title  country  location  building_construction_year  \
0   8475       23      3843                         NaN   
1  47213       23      2548                      2021.0   
2   5206       23      3826                         NaN   
3    637       22      1276                      2020.0   
4   7077       22      1276                      2026.0   

   building_total_floors  apartment_floor  apartment_rooms  \
0                    5.0              1.0              3.0   
1                    2.0              NaN              NaN   
2                    5.0              2.0              2.0   
3                   15.0              5.0              2.0   
4                    8.0              3.0              3.0   

   apartment_bedrooms  apartment_bathrooms apartment_total_area  \
0                 2.0                  2.0               120 m²   
1                 NaN                  NaN               500 m²   
2                 1.0                  1.0                65 m²   
3   

In [4]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Keep only the listings that have a target value
subset_df = subset_df.dropna(subset=['price_in_USD'])

# Features on which two listings are compared
similarity_columns = ['country', 'location', 'price_in_USD']

# Partition the rows by whether any of their columns is null
missing_mask = subset_df.isna().any(axis=1)
rows_without_missing = subset_df.loc[~missing_mask]
rows_with_missing = subset_df.loc[missing_mask]

# Feature matrices: complete rows are indexed, incomplete rows are the queries
X_train = rows_without_missing[similarity_columns]
X_test = rows_with_missing[similarity_columns]

# Single-nearest-neighbour model using Euclidean distance
nn = NearestNeighbors(metric='euclidean', n_neighbors=1)

# Index the complete rows
nn.fit(X_train)



In [None]:
# Function to fill missing values
def fill_missing_values(row, model, filled_data, feature_columns=None):
    """Fill the null entries of ``row`` from its nearest row in ``filled_data``.

    Args:
        row: pandas Series for one record; it is copied, never modified.
        model: Object with a ``kneighbors`` method whose second return value
            holds, at ``[0][0]``, the position in ``filled_data`` of the
            nearest neighbour.
        filled_data: DataFrame the donor row is taken from with ``.iloc``.
        feature_columns: Columns that form the neighbour query. When omitted,
            the module-level ``similarity_columns`` is used.

    Returns:
        A copy of ``row`` in which every column of ``filled_data`` that was
        null has been replaced by the donor row's value.
    """
    if feature_columns is None:
        feature_columns = similarity_columns
    feature_columns = list(feature_columns)

    row = row.copy()  # Work on a copy so the caller's row is left untouched

    # One-row query that carries the feature names, so a model fitted on a
    # DataFrame is queried with matching column names
    query = pd.DataFrame(
        [row[feature_columns].astype(float).to_numpy()],
        columns=feature_columns,
    )

    # Position of the closest row, then the row itself
    _, nearest_idx = model.kneighbors(query)
    nearest_row = filled_data.iloc[nearest_idx[0][0]]

    # Copy over the donor's value wherever this row has none
    for col in filled_data.columns:
        if pd.isnull(row[col]):
            row[col] = nearest_row[col]

    return row

# Impute every incomplete row from its nearest complete neighbour
filled_rows = rows_with_missing.apply(
    fill_missing_values,
    axis=1,
    args=(nn, rows_without_missing),
)

# Stack the complete rows on top of the imputed ones
filled_subset_df = pd.concat([rows_without_missing, filled_rows])

# Show the resulting DataFrame
print(filled_subset_df)




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Separate features and target variable.
# 'image' and 'url' are dropped as well (when present): left in, get_dummies
# below would one-hot encode them into roughly one column per distinct value.
# This matches the feature set used by the XGBoost cell.
X = filled_subset_df.drop(columns=['price_in_USD', 'image', 'url'], errors='ignore')
y = filled_subset_df['price_in_USD']

# One-hot encode any remaining categorical columns
X = pd.get_dummies(X)

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize using statistics from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a RandomForestRegressor
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train_scaled, y_train)

# Predictions and evaluation on the held-out split
y_pred = model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Random Forest Mean Absolute Error: {mae}")
print(f"Random Forest Mean Squared Error: {mse}")
print(f"Random Forest R² Score: {r2}")


In [None]:
# Rebind instead of dropping in place, and tolerate a missing column, so the
# cell can be re-run without raising KeyError
filled_subset_df = filled_subset_df.drop(columns=['image_loadable'], errors='ignore')

In [None]:
import xgboost as xgb
# GridSearchCV is used below, but its only other import sits in a later cell;
# importing it here lets this cell run on a fresh kernel
from sklearn.model_selection import GridSearchCV

# Assuming filled_subset_df is already prepared as before

# Separate features and target variable
X = filled_subset_df.drop(columns=['price_in_USD', 'image', 'url'])  # Drop non-numeric columns for simplicity
y = filled_subset_df['price_in_USD']

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize using statistics from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter grid for XGBoost (3*3*3*2*2*3 = 324 combinations, each
# evaluated with 5-fold cross-validation)
param_grid_xgb = {
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'n_estimators': [100, 200, 300]
}

xgb_model = xgb.XGBRegressor(random_state=42)
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=5, scoring='r2', n_jobs=-1)
grid_search_xgb.fit(X_train_scaled, y_train)

best_params_xgb = grid_search_xgb.best_params_
print("Best XGBoost Hyperparameters:", best_params_xgb)

# With the default refit=True the search has already retrained the winning
# configuration on the full training split; reuse it rather than fitting again
best_xgb_model = grid_search_xgb.best_estimator_

# Predictions and evaluation on the held-out split
y_pred_xgb = best_xgb_model.predict(X_test_scaled)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost Mean Absolute Error: {mae_xgb}")
print(f"XGBoost Mean Squared Error: {mse_xgb}")
print(f"XGBoost R² Score: {r2_xgb}")


In [None]:
import numpy as np
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm

from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.applications.resnet_v2 import preprocess_input

# ImageNet-pretrained ResNet50V2 without its classification head; the final
# feature map is average-pooled
base_model = ResNet50V2(
    weights='imagenet',
    include_top=False,
    pooling='avg',
)

# Collects one feature vector per 'image' URL of filled_subset_df
image_features = list()

# Function to preprocess and extract image features
def extract_image_features(url):
    """Download the image at ``url`` and return its ``base_model`` features.

    The request uses the same User-Agent header and 10-second timeout as the
    loadability check, and HTTP error statuses are treated as failures. The
    image is converted to RGB before resizing so grayscale, palette and RGBA
    files produce the three-channel input the model is fed.

    Args:
        url: Address of the image to download.

    Returns:
        The flattened 1D output of ``base_model.predict`` for the image, or a
        zero vector of length 2048 when downloading, decoding or prediction
        fails (the error is printed).
    """
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        response.raise_for_status()  # do not try to decode an HTTP error page
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img = img.resize((224, 224))  # Resize to the 224x224 input fed to the model
        img_array = np.expand_dims(np.array(img), axis=0)  # add the batch axis
        img_array = preprocess_input(img_array)
        # verbose=0 suppresses the per-call progress bar
        features = base_model.predict(img_array, verbose=0)
        return features.flatten()  # Flatten to a 1D vector
    except Exception as e:
        print(f"Error processing image from {url}: {e}")
        return np.zeros(2048)  # Return zeros if unable to process image

# Extract one feature vector per image URL, in row order
image_features.extend(
    extract_image_features(image_url) for image_url in tqdm(filled_subset_df['image'])
)

# One column per feature component, then attach them next to the tabular data
feature_names = [f'image_feat_{i}' for i in range(2048)]
image_features_df = pd.DataFrame(image_features, columns=feature_names)
tabular_part = filled_subset_df.reset_index(drop=True)
filled_subset_df = pd.concat([tabular_part, image_features_df], axis=1)

# filled_subset_df now carries the image feature columns as well
print(filled_subset_df.head())


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Small search space: 2 * 2 * 2 * 2 = 16 candidate configurations
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Base estimator that the search clones for every candidate
rf_model = RandomForestRegressor(random_state=42)

# 3-fold cross-validated search scored by R², using all available cores
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
)
# NOTE(review): X_train_selected is not defined in any cell visible here —
# confirm which cell is expected to create it
grid_search.fit(X_train_selected, y_train)

# Best configuration found by the search
best_params_rf = grid_search.best_params_
print("Best Random Forest Hyperparameters:", best_params_rf)


In [None]:
# `time` is not imported by any cell visible in this notebook, so import it here
import time

# Train final Random Forest model with the best parameters
# NOTE(review): X_train_selected / X_test_selected are not defined in any cell
# visible here — confirm which cell is expected to create them
best_rf_model = RandomForestRegressor(**best_params_rf, random_state=42)
best_rf_model.fit(X_train_selected, y_train)

# Predictions and evaluation
y_pred = best_rf_model.predict(X_test_selected)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Random Forest Mean Absolute Error: {mae}")
print(f"Random Forest Mean Squared Error: {mse}")
print(f"Random Forest R² Score: {r2}")

# End timing the script. start_time must have been set by an earlier cell;
# when it was not, report that instead of ending the cell with a NameError.
end_time = time.time()
try:
    elapsed_time = end_time - start_time
except NameError:
    print("Elapsed time unavailable: start_time was not set by an earlier cell")
else:
    print(f"Elapsed time: {elapsed_time} seconds")
