In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere below the Kaggle
# input directory, then print the paths one per line in walk order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import time
import xgboost as xgb
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor # Updated import
from sklearn.metrics import accuracy_score,mean_squared_error,f1_score

# --- Load the training and test CSV files -----------------------------------
train_data = pd.read_csv('/kaggle/input/smart-urban-analytics/urban_development_dataset.csv')
test_data = pd.read_csv('/kaggle/input/smart-urban-analytics/urban_development_test_data.csv')

# --- Feature selection --------------------------------------------------------
# Only columns that exist in both files can be used as features; the target
# column is removed from that set if the test file happens to contain it too.
target_name = 'development_trend_score'
common_columns = train_data.columns.intersection(test_data.columns)
feature_columns = (
    common_columns.drop(target_name)
    if target_name in common_columns
    else common_columns
)

X = train_data[feature_columns]
y = train_data[target_name]
test_features = test_data[feature_columns]

# --- Missing values -----------------------------------------------------------
# Column means are learned from the training features only and then applied to
# both the training and the test features.
imputer = SimpleImputer(strategy='mean').fit(X)
X_imputed = imputer.transform(X)
test_data_imputed = imputer.transform(test_features)

# --- Standardisation ----------------------------------------------------------
# The scaler is likewise fitted on the (imputed) training features only.
scaler = StandardScaler().fit(X_imputed)
X_scaled = scaler.transform(X_imputed)
test_data_scaled = scaler.transform(test_data_imputed)

# --- Target -------------------------------------------------------------------
# Integer version of the target, used as class labels by the classifier cell.
y_class = y.astype(int)



## using GridSearch to get the best parameters

In [None]:
# Tune a RandomForestClassifier with an exhaustive grid search (5-fold CV,
# macro-F1), then refit with the winning parameters and report hold-out metrics.
start_time = time.time()

# NOTE(review): test_size=0.6 keeps only 40% of the rows for training and
# cross-validation - confirm this split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_class, random_state=100, test_size=0.6
)

# Base estimator handed to the grid search; every hyperparameter listed in
# `param_grid` below is overridden during the search.
randomforest = RandomForestClassifier(
    random_state=60,
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=100,
)

# Hyperparameter grid: 2 * 3 * 3 * 3 * 2 = 108 candidate combinations.
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [None, 3, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

grid_search = GridSearchCV(
    estimator=randomforest,
    param_grid=param_grid,
    cv=5,
    scoring='f1_macro',
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration. (The feature matrix itself is deliberately
# not printed: dumping the whole array only floods the cell output.)
best_params = grid_search.best_params_
print('best parameters are', best_params)
print('best score is', grid_search.best_score_)
print('test score is', grid_search.score(X_test, y_test))

best_rf = grid_search.best_estimator_

# Refit a fresh forest with the tuned parameters. `best_params` already holds
# n_estimators / max_depth / min_samples_split / min_samples_leaf / bootstrap,
# so only random_state is passed explicitly - repeating any of those keywords
# next to **best_params raises "TypeError: got multiple values for keyword
# argument".
# NOTE(review): with GridSearchCV's default refit, `best_rf` is presumably
# already an equivalent fitted model, which would make this refit redundant.
randomforest = RandomForestClassifier(**best_params, random_state=60)
randomforest.fit(X_train, y_train)

# Hold-out predictions and their class distribution.
prediction = randomforest.predict(X_test)
print('the predictions:', pd.DataFrame(prediction).value_counts())

# Hold-out metrics: macro-F1, accuracy and mean squared error on the labels.
f1 = f1_score(y_test, prediction, average='macro')
score = accuracy_score(y_test, prediction)
mse = mean_squared_error(y_test, prediction)

print('score =', score, 'mse =', mse, 'f1 =', f1)
print('')

end_time = time.time()
elapsed_time = end_time - start_time
print("it took", elapsed_time, "seconds to execute")

## Using XGBoost

In [None]:
import xgboost as xgb

# Split the raw (unscaled, un-imputed) features: 60% train / 40% validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
columns = X_test.columns

# Fill the missing values of every validation column with that column's own
# mode. `DataFrame.mode().iloc[0]` is a Series holding the first mode of each
# column, so `fillna` applies it column by column. (Calling
# `fillna(<scalar>, inplace=True)` inside a per-column loop instead fills the
# whole frame with the first column's mode on the first iteration.)
X_test_mode = X_test.fillna(X_test.mode().iloc[0])

# Convert to DMatrix (XGBoost's own data container).
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster parameters.
# NOTE(review): with the native `xgb.train` API the number of trees comes from
# `num_boost_round`; `n_estimators` is a scikit-learn-wrapper parameter and
# likely has no effect here - confirm.
params = {
    'booster': 'gbtree',
    'max_depth': 3,
    'learning_rate': 0.01,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 5,
    'reg_alpha': 1,
    'reg_lambda': 3,
}

# Train a single boosting round.
# NOTE(review): `model` and `dtest` are not used by any prediction below; the
# reported metrics come from the random forest only.
model = xgb.train(params, dtrain, num_boost_round=1)

# Training data for the random forest is read back out of the DMatrix.
# NOTE(review): X_train is never mode-filled, so the forest is trained on
# whatever `get_data()` returns for missing entries while validation rows are
# mode-filled - confirm this train/validation mismatch is intended.
X_train_rf = dtrain.get_data()
y_train_rf = dtrain.get_label()

# Fit a random forest regressor on the extracted training data.
rf = RandomForestRegressor(n_estimators=100, random_state=60, min_samples_split=5)
rf.fit(X_train_rf, y_train_rf)

# Predict on the mode-filled validation features and round the continuous
# outputs to the nearest whole score so they can be compared as labels.
y_pred = np.round(rf.predict(X_test_mode))

print("MSE:", mean_squared_error(y_test, y_pred))
print('score:', accuracy_score(y_test, y_pred))


## evaluation

In [None]:
# Build the model input from the test set: keep exactly the feature columns the
# forest was trained on (same order as `columns`), then fill each column's
# missing values with that column's own mode. This works on a copy, so
# `test_data` itself is left untouched and the cell can be re-run safely.
test_features_mode = test_data[columns]
test_features_mode = test_features_mode.fillna(test_features_mode.mode().iloc[0])

# Predict on the test dataset and round to the nearest whole score.
test_class_predictions = np.round(rf.predict(test_features_mode))

# Prepare the submission file
submission = pd.DataFrame({
    "ID": test_data.index + 1,  # Assuming IDs are 1-based indices
    "development_trend_score": test_class_predictions
})

# Save the submission file
submission_file_path = '/kaggle/working/submission.csv'
submission.to_csv(submission_file_path, index=False)
print(f"Submission file saved at: {submission_file_path}")