In [None]:
#1.Reading the target and type of regression to be run

import json

# Generic helper: walk a nested dict structure by following a list of keys.
def get_val_json(data,keys):
  """Return the value reached by following `keys` through nested dicts.

  Args:
    data: Parsed JSON object (normally a dict of dicts).
    keys: Sequence of keys, outermost first.

  Returns:
    The value at the end of the key path. None is returned when a key is
    missing or when an intermediate value is not a dict (so the path cannot
    be followed any further). An empty `keys` returns `data` itself.
  """
  current = data
  for key in keys:
    # Stop as soon as the path cannot be followed instead of raising.
    if not isinstance(current, dict) or key not in current:
      return None
    current = current[key]
  return current

# Parse the UI-exported parameter file into a plain dict
with open('alogparams_from_ui1.json', 'r') as json_file:
    j_data = json.load(json_file)

# Both values live under the same parent node; build each key path from it
target_node_path = ['design_state_data', 'target']
t_type_keys = target_node_path + ['type']
t_name_keys = target_node_path + ['target']

# Resolve each path with the generic lookup helper defined above
type_val, target_val = (
    get_val_json(j_data, t_type_keys),
    get_val_json(j_data, t_name_keys),
)

print("Type:", type_val)
print("Target:", target_val)

In [None]:
#2. Importing the dataset and reading, for each feature,
#   which missing-value imputation needs to be applied

import pandas as pd
iris_df = pd.read_csv('iris.csv')

# Per-feature handling rules, read from the config loaded in the previous cell (j_data).
feat_hdg_info = j_data['design_state_data']['feature_handling']

# Loop through the features and apply the configured missing-value imputation.
for feature_name, feature_details in feat_hdg_info.items():
  # Features without imputation settings simply have no matching keys here,
  # so they are skipped by the checks below without being named explicitly.
  details = feature_details.get('feature_details', {})
  if details.get('missing_values') != 'Impute':
    continue
  if feature_name not in iris_df.columns:
    continue

  impute_method = details.get('impute_with')
  impute_value = details.get('impute_value')

  # Assign the filled column back to the frame: calling fillna(inplace=True)
  # on `iris_df[col]` acts on an intermediate object and does not update
  # `iris_df` when pandas Copy-on-Write is active.
  if impute_method == 'Average of values':
    # A mean only makes sense for numeric columns.
    if pd.api.types.is_numeric_dtype(iris_df[feature_name]):
      iris_df[feature_name] = iris_df[feature_name].fillna(iris_df[feature_name].mean())
  elif impute_method == 'custom':
    iris_df[feature_name] = iris_df[feature_name].fillna(impute_value)

      

In [None]:
#3.Computing feature reduction based on input 

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.decomposition import PCA

#generic function for different json files.
def apply_featr_reduc(j_df, reduc_method, sel_features, target_feature=None, num_features=None):
  """Reduce the selected features of `j_df` with the requested method.

  Args:
    j_df: Source DataFrame.
    reduc_method: One of "No Reduction", "Corr with Target", "Tree-based", "PCA".
    sel_features: List of column names to consider.
    target_feature: Target column name; required for "Corr with Target" and
      "Tree-based". It is never treated as one of its own predictors.
    num_features: Optional cap on the number of features/components kept.
      When None: "Corr with Target" keeps every predictor (ordered by
      absolute correlation), "Tree-based" keeps at most 10, and "PCA" keeps
      one component per predictor.

  Returns:
    A DataFrame holding the reduced feature set.

  Raises:
    ValueError: for an unknown `reduc_method`, a missing `target_feature`
      where one is required, or a non-numeric target in "Corr with Target".
  """
  # The target must not be ranked against itself or leak into the predictors.
  predictors = [col for col in sel_features if col != target_feature]

  if reduc_method in ("Corr with Target", "Tree-based") and target_feature is None:
    raise ValueError(f"target_feature is required for reduction method {reduc_method!r}")

  if reduc_method == "No Reduction":
    reduced_df = j_df[sel_features]

  elif reduc_method == "Corr with Target":
    # Correlation is only defined for numeric columns; text columns are left out.
    numeric_df = j_df[predictors + [target_feature]].select_dtypes(include="number")
    if target_feature not in numeric_df.columns:
      raise ValueError(f"target feature {target_feature!r} must be numeric for 'Corr with Target'")
    # Drop the target by name rather than by position: a perfectly correlated
    # predictor could otherwise sort ahead of it and be dropped instead.
    target_corr = numeric_df.corr()[target_feature].drop(target_feature)
    sorted_corr = target_corr.abs().sort_values(ascending=False)
    top_features = sorted_corr.index.tolist()
    if num_features is not None:
      top_features = top_features[:num_features]
    reduced_df = j_df[top_features]

  elif reduc_method == "Tree-based":
    y = j_df[target_feature]

    # One-hot encode so string-valued columns can be fed to the tree model.
    X_encoded = pd.get_dummies(j_df[predictors])

    tree_model = ExtraTreesRegressor(n_estimators=100, random_state=0)
    tree_model.fit(X_encoded,y)

    # Column positions ordered from most to least important.
    indices = tree_model.feature_importances_.argsort()[::-1]

    limit = 10 if num_features is None else num_features
    num_top_features = min(limit, len(predictors))

    top_features = X_encoded.columns[indices[:num_top_features]]
    reduced_df = X_encoded[top_features]

  elif reduc_method == "PCA":
    n_components = len(predictors) if num_features is None else min(num_features, len(predictors))
    pca = PCA(n_components=n_components)
    # Keep the source index so the components stay aligned with the rows of j_df.
    reduced_df = pd.DataFrame(
        pca.fit_transform(j_df[predictors]),
        columns=[f'PCA_{i+1}' for i in range(n_components)],
        index=j_df.index,
    )

  else:
    raise ValueError(f"Unknown feature reduction method: {reduc_method!r}")

  return reduced_df

# Re-read the parameter file so this cell does not rely on an earlier one
with open('alogparams_from_ui1.json', 'r') as json_file:
    j_data = json.load(json_file)

# Point the session at the plain iris CSV (the same file is used, not a modified copy)
j_data['design_state_data']['session_info']['dataset'] = 'iris.csv'
j_df = pd.read_csv(j_data['design_state_data']['session_info']['dataset'])

# Names of every feature flagged as selected in the config
sel_features = [
    feature_name
    for feature_name, feature_info in j_data['design_state_data']['feature_handling'].items()
    if feature_info['is_selected']
]

# Which reduction strategy the config asks for
reduc_method = j_data['design_state_data']['feature_reduction']['feature_reduction_method']

# Only the supervised strategies need to know the target column
if reduc_method == "Corr with Target" or reduc_method == "Tree-based":
    target_feature = j_data['design_state_data']['target']['target']
else:
    target_feature = None

# Apply the reduction, show it, and persist it
reduced_df = apply_featr_reduc(j_df, reduc_method, sel_features, target_feature)

print(reduced_df)
reduced_df.to_csv('iris_modified.csv', index=False)
print("Saved reduced DataFrame to 'iris_modified.csv'")


In [None]:
#4. Parse the JSON and build the model objects (using sklearn)

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

with open('alogparams_from_ui1.json', 'r') as json_file:
  j_data = json.load(json_file)

# Prediction type requested in the config
prediction_type = j_data['design_state_data']['target']['prediction_type']
print(prediction_type)

# Lookup table: prediction type -> estimator class.
# Only the random-forest pair is listed because that is all this config
# specifies; extend the table for further model families.
pred_type_model = {
    "Regression":RandomForestRegressor,
    "Classification":RandomForestClassifier
}

try:
  selected_model_class = pred_type_model[prediction_type]
except KeyError:
  # No estimator registered for this prediction type
  print(f"prediction type not specified add new .. i commented the ex... add : {prediction_type}")
else:
  selected_model = selected_model_class()
  print("Selected model : ", selected_model)

In [None]:



#4.1 prediction using RandomForestRegressor()

from pandas.core.arrays import categorical
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error

#load Json if not.. i already loaded on previous cell

#load dataset from ['session_info']['dataset']

feature_handling = j_data['design_state_data']['feature_handling']
target_var = j_data['design_state_data']['target']['target']

# Selected predictor columns (the target itself is excluded).
features = [feature_name for feature_name, feature_info in feature_handling.items()
            if feature_name != target_var and feature_info['is_selected']]

# Text-typed predictors have to be one-hot encoded before the model can use them.
categorical_columns = [feature_name for feature_name in features
                       if feature_handling[feature_name]['feature_variable_type'] == 'text']

# Build X from the selected columns and encode into a NEW frame:
#  - after encoding, a text column is replaced by its dummy columns, so
#    indexing the encoded frame with the original names would raise KeyError;
#  - leaving iris_df untouched keeps this cell re-runnable.
if categorical_columns:
    X = pd.get_dummies(iris_df[features], columns=categorical_columns, drop_first=True)
else:
    X = iris_df[features]
y = iris_df[target_var]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Keys in the config entry that are UI/bookkeeping settings rather than
# RandomForestRegressor constructor arguments; extend the list for other models.
invalid_params = ['model_name', 'is_selected', 'min_trees', 'max_trees', 'feature_sampling_statergy','min_depth',
                  'min_samples_per_leaf_min_value','min_samples_per_leaf_max_value','parallelism']

# Filter into a copy so the shared j_data config is not mutated for later cells.
rf_regressor_config = {
    param: value
    for param, value in j_data['design_state_data']['algorithms']['RandomForestRegressor'].items()
    if param not in invalid_params
}
# Fix the seed (unless the config supplies one) so reruns give the same score.
rf_regressor_config.setdefault('random_state', 42)

selected_model = RandomForestRegressor(**rf_regressor_config) # instantiate from the JSON config

selected_model.fit(X_train, y_train) # fit, then predict on the held-out split

y_pred = selected_model.predict(X_test)

# Mean squared error on the test split
mse = mean_squared_error(y_test,y_pred)
print("mean_squared_error: ",mse)


In [None]:
#5.Run the fit and predict on each model

import json
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score

#first create a dictionary as the json and df are already loaded in above cells.

# Per-model settings: estimator class, the metric reported on the test split,
# and the matching cross-validation scorer used by GridSearchCV.
model_info = {
    "RandomForestRegressor": {"model_class": RandomForestRegressor,
                              "eval_metric": mean_squared_error,
                              "scoring": "neg_mean_squared_error"},
    "RandomForestClassifier": {"model_class": RandomForestClassifier,
                               "eval_metric": accuracy_score,
                               "scoring": "accuracy"},
}

# Config keys that are not estimator constructor arguments.
invalid_params = ['model_name', 'is_selected', 'min_trees', 'max_trees', 'feature_sampling_statergy', 'min_depth',
                  'min_samples_per_leaf_min_value', 'min_samples_per_leaf_max_value', 'parallelism']

for model_name,model_config in j_data['design_state_data']['algorithms'].items():
  if model_name not in model_info:
    continue

  info = model_info[model_name]

  # Keep only the hyperparameters meant for the estimator.
  hyperparams = {param: value for param, value in model_config.items() if param not in invalid_params}

  # GridSearchCV needs a list of candidates per parameter: keep sequences,
  # wrap every scalar (numbers, strings, None, ...) in a one-element list.
  hyperparam_grid = {
      param: list(value) if isinstance(value, (list, tuple)) else [value]
      for param, value in hyperparams.items()
  }

  grid_search = GridSearchCV(info['model_class'](), hyperparam_grid, cv=5,
                             scoring=info['scoring'], error_score='raise')
  try:
    grid_search.fit(X_train, y_train)  # hyperparameter tuning via cross-validation
  except Exception as e:
    # Best effort: report the failure and move on. Nothing was fitted, so
    # there is no best_estimator_ to evaluate for this model.
    print(f"Fit failed for {model_name} - Exception: {e}")
    continue

  # With the default refit=True, best_estimator_ is already fitted on the
  # full training split, so it is not fitted a second time here.
  best_model = grid_search.best_estimator_

  y_pred = best_model.predict(X_test) # predict with the best model found

  score = info['eval_metric'](y_test, y_pred)

  print(f"{model_name} - Evaluation Metrics : {score}")

