In [1]:
!pip install striprtf

Collecting striprtf
  Downloading striprtf-0.0.26-py3-none-any.whl (6.9 kB)
Installing collected packages: striprtf
Successfully installed striprtf-0.0.26


TASK 1: LOAD DATA AND READ TARGET AND TYPE OF REGRESSION

In [2]:
def read_rtf_file(file_path):
    from striprtf.striprtf import rtf_to_text
    with open(file_path, 'r') as file:
        rtf_content = file.read()
    return rtf_to_text(rtf_content)

In [3]:
def load_and_parse_json(file_path):
    import json
    text = read_rtf_file(file_path)
    json_data = json.loads(text)
    target_details = json_data.get('design_state_data', {}).get('target', {})
    prediction_type = target_details.get('prediction_type', 'Unknown')
    target_variable = target_details.get('target', 'No target specified')
    regression_type = target_details.get('type', 'No regression type specified')

    return prediction_type, target_variable, regression_type, json_data

In [4]:
file_path = 'algoparams_from_ui.json.rtf'
prediction_type, target_variable, regression_type,  json_data = load_and_parse_json(file_path)
print("Prediction Type:", prediction_type)
print("Target Variable:", target_variable)
print("Regression Type:", regression_type)

Prediction Type: Regression
Target Variable: petal_width
Regression Type: regression


TASK 2: IMPUTATION

In [5]:
def load_data_apply_imputation(csv_file_path, json_data):
    import pandas as pd
    df = pd.read_csv(csv_file_path)
    feature_handling = json_data.get('design_state_data', {}).get('feature_handling', {})
    features_imputation = {}
    for feature, details in feature_handling.items():
        if details.get('is_selected', False):
            impute_info = details.get('feature_details', {}).get('impute_with', 'default_strategy')
            impute_value = details.get('feature_details', {}).get('impute_value', None)
            features_imputation[feature] = {'impute_with': impute_info, 'impute_value': impute_value}

    for feature, details in features_imputation.items():
        if details['impute_with'] == 'Average of values':
            print(f"Applying mean imputation to {feature}")
            df[feature].fillna(df[feature].mean(), inplace=True)
        elif details['impute_with'] == 'custom' and details['impute_value'] is not None:
            print(f"Applying custom imputation with value {details['impute_value']} to {feature}")
            df[feature].fillna(details['impute_value'], inplace=True)
        elif details['impute_with'] == 'default_strategy':
            print(f"Applying mode imputation to {feature}")
            df[feature].fillna(df[feature].mode()[0], inplace=True)

    return df

In [6]:
csv_file_path = 'iris.csv'
df_processed = load_data_apply_imputation(csv_file_path, json_data)
print(df_processed.head())

Applying mean imputation to sepal_length
Applying custom imputation with value -1 to sepal_width
Applying mean imputation to petal_length
Applying custom imputation with value -2 to petal_width
Applying mode imputation to species
   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [7]:
def encode_categorical(df):
    from sklearn.preprocessing import LabelEncoder
    for column in df_processed.columns:
        if df_processed[column].dtype == 'object':
            le = LabelEncoder()
            df_processed[column] = le.fit_transform(df_processed[column])
    return df_processed

TASK 3: FEATURE REDUCTION

In [8]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import numpy as np
def apply_feature_reduction(df, json_data, target_variable):
    df_processed = encode_categorical(df.copy())
    reduction_config = json_data['design_state_data']['feature_reduction']
    method = reduction_config['feature_reduction_method']
    num_features = int(reduction_config['num_of_features_to_keep'])
    prediction_type = json_data['design_state_data']['target']['prediction_type']

    if method == 'No Reduction':
        reduced_df = df_processed
    elif method == 'Corr with Target':
        numeric_df = df_processed.select_dtypes(include=[np.number])
        correlation = numeric_df.corr()[target_variable].abs()
        top_features = correlation.sort_values(ascending=False).index[1:num_features+1]
        reduced_df = df[top_features]
    elif method == 'Tree-based':
        X = df_processed.drop(target_variable, axis=1)
        y = df_processed[target_variable]
        tree_model = RandomForestRegressor(n_estimators=int(reduction_config['num_of_trees']), max_depth=int(reduction_config['depth_of_trees']))
        tree_model.fit(X, y)
        importances = tree_model.feature_importances_
        top_indices = np.argsort(importances)[::-1][:num_features]
        reduced_df = df_processed[df.columns[top_indices]]
    elif method == 'PCA':
        numeric_df = df_processed.select_dtypes(include=[np.number])
        pca = PCA(n_components=num_features)
        principal_components = pca.fit_transform(numeric_df.drop(target_variable, axis=1))
        reduced_df = pd.DataFrame(data=principal_components, columns=[f'PC{i+1}' for i in range(num_features)])

    return reduced_df

In [9]:
reduced_df = apply_feature_reduction(df_processed, json_data, target_variable)
print(reduced_df.head())

   petal_width  petal_length  sepal_width  sepal_length
0          0.2           1.4          3.5           5.1
1          0.2           1.4          3.0           4.9
2          0.2           1.3          3.2           4.7
3          0.2           1.5          3.1           4.6
4          0.2           1.4          3.6           5.0


TASK 4: MODEL INITIALIZATION

In [10]:
import json
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor, ExtraTreesClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier


def initialize_models(json_data):
    model_configs = json_data['design_state_data']['algorithms']
    models = []
    model_classes = {
        'RandomForestRegressor': RandomForestRegressor,
        'RandomForestClassifier': RandomForestClassifier,
        'GBTClassifier': GradientBoostingClassifier,
        'GBTRegressor':  GradientBoostingRegressor,
        'LinearRegression': LinearRegression,
        'LogisticRegression': LogisticRegression,
        'RidgeRegression': Ridge,
        'LassoRegression': Lasso,
        'ElasticNetRegression': ElasticNet,
        'xg_boost': XGBRegressor,
        'DecisionTreeRegressor': DecisionTreeRegressor,
        'DecisionTreeClassifier': DecisionTreeClassifier,
        'SVM': SVC,
        'SGD': SGDClassifier,
        'KNN': KNeighborsClassifier,
        'extra_random_trees': ExtraTreesClassifier,
        'neural_network': MLPClassifier
         }

    for model_name, config in model_configs.items():
        if config['is_selected']:
            model_class = model_classes.get(model_name)
            if model_class:
                param_grid = {}
                if model_name == 'RandomForestRegressor':
                    param_grid['n_estimators'] = list(range(config['min_trees'], config['max_trees'] + 1))
                    param_grid['max_depth'] = list(range(config['min_depth'], config['max_depth'] + 1))
                    param_grid['min_samples_leaf'] = list(range(config['min_samples_per_leaf_min_value'], config['min_samples_per_leaf_max_value'] + 1))

                # Parameters for other models

                model_info = {
                    'model': model_class(),
                    'params': param_grid
                }
                models.append(model_info)
    return models


In [11]:
models = initialize_models(json_data)
for model in models:
    print(model)


{'model': RandomForestRegressor(), 'params': {'n_estimators': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'max_depth': [20, 21, 22, 23, 24, 25], 'min_samples_leaf': [5, 6, 7, 8, 9, 10]}}


TASK 5: HYPERPARAMETER TUNING


In [12]:
def preprocess_data(reduced_df, target_variable):
    y = reduced_df[target_variable]
    X = reduced_df.drop(target_variable, axis=1)
    return X, y

In [13]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def fit_predict_models(models,json_data, df, target_variable):
    X, y = preprocess_data(df, target_variable)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    results = []

    for model_config in models:
        model = model_config['model']
        param_grid = model_config['params']
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        predictions = best_model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        results.append((type(best_model).__name__, 'MSE', mse))
        results.append((type(best_model).__name__, 'RMSE', rmse))
        results.append((type(best_model).__name__, 'MAE', mae))
        results.append((type(best_model).__name__, 'R2', r2))

        print(f"Best parameters for {type(best_model).__name__}: {grid_search.best_params_}")
        print(f"Evaluation metrics for {type(best_model).__name__}: MSE={mse}, RMSE={rmse}, MAE={mae}, R2={r2}")

    return results

TASK 6: RESULTS

In [14]:
models = initialize_models(json_data)
fit_predict_results = fit_predict_models(models, json_data, reduced_df, target_variable)

Best parameters for RandomForestRegressor: {'max_depth': 24, 'min_samples_leaf': 8, 'n_estimators': 15}
Evaluation metrics for RandomForestRegressor: MSE=0.0350368350359278, RMSE=0.18718128922498584, MAE=0.1494706710275019, R2=0.9448807853094181
