In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd

import sys
sys.path.append("..")
import pickle
from tqdm import tqdm
import sys
from scipy import optimize
from data_loaders import *
import missing_process.missing_method as missing_method
from missing_process.block_rules import *
import json
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Turn off Matplotlib's `text.usetex` option so figure text is not typeset through LaTeX.
plt.rcParams['text.usetex'] = False

In [4]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from sklearn.pipeline import make_pipeline

def basis_expansion(X):
    """Stack a fixed set of non-linear transforms of ``X`` next to ``X`` itself.

    The result is the horizontal concatenation, in this order, of:

    1. ``X`` unchanged
    2. polynomial features of ``X`` up to degree 3, bias column included
    3. ``sin(X)``
    4. ``cos(X)``
    5. ``log1p(X)``
    6. ``sqrt(X)``
    7. ``exp(X)``
    8. Gaussian bump ``exp(-X**2)``
    9. logistic sigmoid ``1 / (1 + exp(-X))``
    10. ``sin(X) + cos(X)``

    Parameters
    ----------
    X : array-like
        Input features. The scikit-learn transformers are fitted directly on
        it; the caller in this notebook passes a single ``(n, 1)`` column.
        No clipping is applied before ``log1p`` / ``sqrt``.

    Returns
    -------
    numpy.ndarray
        ``X`` followed by all expanded feature groups, joined column-wise.
    """

    def gaussian_basis(values):
        return np.exp(-values ** 2)

    def sigmoid_basis(values):
        return 1 / (1 + np.exp(-values))

    def custom_basis(values):
        return np.sin(values) + np.cos(values)

    def through_transformer(func):
        # Same path as the element-wise call, but routed through
        # FunctionTransformer so the input is validated first.
        return FunctionTransformer(func, validate=True).fit_transform(X)

    # Evaluation order mirrors the concatenation order below.
    polynomial_part = PolynomialFeatures(degree=3, include_bias=True).fit_transform(X)
    sine_part = np.sin(X)
    cosine_part = np.cos(X)
    log_part = through_transformer(np.log1p)
    sqrt_part = through_transformer(np.sqrt)
    exp_part = np.exp(X)
    gaussian_part = through_transformer(gaussian_basis)
    sigmoid_part = through_transformer(sigmoid_basis)
    sin_plus_cos_part = through_transformer(custom_basis)

    feature_groups = (
        X,
        polynomial_part,
        sine_part,
        cosine_part,
        log_part,
        sqrt_part,
        exp_part,
        gaussian_part,
        sigmoid_part,
        sin_plus_cos_part,
    )
    return np.hstack(feature_groups)

In [5]:
def check_path(directory):
    """Make sure ``directory`` exists, creating it (and missing parents) if needed.

    Prints a one-line message saying whether the directory was created by
    this call or was already present.

    Parameters
    ----------
    directory : str or path-like
        Path of the directory to check / create.
    """
    # Imported locally: no visible cell imports `os` by name, so without this
    # the function only works if `os` happens to leak in from a star import.
    import os

    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        return

    # exist_ok=True keeps the call safe if something else creates the
    # directory between the existence check above and this line.
    os.makedirs(directory, exist_ok=True)
    print(f"Directory '{directory}' created successfully.")

In [22]:
def scatter_plot(sorted_values):
    """Plot ``sorted_values`` against their positional index and show the figure.

    A new 8x6 figure is created on every call. Despite the function name,
    the values are drawn as a connected line (``plot``), not as scatter
    markers. The title text is fixed.

    Parameters
    ----------
    sorted_values : sequence
        Values to draw; the x axis is ``range(len(sorted_values))``.
    """
    _, ax = plt.subplots(figsize=(8, 6))

    positions = range(len(sorted_values))
    ax.plot(positions, sorted_values)

    ax.set_title('Sorted Values from Calith 7th Column')
    ax.set_xlabel('Sorted Index')
    ax.set_ylabel('Values')
    ax.grid(True)

    plt.show()

def create_feature_expansion(missingtype = "test_MNAR_1",dataname = "california", keep_column = 6):
    """Write a basis-expanded copy of a normalised dataset plus matching masks.

    Steps:

    1. Load the rule file ``{missingtype}.json`` via ``load_json_file``.
    2. Load ``../datasets/{dataname}/{dataname}_norm.npy``.
    3. Replace every column except ``keep_column`` by the column itself
       followed by ``basis_expansion(column)``. ``keep_column`` is copied
       through unchanged, and column order is preserved.
    4. Save the result to
       ``../datasets/{dataname}_exp/{dataname}_exp_norm.npy``.
    5. For every rule name in the rule file, load the mask
       ``../datasets/{dataname}/{subdir}/{rule_name}.npy``, repeat each mask
       column as many times as its feature column was widened, and save it to
       ``../datasets/{dataname}_exp/{subdir}/{rule_name}.npy``.

    ``subdir`` is ``"logistic"`` when ``missingtype == "test_MNAR_1"`` and
    ``"quantile"`` for every other value.

    Parameters
    ----------
    missingtype : str
        Base name of the rule JSON file; also selects the mask sub-directory.
    dataname : str
        Dataset folder / file prefix under ``../datasets``.
    keep_column : int
        Index of the column that is passed through without expansion.
        Defaults to 6, the value that was previously hard-coded.

    Raises
    ------
    IndexError
        If ``keep_column`` is not a valid column index of the loaded array.
    """
    save_name = dataname + "_exp"

    # The rule file is the same for both cases; only the sub-directory that
    # holds the pre-generated masks differs.
    missing_rule = load_json_file(f"{missingtype}.json")
    if missingtype == "test_MNAR_1":
        mask_subdir = "logistic"
        print("logistic")
    else:
        mask_subdir = "quantile"

    directory_path = f"../datasets/{dataname}"
    save_path = f"../datasets/{save_name}"
    norm_values = np.load(f'{directory_path}/{dataname}_norm.npy')

    n_columns = norm_values.shape[1]
    if not 0 <= keep_column < n_columns:
        raise IndexError(
            f"keep_column={keep_column} is out of range for an array with {n_columns} columns"
        )

    # One entry per original column, kept in original order. Collecting a
    # single ordered list (instead of separate left/right halves) also works
    # when the pass-through column is the first or the last one.
    feature_blocks = []
    block_widths = []
    for i in range(n_columns):
        print(i)
        target_column = norm_values[:, i].reshape(-1, 1)
        if i == keep_column:
            block = target_column
        else:
            # The raw column is placed in front of its expansion.
            block = np.hstack((target_column, basis_expansion(target_column)))
        feature_blocks.append(block)
        # Remember how wide each column became so the masks can be widened
        # by exactly the same amount.
        block_widths.append(block.shape[1])

    result = np.hstack(feature_blocks)

    check_path(save_path)
    np.save(f'{save_path}/{save_name}_norm.npy', result)

    print(missing_rule)
    for rule_name in missing_rule:
        observed_masks = np.load(f'{directory_path}/{mask_subdir}/{rule_name}.npy')

        # Every feature derived from column i shares column i's mask.
        result_mask = np.hstack([
            np.repeat(observed_masks[:, i].reshape(-1, 1), width, axis=1)
            for i, width in enumerate(block_widths)
        ])

        check_path(f'{save_path}/{mask_subdir}')
        # The two shapes are printed side by side so a mismatch is visible.
        print(result.shape)
        print(result_mask.shape)
        np.save(f'{save_path}/{mask_subdir}/{rule_name}.npy', result_mask)



In [25]:
# Build the expanded dataset and masks for the rules in "test_MNAR_2.json".
# Any missingtype other than "test_MNAR_1" uses the "quantile" mask sub-directory.
create_feature_expansion(missingtype = "test_MNAR_2")

0
1
2
3
4
5
6
7
Directory '../datasets/california_exp' already exists.
{'Q1_Q2_0.5': {'1': {'lower': 0.0, 'upper': 0.25, 'partial_missing': 0.5}, '2': {'lower': 0.25, 'upper': 0.5, 'partial_missing': 0.5}}, 'Q1_Q4_0.5': {'1': {'lower': 0.0, 'upper': 0.25, 'partial_missing': 0.5}, '2': {'lower': 0.75, 'upper': 1, 'partial_missing': 0.5}}, 'Q1_Q4_0.25': {'1': {'lower': 0.0, 'upper': 0.25, 'partial_missing': 0.75}, '2': {'lower': 0.75, 'upper': 1, 'partial_missing': 0.75}}, 'Q2_Q3_0.25': {'1': {'lower': 0.25, 'upper': 0.5, 'partial_missing': 0.75}, '2': {'lower': 0.5, 'upper': 0.75, 'partial_missing': 0.75}}, 'Q1_Q2_1.0': {'1': {'lower': 0.0, 'upper': 0.25, 'partial_missing': 0.0}, '2': {'lower': 0.25, 'upper': 0.5, 'partial_missing': 0.0}}}
Directory '../datasets/california_exp/quantile' already exists.
(20640, 99)
(20640, 99)
Directory '../datasets/california_exp/quantile' already exists.
(20640, 99)
(20640, 99)
Directory '../datasets/california_exp/quantile' already exists.
(20640, 99)

In [26]:
# Show the kernel's working directory; the relative "../datasets/..." paths resolve against it.
# NOTE(review): `os` is not imported by name in any visible cell — presumably it
# arrives through one of the star imports; confirm or add an explicit import.
os.getcwd()


'f:\\Deakin\\MNAR_exp\\generation'

(20640, 13)

(20640, 1)