In [2]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Directory added to the import search path; presumably where the local `MLP`
# package imported in the next cell lives.
# NOTE(review): this is a machine-specific absolute path - consider deriving it
# from a configurable project root so the notebook runs on other machines.
PROJECT_ROOT = "/home/coberndorm/Documents/Semestre_IX/Artificial_Intelligence/"

# Test for exactly the string that gets appended. The previous guard checked
# the path without the trailing slash, so it never matched and every re-run of
# this cell appended another duplicate entry to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
import MLP.preprocessing as dm
import MLP.plottingFunctions as pltf

In [4]:
from sklearn.datasets import load_iris

# Load the iris dataset object; `iris` is reused later for its targets.
iris = load_iris()

# Feature matrix rescaled with the project's min-max normalisation helper.
data = dm.normalize_min_max(iris.data)

In [5]:
def _is_missing_value(value) -> bool:
    """Return True for NaN-like values, i.e. values that do not compare equal to themselves."""
    try:
        return bool(value != value)
    except (TypeError, ValueError):
        # The comparison has no plain truth value; treat the value as an ordinary category.
        return False


def _ordered_categories(column) -> tuple:
    """Return (distinct non-missing values in a deterministic order, whether any value is missing)."""
    distinct = {}
    has_missing = False
    for value in column:
        if _is_missing_value(value):
            has_missing = True
        else:
            # A dict keeps first-appearance order, unlike a set.
            distinct.setdefault(value, None)

    categories = list(distinct)
    try:
        # Sorted order when the values are mutually comparable ...
        categories = sorted(categories)
    except TypeError:
        # ... otherwise (mixed types) keep first-appearance order, which is still deterministic.
        pass
    return categories, has_missing


def one_hot_encoding(data: np.ndarray, categorical: list, return_idx = False) -> np.ndarray:
    """
    One-hot encodes categorical variables in a NumPy array.

    The non-categorical columns are kept first, in their original order. They are
    followed, for each column listed in `categorical`, by one 0/1 indicator column
    per distinct value of that column. Indicator columns are ordered by sorted
    category value (first-appearance order if the values cannot be sorted), so the
    layout is reproducible across runs. NaN-like values form a single category of
    their own whose indicator column comes last for that variable.

    Args:
        data: A 2-D NumPy array containing the data to be encoded.
        categorical: A list of integers representing the indices of the categorical variables in the data array.
        return_idx: A boolean value indicating whether to also return the index list described below.

    Returns:
        A NumPy array containing the one-hot encoded data. If `return_idx` is True, a tuple
        `(encoded, idx)` is returned, where `idx` is a copy of `categorical` followed by the
        source column index of every generated indicator column.

    Raises:
        ValueError: If `data` is not two-dimensional (raised when unpacking `data.shape`).
    """
    # Unpacking also enforces that `data` is 2-D.
    _, m = data.shape

    # Set membership keeps the column partition linear in the number of columns.
    categorical_lookup = set(categorical)
    idx_non_categorical = [i for i in range(m) if i not in categorical_lookup]

    # NOTE(review): the returned idx starts with the categorical indices themselves
    # and then repeats each source index once per indicator column; this mirrors the
    # previous behaviour - confirm it is what callers expect.
    idx = list(categorical)

    # Collect every output block and stack once at the end instead of re-allocating
    # the whole array for each indicator column.
    blocks = [data[:, idx_non_categorical]]

    for col in categorical:
        categorical_col = data[:, col]
        categories, has_missing = _ordered_categories(categorical_col)

        for category in categories:
            blocks.append([1 if val == category else 0 for val in categorical_col])
            idx.append(col)

        if has_missing:
            # NaN never equals itself, so an `==` comparison would yield an all-zero column.
            blocks.append([1 if _is_missing_value(val) else 0 for val in categorical_col])
            idx.append(col)

    data_processed = np.column_stack(blocks)

    if return_idx:
        return data_processed, idx
    return data_processed

In [6]:
def is_categorical(data, max_unique=10):
    """
    Determine categorical columns in a data matrix.

    A column is considered categorical when it holds at most `max_unique` distinct
    values. All NaN-like entries (values that do not equal themselves) are counted
    together as one distinct value; in a plain `set` each NaN of a float column is
    a separate member, which inflates the count.

    Args:
        data (numpy.ndarray): Input data matrix, where rows represent samples and columns represent features.
        max_unique (int): Largest number of distinct values a column may have and
            still be reported as categorical. Defaults to 10.

    Returns:
        list: Column indices, in ascending order, whose distinct-value count is at most `max_unique`.

    Raises:
        ValueError: If `data` is not two-dimensional (raised when unpacking `data.shape`).
    """
    # Only the column count is needed; unpacking also enforces a 2-D input.
    _, m = data.shape

    categorical = []

    for i in range(m):
        distinct = set()
        has_missing = False

        for value in data[:, i]:
            try:
                # NaN is the usual value that is not equal to itself.
                missing = bool(value != value)
            except (TypeError, ValueError):
                # Comparison has no plain truth value; count it as an ordinary value.
                missing = False

            if missing:
                has_missing = True
            else:
                distinct.add(value)

        # Missing entries contribute a single extra category, however many there are.
        if len(distinct) + int(has_missing) <= max_unique:
            categorical.append(i)

    return categorical


In [7]:
import pandas as pd
# Iris class labels as plain Python ints.
y = list(map(int, iris.target))

# Append the labels to the normalised features as one extra column.
data_ = np.column_stack((data, y))

# One-hot encode column index 4 (the label column appended above, assuming four
# feature columns) and wrap the result in a DataFrame.
data_processed = pd.DataFrame(one_hot_encoding(data_, categorical=[4]))

In [8]:
# Read test.csv (path is relative to the kernel's working directory).
df = pd.read_csv("test.csv")

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [11]:
# Convert the frame to an ndarray once and reuse it for both the categorical
# column detection and the encoding, instead of converting it twice.
df_values = np.array(df)
one_hot_encoding(df_values, is_categorical(df_values))

array([[1461, 20, 80.0, ..., 0, 1, 0],
       [1462, 20, 81.0, ..., 0, 1, 0],
       [1463, 60, 74.0, ..., 0, 1, 0],
       ...,
       [2917, 20, 160.0, ..., 0, 0, 0],
       [2918, 85, 62.0, ..., 0, 1, 0],
       [2919, 60, 74.0, ..., 0, 1, 0]], dtype=object)