In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from category_encoders import BinaryEncoder, BaseNEncoder, CountEncoder, TargetEncoder, CatBoostEncoder, \
                             JamesSteinEncoder, MEstimateEncoder, LeaveOneOutEncoder, PolynomialEncoder, \
                             HelmertEncoder, BackwardDifferenceEncoder, HashingEncoder, SumEncoder
import numpy as np
import os


from scipy import sparse


In [26]:

# Load the dataset
data = pd.read_csv('data22.csv')


# Create a folder to store encoded files.
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('encoded_data', exist_ok=True)


# Explicitly convert columns to categorical if they are not already
categorical_columns = ['package', 'preservative', 'sterilization', 'temperature']  # Add 'temperature' if it's categorical too

# Fail early with a readable message instead of a bare KeyError on the first
# missing column.
missing_columns = [col for col in categorical_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"Expected categorical columns missing from data22.csv: {missing_columns}")

# Cast every listed column to the pandas 'category' dtype in one call.
data = data.astype({col: 'category' for col in categorical_columns})


In [9]:

# Method 1: Label Encoding
# Only the columns listed in `categorical_columns` are encoded; 'target' and any
# other column keep their original values (encoding the whole frame would
# replace the target values with integer rank codes).
# One encoder is fitted per column and kept in `label_encoders` so each
# column's category -> code mapping stays available.
label_encoders = {}
le_data = data.copy()
for col in categorical_columns:
    le = LabelEncoder()
    le_data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
le_data.to_csv('encoded_data/label_encoded.csv', index=False)


In [10]:

# Method 2: Ordinal Encoding
# Fit the encoder on the categorical feature columns only, so 'target' (and any
# other non-listed column) is written out with its original values rather than
# being replaced by ordinal codes.
oe = OrdinalEncoder()
oe_codes = oe.fit_transform(data[categorical_columns])

# Start from a copy of the input so column order and index are preserved, then
# overwrite each categorical column with its encoded values.
oe_data = data.copy()
for position, col in enumerate(categorical_columns):
    oe_data[col] = oe_codes[:, position]

oe_data.to_csv('encoded_data/ordinal_encoded.csv', index=False)


In [11]:

# Method 3: One-hot encoding
# Only the categorical feature columns are expanded. 'target' is never passed
# to the encoder, so there is no need to create one indicator column per
# distinct target value and then remove them again by substring match.
ohe = OneHotEncoder()
ohe_matrix = ohe.fit_transform(data[categorical_columns])

# Densify the encoder output and label the indicator columns; reuse the
# index of `data` so the 'target' assignment below aligns row by row.
ohe_data = pd.DataFrame(
    ohe_matrix.toarray(),
    columns=ohe.get_feature_names_out(categorical_columns),
    index=data.index,
)

# Add the 'target' column from the original data
ohe_data['target'] = data['target']


ohe_data.to_csv('encoded_data/onehot_encoded.csv', index=False)



In [12]:

# Method 4: Binary Encoding
# Fit a BinaryEncoder on the full frame and write the transformed frame to CSV.
binary_csv_path = 'encoded_data/binary_encoded.csv'

be = BinaryEncoder()
be_data = be.fit_transform(data)
be_data.to_csv(path_or_buf=binary_csv_path, index=False)


In [13]:

# Method 5: Base-N Encoding
# Same flow as the other encoder cells, using a BaseNEncoder configured with base 3.
basen_csv_path = 'encoded_data/basen_encoded.csv'

bne = BaseNEncoder(base=3)
bne_data = bne.fit_transform(data)
bne_data.to_csv(path_or_buf=basen_csv_path, index=False)


In [14]:
# Method 6: Frequency Encoding
# Adds a 'frequency_encoded' column holding the relative frequency of each
# row's 'package' value and writes the result to CSV.
# The result is built in a separate frame (`frequency_data`) rather than by
# mutating `data`, so the encoder cells that follow keep seeing the original
# columns and the cell gives the same result when re-run.
if 'package' in data.columns:
    package_frequencies = data['package'].value_counts(normalize=True)
    frequency_data = data.assign(
        # Mapping a categorical column can yield a categorical result; force a
        # plain float column so the frequencies are numeric.
        frequency_encoded=data['package'].map(package_frequencies).astype(float)
    )
    frequency_data.to_csv('encoded_data/frequency_encoded.csv', index=False)
else:
    print("Column 'package' does not exist in the DataFrame.")


In [15]:

# Method 7: Hashing Encoding
# Exclude any column whose name contains 'frequency' (for example a
# 'frequency_encoded' column, if one is present in `data`) BEFORE hashing.
# The other encoder cells drop such columns after transforming; here it has to
# happen on the input so that column cannot be mixed into the hashed components
# or passed through into the CSV.
hashing_input = data.drop(columns=data.filter(like='frequency').columns)

hasher = HashingEncoder(n_components=8)
hashed_data = hasher.fit_transform(hashing_input)
hashed_data.to_csv('encoded_data/hashing_encoded.csv', index=False)


In [16]:

# Method 8: Helmert Encoding
he = HelmertEncoder()
he_encoded = he.fit_transform(data)

# Remove every output column whose name contains 'frequency' (such as a
# 'frequency_encoded' column, if one is present in `data`).
he_frequency_columns = he_encoded.filter(like='frequency').columns
he_data = he_encoded.drop(columns=he_frequency_columns)

he_data.to_csv(path_or_buf='encoded_data/helmert_encoded.csv', index=False)




In [17]:

# Method 9: Sum Coding
sum_encoder = SumEncoder()
sum_encoded = sum_encoder.fit_transform(data)

# Strip any column whose name contains 'frequency' before saving.
sum_frequency_columns = sum_encoded.filter(like='frequency').columns
sum_data = sum_encoded.drop(columns=sum_frequency_columns)

sum_data.to_csv(path_or_buf='encoded_data/sum_encoded.csv', index=False)




In [18]:
# Method 10: Backward Difference Encoding
bde = BackwardDifferenceEncoder()
bde_encoded = bde.fit_transform(data)

# Strip any column whose name contains 'frequency' before saving.
bde_frequency_columns = bde_encoded.filter(like='frequency').columns
bde_data = bde_encoded.drop(columns=bde_frequency_columns)

bde_data.to_csv(path_or_buf='encoded_data/backwarddifference_encoded.csv', index=False)




In [19]:

# Method 11: Polynomial Encoding
# The encoder only ever sees the categorical feature columns; the original
# 'target' column is attached to the encoded frame afterwards.
pe = PolynomialEncoder()
categorical_features = data[categorical_columns]
pe_data = pe.fit_transform(categorical_features).assign(target=data['target'])

pe_data.to_csv(path_or_buf='encoded_data/polynomial_encoded.csv', index=False)



In [27]:

# Method 12: Leave-One-Out Encoding
# Supervised encoder: fitted on the full frame with data['target'] as the response.
loe = LeaveOneOutEncoder()
loe_encoded = loe.fit_transform(data, data['target'])

# Strip any column whose name contains 'frequency' before saving.
loe_frequency_columns = loe_encoded.filter(like='frequency').columns
loe_data = loe_encoded.drop(columns=loe_frequency_columns)

loe_data.to_csv(path_or_buf='encoded_data/leaveoneout_encoded.csv', index=False)



In [21]:


# Method 13: CatBoost Encoding
# Supervised encoder: fitted on the full frame with data['target'] as the response.
cbe = CatBoostEncoder()
cbe_encoded = cbe.fit_transform(data, data['target'])

# Strip any column whose name contains 'frequency' before saving.
cbe_frequency_columns = cbe_encoded.filter(like='frequency').columns
cbe_data = cbe_encoded.drop(columns=cbe_frequency_columns)

cbe_data.to_csv(path_or_buf='encoded_data/catboost_encoded.csv', index=False)



In [22]:


# Method 14: James-Stein Encoding
# Supervised encoder: fitted on the full frame with data['target'] as the response.
jse = JamesSteinEncoder()
jse_data = jse.fit_transform(data, data['target'])

# Remove any column whose name contains 'frequency', as the other
# target-encoder cells do. This drop was previously commented out, which made
# this the only such CSV that could still carry a frequency column. When no
# such column exists the drop is a no-op.
jse_data = jse_data.drop(columns=jse_data.filter(like='frequency').columns)


jse_data.to_csv('encoded_data/jamesstein_encoded.csv', index=False)


In [23]:
# Method 15: M-Estimate Encoding
# Supervised encoder: fitted on the full frame with data['target'] as the response.
me = MEstimateEncoder()
me_encoded = me.fit_transform(data, data['target'])

# Strip any column whose name contains 'frequency' before saving.
me_frequency_columns = me_encoded.filter(like='frequency').columns
me_data = me_encoded.drop(columns=me_frequency_columns)

me_data.to_csv(path_or_buf='encoded_data/mestimate_encoded.csv', index=False)


In [24]:
# Method 16: Multiple Correspondence Analysis
import prince  # For Multiple Correspondence Analysis

# MCA is fitted on the categorical feature columns only. Fitting on the whole
# frame would feed 'target' (and any other extra column in `data`) into the
# decomposition, leaking the label into the components that are saved next to it.
mca_features = data[categorical_columns]

# Initialize and fit MCA
# Number of components can be chosen based on the number of categorical features or desired dimensionality
mca = prince.MCA(n_components=4, random_state=42)
# `mca_data` holds whatever fit() returns; the name is kept so anything that
# referenced it keeps working.
mca_data = mca.fit(mca_features)

# Transform the same feature columns and give the components stable names
mca_transformed = mca.transform(mca_features)
mca_transformed.columns = [f'MCA_{i}' for i in range(mca_transformed.shape[1])]

# Attach the untouched target so the CSV is ready for modelling
mca_transformed['target'] = data['target']


mca_transformed.to_csv('encoded_data/mca_encoded.csv', index=False)

