In [6]:
import pandas as pd
import numpy as np  
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.ensemble import IsolationForest

import shap

rs = 101

In [7]:

import pandas as pd

# Load the .data file into a DataFrame without a header
df = pd.read_csv('/kaggle/input/origin-dataset/agaricus-lepiota.data', delimiter=',', header=None)

# missing - cap-diameter, season, stem-hieght, stem-width
# not in real - odor, stalk-shape, gill-size, ring-number, stalk-surface-above-xring, stalk-color-above-ring, population

# change - ring-number ==> has-ring

# Assign the column names
df.columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stem-root', 'stalk-surface-above-ring', 'stem-surface', 'stalk-color-above-ring', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'population', 'habitat']

# Display the first few rows of the DataFrame
print(df.head())

df['cap-diameter'] = np.nan
df['season'] = np.nan
df['stem-height'] = np.nan
df['stem-width'] = np.nan

df = df.drop(['odor', 'stalk-shape', 'gill-size', 'stalk-surface-above-ring', 'stalk-color-above-ring', 'population'], axis = 1)

desired_order = ['class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
                 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
                 'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
                 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
                 'habitat', 'season']

df = df[desired_order]

  class cap-shape cap-surface cap-color does-bruise-or-bleed odor  \
0     p         x           s         n                    t    p   
1     e         x           s         y                    t    a   
2     e         b           s         w                    t    l   
3     p         x           y         w                    t    p   
4     e         x           s         g                    f    n   

  gill-attachment gill-spacing gill-size gill-color  ... stem-surface  \
0               f            c         n          k  ...            s   
1               f            c         b          k  ...            s   
2               f            c         b          n  ...            s   
3               f            c         n          n  ...            s   
4               f            w         b          k  ...            s   

  stalk-color-above-ring stem-color veil-type veil-color has-ring ring-type  \
0                      w          w         p          w        o  

In [8]:
import numpy as np
import pandas as pd

df_train = pd.read_csv('/kaggle/input/playground-series-s4e8/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv')

df_train.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [9]:
import pandas as pd

# Combine df1 and df2 vertically
df_train = pd.concat([df_train, df], axis=0)

# Optionally reset the index if you want a continuous index
df_train = df_train.reset_index(drop=True)

In [10]:
df_train_cleaned = df_train.copy()
df_test_cleaned = df_test.copy()

# Drop 'id' column
df_train_cleaned = df_train_cleaned.drop(['id'], axis=1)

# Define the target column
target_column = 'class'

# ordinal column
ordinal_columns = np.array(['gill-spacing'])
# ordinal column에서 사용된 데이터의 순서를 정의
gill_spacing_order = [[ 'f', 'Unknown', 'c', 'd']]

# Select categorical columns, excluding the target column
categorical_columns = df_train_cleaned.select_dtypes(include=['object']).columns.drop(target_column)
categorical_columns = categorical_columns.drop('gill-spacing')

# Select numerical columns, excluding the target column if it's numerical
numerical_columns = df_train_cleaned.select_dtypes(exclude=['object']).columns.drop(target_column, errors='ignore')

In [11]:
# Define a function to identify and replace infrequent categories
def replace_infrequent_categories(df, column, threshold=70):
    value_counts = df[column].value_counts()
    infrequent = value_counts[value_counts <= threshold].index
    df[column] = df[column].apply(lambda x: "Unknown" if x in infrequent else x)
    return df

# Handle invalid values and infrequent categories for all categorical columns
for col in categorical_columns:
    df_train_cleaned = replace_infrequent_categories(df_train_cleaned, col)
    df_test_cleaned = replace_infrequent_categories(df_test_cleaned, col)

# ordinal column에도 똑같이 적용
df_train_cleaned = replace_infrequent_categories(df_train_cleaned, ordinal_columns[0])
df_test_cleaned = replace_infrequent_categories(df_test_cleaned, ordinal_columns[0])

In [12]:
# Compute medians for numerical columns in the training set
medians = df_train_cleaned[numerical_columns].median()

# Fill missing values in the training and testing sets
df_train_cleaned[numerical_columns] = df_train_cleaned[numerical_columns].fillna(medians)
df_test_cleaned[numerical_columns] = df_test_cleaned[numerical_columns].fillna(medians)

In [13]:
# Impute any missing values with 'Unknown'
df_train_cleaned = df_train_cleaned.fillna("Unknown")
df_test_cleaned = df_test_cleaned.fillna("Unknown")

In [14]:
#ㅁㄴㅇ
df_train_cleaned = df_train_cleaned.drop_duplicates()

In [15]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable
train_encoded_target = label_encoder.fit_transform(df_train_cleaned[['class']])

# Convert categorical columns to 'category' dtype 
df_train_cleaned[categorical_columns] = df_train_cleaned[categorical_columns].astype('category')
df_test_cleaned[categorical_columns] = df_test_cleaned[categorical_columns].astype('category')

# Convert ordinal columns to 'category' dtype 
df_train_cleaned[ordinal_columns] = df_train_cleaned[ordinal_columns].astype('category')
df_test_cleaned[ordinal_columns] = df_test_cleaned[ordinal_columns].astype('category')

# Define the numerical pipeline
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('convert_to_float32', FunctionTransformer(lambda x: x.astype(np.float32)))
])

# Define the categorical pipeline
ordinal_pipeline = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(dtype=np.int32, handle_unknown='use_encoded_value', unknown_value=-1, categories=gill_spacing_order))
])

# Define the categorical pipeline
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', dtype=np.int32, handle_unknown='ignore'))
])

# Combine both numerical and categorical pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_columns),
        ('ord', ordinal_pipeline, ordinal_columns),
        ('cat', categorical_pipeline, categorical_columns)
    ]
)


# Apply the transformations using the pipeline
df_train_encoded = preprocessor.fit_transform(df_train_cleaned)
df_test_encoded = preprocessor.transform(df_test_cleaned)

# Ensure outputs are dense arrays
train_encoded_dense = df_train_encoded.toarray()
test_encoded_dense = df_test_encoded.toarray()

# Get feature names
numerical_feature_names = numerical_columns  # Assuming numerical columns do not change names
ordinal_feature_names = ordinal_columns
categorical_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_columns)

# Combine the feature names
all_feature_names = np.concatenate([numerical_feature_names, ordinal_feature_names, categorical_feature_names])

# Debugging: Print the number of feature names
print("Number of Features:", len(all_feature_names))

# Convert the transformed dense arrays back into DataFrames
df_train_preprocessed = pd.DataFrame(train_encoded_dense, columns=all_feature_names)
df_test_preprocessed = pd.DataFrame(test_encoded_dense, columns=all_feature_names)

Number of Features: 130


In [16]:
# Apply Isolation Forest for outlier detection
isolation_forest = IsolationForest(contamination=0.05, random_state=42)
outlier_labels = isolation_forest.fit_predict(df_train_preprocessed)

# Filter out outliers
non_outliers_mask = outlier_labels != -1
df_train_preprocessed = df_train_preprocessed[non_outliers_mask]
train_encoded_target = train_encoded_target[non_outliers_mask]

In [17]:
high_negative_shap = ['cap-surface_l', 'gill-color_e', 'cap-color_b', 'gill-color_f', 'veil-color_y', 'stem-color_b', 'ring-type_m', 'stem-color_l']
# 보류: 'cap-color_o', 'habitat_g', 'cap-surface_y' not working as expected
# 'gill-attachment_f', 'habitat_l'

df_train_preprocessed = df_train_preprocessed.drop(high_negative_shap, axis = 1)
df_test_preprocessed = df_test_preprocessed.drop(high_negative_shap, axis = 1)

In [18]:
# Separate features (X) and target variable (y)
X = df_train_preprocessed
y = train_encoded_target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)

In [19]:
len(train_encoded_target)

2962892

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [18]:
import torch

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the ANN model
model = Sequential()

# Adding the input layer and the first hidden layer
model.add(Dense(units=32, activation='relu', input_shape=(X_train.shape[1],)))

# Adding more hidden layers (optional)
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=8, activation='relu'))

# Adding the output layer
# For binary classification, use sigmoid activation
# For multi-class classification, use softmax activation
model.add(Dense(units=1, activation='sigmoid'))

In [19]:
import tensorflow.keras.backend as K
def mcc_loss(y_true, y_pred):
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos
    
    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos
    
    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)
    
    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)
    
    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) + K.epsilon())
    
    mcc = numerator / (denominator + K.epsilon())
    
    return 1 - mcc  # Since we're minimizing the loss, we subtract from 1

In [24]:
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',  # or 'categorical_crossentropy' for multi-class
              metrics=['accuracy',mcc_loss])

In [28]:
# Set up early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=4, validation_split=0.2, callbacks=[early_stopping])

torch.Size([1480550, 2, 17])
Epoch [1/10], Loss: 0.2553
Epoch [2/10], Loss: 0.2531
Epoch [3/10], Loss: 0.2504
Epoch [4/10], Loss: 0.2520
Epoch [5/10], Loss: 0.2525
Epoch [6/10], Loss: 0.2462
Epoch [7/10], Loss: 0.2484
Epoch [8/10], Loss: 0.2671
Epoch [9/10], Loss: 0.2365
Epoch [10/10], Loss: 0.2436
Training completed.


In [29]:
y_pred = model.predict(X_test)
y_pred = [0 if x < 0.5 else 1 for x in y_pred3]

# Evaluate the model using Matthews correlation coefficient
mcc = matthews_corrcoef(y_test, y_pred3)
print("Matthews Correlation Coefficient:", mcc) #0.9710405490083444

Encoded Representations (Before Projection Head):
Anchor: tensor([[5.0767, 0.0000, 0.1394,  ..., 0.0000, 0.4668, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [2.2037, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.1411],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 3.4046,  ..., 0.0000, 0.0000, 0.0000]])
Positive/Negative: tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.7291, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.1231, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.2017,  ..., 0.6955, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

Final Projections (After Projection Head):
Anchor: tensor([[-0.0996, -0.0183,  0.0113,  ..., -0.0133, -0.0033, -0.1068],
     

In [35]:
print(df_train['class'].value_counts())
print(df_train['class'].value_counts() / len(df_train))

class
p    1705396
e    1411549
Name: count, dtype: int64
class
p    0.547137
e    0.452863
Name: count, dtype: float64


In [36]:
result = pd.DataFrame(y_pred)
print("result of train: ", result[0].value_counts())
print(result[0].value_counts() / len(y_pred))

result of train:  0
1    322504
0    269716
Name: count, dtype: int64
0
1    0.544568
0    0.455432
Name: count, dtype: float64


In [37]:
test_preds = xgb_model.predict(df_test_preprocessed)
test_preds = label_encoder.inverse_transform(test_preds)

In [38]:
result = pd.DataFrame(df_test['id'])
result['class'] = test_preds
print("result of train: ", result['class'].value_counts())
print(result['class'].value_counts() / len(test_preds))

result of train:  class
p    1131251
e     946713
Name: count, dtype: int64
class
p    0.544404
e    0.455596
Name: count, dtype: float64


In [39]:
result = pd.DataFrame(df_test['id'])
result['class'] = test_preds
print("result of train: ", result['class'].value_counts())
print(result['class'].value_counts() / len(test_preds))

result of train:  class
p    1131251
e     946713
Name: count, dtype: int64
class
p    0.544404
e    0.455596
Name: count, dtype: float64


In [40]:
output = pd.DataFrame({'id': df_test['id'],
                       'class': test_preds})

output.to_csv('submission.csv', index=False)

output.head()

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e
