In [3]:
import pandas as pd
import numpy as np 
from scipy import stats
import seaborn as sns 
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.ensemble import IsolationForest

# Seed passed as random_state to the train/validation split further down
rs = 101

# Plot styling: a 13-colour "Spectral" palette on seaborn's dark grid
palette = sns.color_palette("Spectral", n_colors=13)
sns.set_theme(style='darkgrid', context='notebook', palette=palette)

In [4]:
# Competition CSVs as mounted in the Kaggle environment
train_path = '/kaggle/input/playground-series-s4e8/train.csv'
test_path = '/kaggle/input/playground-series-s4e8/test.csv'

df_train, df_test = pd.read_csv(train_path), pd.read_csv(test_path)

# Quick look at the first training rows
df_train.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [5]:
# Work on copies so the raw frames stay untouched; only the training copy loses 'id'
df_train_cleaned = df_train.copy().drop(columns=['id'])
df_test_cleaned = df_test.copy()

# Define the target column
target_column = 'class'

# 'gill-spacing' is treated as an ordinal feature with this category order
ordinal_columns = np.array(['gill-spacing'])
gill_spacing_order = np.array(['f', 'Unknown', 'c', 'd'])

# Object-dtype training columns, minus the target and the ordinal feature,
# form the nominal categorical group
object_columns = df_train_cleaned.select_dtypes(include=['object']).columns
categorical_columns = object_columns.drop([target_column, 'gill-spacing'])

# Every non-object training column is numerical (the target is removed only if present)
non_object_columns = df_train_cleaned.select_dtypes(exclude=['object']).columns
numerical_columns = non_object_columns.drop(target_column, errors='ignore')

In [6]:
# Define a function to identify and replace infrequent categories
def replace_infrequent_categories(df, column, threshold=70):
    """Collapse rare values of ``df[column]`` into the label ``"Unknown"``.

    A value is rare when it occurs ``threshold`` times or fewer in
    ``df[column]``. Missing values are not counted and are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : str
        Name of the column to clean (intended for object/string columns).
    threshold : int, default 70
        Largest occurrence count that is still considered rare.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call-site convenience.
    """
    value_counts = df[column].value_counts()
    infrequent = value_counts.index[value_counts <= threshold]

    # Nothing to collapse: leave the column exactly as it is.
    if len(infrequent) == 0:
        return df

    # Vectorised membership test instead of a per-row Python lambda; NaN is
    # never part of `infrequent` (value_counts drops it), so it is preserved.
    df[column] = df[column].mask(df[column].isin(infrequent), "Unknown")
    return df

# Handle invalid values and infrequent categories for all categorical columns
# (the nominal ones first, then the ordinal 'gill-spacing').
#
# Which categories count as rare is decided on the TRAINING set only and the
# resulting vocabulary is then applied to the test set. Deciding it separately
# from the test set's own frequencies would let the same category be kept in
# one frame and collapsed to "Unknown" in the other.
for col in [*categorical_columns, *ordinal_columns]:
    df_train_cleaned = replace_infrequent_categories(df_train_cleaned, col)

    # Categories that survived in the training column (may include "Unknown")
    kept_categories = df_train_cleaned[col].dropna().unique()

    test_values = df_test_cleaned[col]
    # Missing test values are left alone here; they are imputed in a later cell.
    keep_mask = test_values.isin(kept_categories) | test_values.isna()
    df_test_cleaned[col] = test_values.where(keep_mask, "Unknown")

In [7]:
# Skewness of each numerical training feature (scipy's estimator), ignoring missing values
numeric_train_features = df_train_cleaned[numerical_columns]
numeric_train_features.apply(lambda feature: stats.skew(feature.dropna()))

cap-diameter    3.972607
stem-height     1.926681
stem-width      1.235426
dtype: float64

In [8]:
# Per-column medians are taken from the training set only...
medians = df_train_cleaned[numerical_columns].median()

# ...and used to fill the numerical gaps of both the training and the test frame
for frame in (df_train_cleaned, df_test_cleaned):
    frame[numerical_columns] = frame[numerical_columns].fillna(medians)

In [9]:
# Any value that is still missing at this point becomes the literal 'Unknown'
df_train_cleaned, df_test_cleaned = (
    frame.fillna("Unknown") for frame in (df_train_cleaned, df_test_cleaned)
)

In [10]:
# Keep only the first occurrence of every fully identical training row
is_repeated_row = df_train_cleaned.duplicated()
df_train_cleaned = df_train_cleaned[~is_repeated_row]

In [11]:
# Standardise the numerical training columns into Z-scores
numeric_train_values = df_train_cleaned[numerical_columns]
z_scores = stats.zscore(numeric_train_values)

# Summary statistics of the Z-scores, rounded to 3 decimals (the max row shows how extreme the tails are)
z_score_summary = z_scores.describe()
z_score_summary.round(3)

Unnamed: 0,cap-diameter,stem-height,stem-width
count,3116943.0,3116943.0,3116943.0
mean,0.0,0.0,0.0
std,1.0,1.0,1.0
min,-1.348,-2.351,-1.378
25%,-0.642,-0.622,-0.764
50%,-0.12,-0.173,-0.186
75%,0.414,0.393,0.553
max,15.964,30.511,11.333


In [12]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable.
# LabelEncoder works on a 1-D label vector, so the column is passed as a Series;
# a one-column DataFrame is only accepted through a DataConversionWarning,
# which the global warning filter at the top of the notebook would hide.
train_encoded_target = label_encoder.fit_transform(df_train_cleaned['class'])

# Convert the nominal and the ordinal columns of both frames to 'category' dtype
for frame in (df_train_cleaned, df_test_cleaned):
    for column_group in (categorical_columns, ordinal_columns):
        frame[column_group] = frame[column_group].astype('category')


# Category order handed to the ordinal encoder (one inner list per ordinal column)
gill_spacing_order = [['f', 'Unknown', 'c', 'd']]

# Numerical pipeline: standardise, then cast the result to float32
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('convert_to_float32', FunctionTransformer(lambda values: values.astype(np.float32))),
])

# Ordinal pipeline: integer codes following `gill_spacing_order`;
# values outside that list are encoded as -1
ordinal_encoder = OrdinalEncoder(
    categories=gill_spacing_order,
    dtype=np.int32,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
ordinal_pipeline = Pipeline(steps=[('ordinal', ordinal_encoder)])

# Categorical pipeline: one-hot encoding with the first level of each column dropped
onehot_encoder = OneHotEncoder(drop='first', dtype=np.int32, handle_unknown='ignore')
categorical_pipeline = Pipeline(steps=[('onehot', onehot_encoder)])

# Route each column group through its own pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_columns),
        ('ord', ordinal_pipeline, ordinal_columns),
        ('cat', categorical_pipeline, categorical_columns),
    ]
)


# Fit the preprocessor on the training frame and apply it to both frames
df_train_encoded = preprocessor.fit_transform(df_train_cleaned)
df_test_encoded = preprocessor.transform(df_test_cleaned)

# Ensure outputs are dense arrays.
# ColumnTransformer returns a SciPy sparse matrix only when the stacked output
# is sparse enough (its `sparse_threshold`); otherwise it already returns a
# dense ndarray, on which `.toarray()` does not exist. Handle both cases.
train_encoded_dense = (
    df_train_encoded.toarray() if hasattr(df_train_encoded, "toarray")
    else np.asarray(df_train_encoded)
)
test_encoded_dense = (
    df_test_encoded.toarray() if hasattr(df_test_encoded, "toarray")
    else np.asarray(df_test_encoded)
)

# Get feature names, in the same order as the ColumnTransformer blocks (num, ord, cat)
numerical_feature_names = numerical_columns  # Assuming numerical columns do not change names
ordinal_feature_names = ordinal_columns
categorical_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_columns)

# Combine the feature names
all_feature_names = np.concatenate([numerical_feature_names, ordinal_feature_names, categorical_feature_names])

# Debugging: Print the number of feature names
print("Number of Features:", len(all_feature_names))

# Convert the transformed dense arrays back into DataFrames
df_train_preprocessed = pd.DataFrame(train_encoded_dense, columns=all_feature_names)
df_test_preprocessed = pd.DataFrame(test_encoded_dense, columns=all_feature_names)

Number of Features: 120


In [13]:
# Outlier detection with an Isolation Forest (contamination set to 0.05, fixed seed)
isolation_forest = IsolationForest(random_state=42, contamination=0.05)
outlier_labels = isolation_forest.fit_predict(df_train_preprocessed)

# Rows labelled -1 are discarded; features and encoded target are filtered
# with the same mask so they stay aligned
non_outliers_mask = outlier_labels != -1
df_train_preprocessed = df_train_preprocessed.loc[non_outliers_mask]
train_encoded_target = train_encoded_target[non_outliers_mask]

In [14]:
# One-hot columns removed from both feature frames
# (presumably selected from a SHAP analysis not shown in this notebook)
high_negative_shap = [
    'cap-surface_l', 'gill-color_e', 'cap-color_b', 'gill-color_f',
    'veil-color_y', 'stem-color_b', 'ring-type_m', 'stem-color_l',
]
# On hold: 'cap-color_o', 'habitat_g', 'cap-surface_y' not working as expected
# 'gill-attachment_f', 'habitat_l'

df_train_preprocessed = df_train_preprocessed.drop(columns=high_negative_shap)
df_test_preprocessed = df_test_preprocessed.drop(columns=high_negative_shap)

In [15]:
# Feature matrix (X) and encoded target (y) used for modelling
X, y = df_train_preprocessed, train_encoded_target

# Hold out 15% of the rows for evaluation; `rs` makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=rs, test_size=0.15
)

In [16]:
# XGBoost hyper-parameters.
# Changes versus the earlier version of this dict (none of them alters the fitted model):
#   * 'session_id' was dropped: it is not an XGBoost parameter, so it was ignored.
#     NOTE(review): if a fixed seed was intended, set 'random_state' instead.
#   * 'use_label_encoder' was dropped: it is deprecated in XGBoost and the target
#     is already integer-encoded by LabelEncoder.
#   * 'eval_metric' is now 'logloss': the target is binary, 'mlogloss' is the
#     multi-class metric. No eval_set is passed to fit(), so this does not affect training.
# Scores noted next to the old entries (presumably validation MCC of earlier
# runs): 0.9843039832986613, 0.9846
params = {
    'colsample_bytree': 0.2982237397804184,
    'learning_rate': 0.006240091697982013,
    'max_depth': 17,
    'min_child_weight': 0.34115942918259007,
    'n_estimators': 4000,
    'subsample': 1.0,
    'reg_alpha': 7.416406249487463,
    'reg_lambda': 1e-09,
    'eval_metric': 'logloss',
    'device': 'cuda'
}

# Initialize the XGBClassifier with the defined parameters
xgb_model = XGBClassifier(**params)

# Fit the model to the training part of the split
xgb_model.fit(X_train, y_train)

# Predict on the held-out part of the split
y_pred = xgb_model.predict(X_test)

# Evaluate the model using Matthews correlation coefficient
mcc = matthews_corrcoef(y_test, y_pred)
print("Matthews Correlation Coefficient:", mcc)

Matthews Correlation Coefficient: 0.9843448054199812


In [23]:
# Absolute and relative class frequencies in the raw training data
train_class_counts = df_train['class'].value_counts()
print(train_class_counts)
print(train_class_counts / len(df_train))

class
p    1705396
e    1411549
Name: count, dtype: int64
class
p    0.547137
e    0.452863
Name: count, dtype: float64


In [24]:
# Absolute and relative frequencies of the encoded classes predicted for the held-out split
result = pd.DataFrame(y_pred)
predicted_counts = result[0].value_counts()
print("result of train: ", predicted_counts)
print(predicted_counts / len(y_pred))

result of train:  0
1    317731
0    274490
Name: count, dtype: int64
0
1    0.536507
0    0.463493
Name: count, dtype: float64


In [25]:
# Predict encoded classes for the competition test set, then map them back to the original labels
test_pred_codes = xgb_model.predict(df_test_preprocessed)
test_preds = label_encoder.inverse_transform(test_pred_codes)

In [26]:
# Pair every test id with its predicted label and show the predicted class balance.
# The printed label now says "test": these are predictions for the test set,
# not for the training data.
result = pd.DataFrame(df_test['id'])
result['class'] = test_preds
test_class_counts = result['class'].value_counts()
print("result of test: ", test_class_counts)
print(test_class_counts / len(test_preds))

result of train:  class
p    1133373
e     944591
Name: count, dtype: int64
class
p    0.545425
e    0.454575
Name: count, dtype: float64


In [27]:
# Display the predicted label column
result.loc[:, 'class']

0          e
1          p
2          p
3          p
4          e
          ..
2077959    p
2077960    p
2077961    p
2077962    e
2077963    e
Name: class, Length: 2077964, dtype: object

In [28]:
# Rebuild the id/prediction frame and show the predicted class balance.
# The printed label now says "test": these are predictions for the test set,
# not for the training data.
result = pd.DataFrame(df_test['id'])
result['class'] = test_preds
test_class_counts = result['class'].value_counts()
print("result of test: ", test_class_counts)
print(test_class_counts / len(test_preds))

result of train:  class
p    1133373
e     944591
Name: count, dtype: int64
class
p    0.545425
e    0.454575
Name: count, dtype: float64


In [None]:
# Submission frame: test ids next to their predicted labels
submission_columns = {'id': df_test['id'], 'class': test_preds}
output = pd.DataFrame(submission_columns)

# Write the submission file without the index column
output.to_csv('submission.csv', index=False)

# Preview the first rows of what was written
output.head()

In [None]:
# Writes `result` to the same 'submission.csv' path used by the previous cell, replacing that file
result.to_csv(path_or_buf='submission.csv', index=False)