In [None]:
import pandas as pd
import numpy as np 
from scipy import stats
import seaborn as sns 
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef

# Global plot styling: a 13-colour "Spectral" palette applied through seaborn's theme.
palette = sns.color_palette(palette="Spectral", n_colors=13)
sns.set_theme(style='darkgrid', context='notebook', palette=palette)

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
train_path = '/kaggle/input/playground-series-s4e8/train.csv'
test_path = '/kaggle/input/playground-series-s4e8/test.csv'
df_train, df_test = pd.read_csv(train_path), pd.read_csv(test_path)

# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Fraction of missing values per training column (mean of the boolean NaN mask).
missing_train = df_train.isna().mean()

# Columns whose missing fraction in the training set is strictly above 90%.
too_sparse = missing_train > 0.90
columns_to_drop = missing_train[too_sparse].index

# Remove the same columns from both frames so they keep an identical feature set.
df_train_cleaned = df_train.drop(columns=columns_to_drop)
df_test_cleaned = df_test.drop(columns=columns_to_drop)

In [None]:
# Remove the 'id' column from the training frame (the test frame keeps it).
df_train_cleaned = df_train_cleaned.drop(columns=['id'])

# Name of the label column.
target_column = 'class'

# Categorical features: every object-dtype column except the target.
object_columns = df_train_cleaned.select_dtypes(include=['object']).columns
categorical_columns = object_columns.drop(target_column)

# Numerical features: every non-object column; the target is removed only if
# it happens to be among them (errors='ignore' makes that a no-op otherwise).
non_object_columns = df_train_cleaned.select_dtypes(exclude=['object']).columns
numerical_columns = non_object_columns.drop(target_column, errors='ignore')

In [None]:
# Collapse rarely occurring categories of one column into a single placeholder label
def replace_infrequent_categories(df, column, threshold=70, replacement="Unknown"):
    """Replace infrequent values of ``df[column]`` with a placeholder label.

    A value is infrequent when it occurs ``threshold`` times or fewer in
    ``df[column]``. Missing values are not counted by ``value_counts`` and are
    therefore left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : str
        Name of the column to clean.
    threshold : int, default 70
        Largest occurrence count that is still treated as infrequent.
    replacement : str, default "Unknown"
        Label written in place of every infrequent value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    # Occurrence count of every distinct non-missing value
    value_counts = df[column].value_counts()

    # Values seen at most `threshold` times
    infrequent = value_counts[value_counts <= threshold].index
    print(infrequent)

    # Vectorised replacement: avoids one Python-level call per row
    is_infrequent = df[column].isin(infrequent)
    df[column] = df[column].mask(is_infrequent, replacement)

    return df

# Collapse rare categories in every categorical column.
# The set of levels to keep is learned from the training data only and then
# applied to the test data, so both frames share one vocabulary. Deriving it
# from test-set counts instead would relabel levels the model was trained on
# and keep levels it has never seen.
for col in categorical_columns:
    df_train_cleaned = replace_infrequent_categories(df_train_cleaned, col)

    # Levels that remain in the training column (missing values excluded;
    # they stay missing here and are imputed in a later step).
    kept_categories = df_train_cleaned[col].dropna().unique()

    test_values = df_test_cleaned[col]
    unseen_or_rare = test_values.notna() & ~test_values.isin(kept_categories)
    df_test_cleaned[col] = test_values.mask(unseen_or_rare, "Unknown")

In [None]:
# Skewness of each numerical training column, computed by scipy on the non-missing values.
numeric_train = df_train_cleaned[numerical_columns]
numeric_train.apply(lambda series: stats.skew(series.dropna()))

In [None]:
# Per-column medians, taken from the training set only.
medians = df_train_cleaned[numerical_columns].median()

# Use those training medians to fill numerical gaps in both frames.
for frame in (df_train_cleaned, df_test_cleaned):
    frame[numerical_columns] = frame[numerical_columns].fillna(medians)

In [None]:
# Whatever is still missing in either frame is replaced by the label 'Unknown'.
df_train_cleaned, df_test_cleaned = (
    frame.fillna("Unknown") for frame in (df_train_cleaned, df_test_cleaned)
)

In [None]:
# Remove fully duplicated training rows, keeping the first occurrence of each.
# The surviving rows keep their original index labels.
df_train_cleaned = df_train_cleaned.drop_duplicates(keep='first')

In [None]:
# Z-scores of the numerical training columns (column-wise, population std).
# The result is wrapped in a DataFrame explicitly: whether stats.zscore hands
# back a DataFrame or a plain ndarray for DataFrame input depends on the SciPy
# version, and both `.describe()` below and row-wise filtering need the labels.
numeric_train = df_train_cleaned[numerical_columns]
z_scores = pd.DataFrame(
    stats.zscore(numeric_train.to_numpy(dtype=float)),
    index=numeric_train.index,      # keep the training row labels
    columns=numeric_train.columns,
)

# Descriptive statistics of the Z-scores, rounded to 3 decimal places
z_scores.describe().round(3)

In [None]:
numerical_df = df_train_cleaned[numerical_columns]

# Z-scores as a DataFrame that carries the SAME row index as the training
# frame. Without the explicit index, array input would receive a fresh
# 0..n-1 index, which no longer matches the training rows once duplicates
# have been dropped, and the boolean mask below would be misaligned.
z_scores_df = pd.DataFrame(
    z_scores,
    index=numerical_df.index,
    columns=numerical_df.columns,
)

# Rows are kept only if every numerical Z-score is strictly inside +/- threshold
threshold = 3
keep_mask = (z_scores_df.abs() < threshold).all(axis=1)

# .copy() makes the result an independent frame, so later column assignments
# do not write into a slice of df_train_cleaned.
df_train_no_outliers = df_train_cleaned.loc[keep_mask].copy()

In [None]:
# Encode the target labels as integers. LabelEncoder expects a 1-D label
# vector, so the column is passed as a Series rather than a one-column frame
# (the 2-D form only works through an implicit ravel that emits a warning).
label_encoder = LabelEncoder()
train_encoded_target = label_encoder.fit_transform(df_train_no_outliers[target_column])

# Convert the categorical columns to 'category' dtype. astype() returns new
# frames, which avoids assigning columns into a filtered slice of another frame.
category_dtypes = {col: 'category' for col in categorical_columns}
df_train_no_outliers = df_train_no_outliers.astype(category_dtypes)
df_test_cleaned = df_test_cleaned.astype(category_dtypes)

# Numerical branch: standardise, then cast the result to float32.
to_float32 = FunctionTransformer(lambda x: x.astype(np.float32))
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('convert_to_float32', to_float32),
])

# Categorical branch: one-hot encode as int32, dropping the first level of
# each feature; handle_unknown='ignore' stops unseen levels from raising.
onehot = OneHotEncoder(handle_unknown='ignore', drop='first', dtype=np.int32)
categorical_pipeline = Pipeline([('onehot', onehot)])

# Route each column group to its branch.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_columns),
    ('cat', categorical_pipeline, categorical_columns),
])

def _as_dense_array(matrix):
    """Return ``matrix`` as a dense NumPy array, whether it is sparse or already dense."""
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


# Fit the preprocessing on the training rows, then apply it to the test rows
df_train_encoded = preprocessor.fit_transform(df_train_no_outliers)
df_test_encoded = preprocessor.transform(df_test_cleaned)

# ColumnTransformer returns a sparse matrix only when the combined output is
# sparse enough (its `sparse_threshold` setting); otherwise it already returns
# a dense ndarray, which has no .toarray(). Handle both cases.
train_encoded_dense = _as_dense_array(df_train_encoded)
test_encoded_dense = _as_dense_array(df_test_encoded)

# Feature names, in ColumnTransformer output order: numerical block first
# (names unchanged), then the one-hot columns generated by the encoder
numerical_feature_names = numerical_columns
categorical_feature_names = (
    preprocessor.named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(categorical_columns)
)

# Combine the feature names
all_feature_names = np.concatenate([numerical_feature_names, categorical_feature_names])

# Debugging: Print the number of feature names
print("Number of Features:", len(all_feature_names))

# Convert the transformed dense arrays back into DataFrames
train_encoded_df = pd.DataFrame(train_encoded_dense, columns=all_feature_names)
test_encoded_df = pd.DataFrame(test_encoded_dense, columns=all_feature_names)

In [None]:
# Feature matrix and encoded target for modelling.
X, y = train_encoded_df, train_encoded_target

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
split = train_test_split(X, y, random_state=101, test_size=0.2)
X_train, X_test, y_train, y_test = split

In [None]:
# XGBoost hyper-parameters.
# An L2 'lambda' of 0.3 was tried in the original and is left disabled.
# The value 0.9849805081808766 was noted beside 'eval_metric' in the original
# notebook -- presumably a previously obtained score; TODO confirm.
params = dict(
    colsample_bytree=0.4,
    learning_rate=0.01,
    max_depth=14,
    min_child_weight=1,
    n_estimators=3000,
    subsample=0.9,
    use_label_encoder=False,
    eval_metric='mlogloss',
    device='cuda',
)

# Build the classifier from the parameters and train it on the training split.
xgb_model = XGBClassifier(**params)
xgb_model.fit(X_train, y_train)

# Score the held-out split with the Matthews correlation coefficient.
y_pred = xgb_model.predict(X_test)
mcc = matthews_corrcoef(y_test, y_pred)
print("Matthews Correlation Coefficient:", mcc)

In [None]:
# Counts and shares of the predicted (integer-encoded) labels on the held-out split.
result = pd.DataFrame(y_pred)
predicted_counts = result[0].value_counts()
print("result of train: ", predicted_counts)
print(predicted_counts / len(y_pred))
print()

# Class shares in the raw training data, for comparison.
raw_class_counts = df_train['class'].value_counts()
print(raw_class_counts / len(df_train))

In [None]:
# Predict the encoded class for every test row and map it back to the original labels.
encoded_prediction = xgb_model.predict(test_encoded_df)
prediction = label_encoder.inverse_transform(encoded_prediction)

In [None]:
# Pair every test id with its predicted class label.
result = pd.DataFrame({'id': df_test['id'], 'class': prediction})

# Counts and shares of the predicted classes in the submission.
class_counts = result['class'].value_counts()
print("result of train: ", class_counts)
print(class_counts / len(result))

In [None]:
# Write the submission file to the working directory, without the row index.
result.to_csv(path_or_buf='submission.csv', index=False)