# 1. Handling Categorical Values

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Load the dataset
df = pd.read_csv("tugas3_genap.csv")

# Display the first few rows to inspect the data
print("First few rows of the dataset:")
print(df.head())

# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns
print(f"Categorical columns: {categorical_columns}")

# Check if required columns exist
required_columns = ['education', 'job', 'marital', 'age']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"Warning: The following required columns are missing: {missing_columns}")

# Apply different encoding methods to 3 columns:

# 1. Label Encoding on one column (example: 'education')
if 'education' in df.columns:
    label_encoder = LabelEncoder()
    df['education_encoded'] = label_encoder.fit_transform(df['education'])

# 2. One-Hot Encoding on 'job' (if the column exists)
if 'job' in df.columns:
    df = pd.get_dummies(df, columns=['job'], drop_first=True)

# 3. Ordinal Encoding on 'marital' (if the column exists)
if 'marital' in df.columns:
    ordinal_encoder = OrdinalEncoder(categories=[['single', 'married', 'divorced']])
    df['marital_encoded'] = ordinal_encoder.fit_transform(df[['marital']])

# Binning the 'age' column into 4 groups if it exists
if 'age' in df.columns:
    bins = [0, 25, 40, 60, 100]
    labels = ['Muda', 'Dewasa', 'Paruh Baya', 'Lanjut Usia']
    df['age_binned'] = pd.cut(df['age'], bins=bins, labels=labels)

# View the changes - we will display only relevant columns that were modified or newly created
columns_to_display = ['education', 'education_encoded', 'job', 'marital', 'marital_encoded', 'age', 'age_binned']
# Check if these columns exist in the dataframe before attempting to display them
existing_columns = [col for col in columns_to_display if col in df.columns]
print("Displaying the modified dataset with relevant columns:")
print(df[existing_columns].head())


First few rows of the dataset:
   age         job  marital  education default  balance housing loan  contact  \
0   59      admin.  married  secondary      no     2343     yes   no  unknown   
1   56      admin.  married  secondary      no       45      no   no  unknown   
2   41  technician  married  secondary      no     1270     yes   no  unknown   
3   55    services  married  secondary      no     2476     yes   no  unknown   
4   54      admin.  married   tertiary      no      184      no   no  unknown   

   day month  duration  campaign  pdays  previous poutcome deposit  
0    5   may      1042         1     -1         0  unknown     yes  
1    5   may      1467         1     -1         0  unknown     yes  
2    5   may      1389         1     -1         0  unknown     yes  
3    5   may       579         1     -1         0  unknown     yes  
4    5   may       673         2     -1         0  unknown     yes  
Categorical columns: Index(['job', 'marital', 'education', 'default'

# 2. Data Normalization

In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np

# Min-Max Scaling on 'balance' column
scaler_min_max = MinMaxScaler()
df['balance_scaled'] = scaler_min_max.fit_transform(df[['balance']])

# Z-Score scaling on 'duration' column
scaler_zscore = StandardScaler()
df['duration_scaled'] = scaler_zscore.fit_transform(df[['duration']])

# Decimal scaling on 'campaign' column
df['campaign_scaled'] = df['campaign'] / (10 ** np.ceil(np.log10(df['campaign'].max())))

# View the changes
df[['balance', 'balance_scaled', 'duration', 'duration_scaled', 'campaign', 'campaign_scaled']].head()


Unnamed: 0,balance,balance_scaled,duration,duration_scaled,campaign,campaign_scaled
0,2343,0.104371,1042,1.930226,1,0.01
1,45,0.078273,1467,3.154612,1,0.01
2,1270,0.092185,1389,2.929901,1,0.01
3,2476,0.105882,579,0.596366,1,0.01
4,184,0.079851,673,0.867171,2,0.02


# 3. Dimensionality Reduction

In [19]:
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Assuming df is already loaded

# Check the columns in the dataset
print("Columns in the dataset:", df.columns)

# Encode the target column 'deposit' to numeric values
label_encoder = LabelEncoder()
df['deposit_encoded'] = label_encoder.fit_transform(df['deposit'])

# Remove non-numeric columns (e.g., categorical columns like 'marital', 'job', etc.)
df_numeric = df.select_dtypes(include=['float64', 'int64'])

# Feature Selection based on correlation with the target (now 'deposit_encoded')
correlation_matrix = df_numeric.corr()
target_corr = correlation_matrix['deposit_encoded'].abs().sort_values(ascending=False)

# Lowering the correlation threshold to select more features
high_corr_features = target_corr[target_corr > 0.2].index.tolist()  # Adjusted to 0.2 to select more features

# Print selected features
print(f"Features highly correlated with target: {high_corr_features}")

# Check how many features were selected
print(f"Number of features selected: {len(high_corr_features)}")

# Ensure at least 5 features are selected, otherwise, adjust the threshold or manually add more features
if len(high_corr_features) < 5:
    print("Selected fewer than 5 features. Manually adding features to reach 5.")
    # Manually add more features to ensure we have 5 (e.g., using domain knowledge or more features from df)
    additional_features = ['balance', 'age', 'pdays']  # Example, add more features if necessary
    high_corr_features += additional_features[:5 - len(high_corr_features)]

# Now that we have at least 5 features, apply PCA
if len(high_corr_features) >= 5:
    # Feature Extraction using PCA (reduce to 5 features)
    pca = PCA(n_components=5)
    df_pca = pca.fit_transform(df[high_corr_features])

    # Convert PCA output to DataFrame
    df_pca = pd.DataFrame(df_pca, columns=[f'PC{i+1}' for i in range(5)])

    # Display the reduced features using pandas
    print(df_pca.head())  # Display the PCA features in the console
else:
    print("Not enough features for PCA. Feature selection might need adjustment.")


Columns in the dataset: Index(['age', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
       'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous',
       'poutcome', 'deposit', 'education_encoded', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'marital_encoded', 'age_binned',
       'balance_scaled', 'duration_scaled', 'campaign_scaled',
       'deposit_encoded'],
      dtype='object')
Features highly correlated with target: ['deposit_encoded', 'duration', 'duration_scaled']
Number of features selected: 3
Selected fewer than 5 features. Manually adding features to reach 5.
           PC1          PC2        PC3       PC4           PC5
0   816.103135   668.016072  17.483816  0.063236 -1.054268e-12
1 -1480.852807  1098.630504  15.471225 -0.184509 -2.023937e-12
2  -256.053339  1017.639094  

# 4. Data Splitting

In [21]:
from sklearn.model_selection import train_test_split

# Check the column names in the dataset to ensure 'deposit' is the target column
print(df.columns)

# Update the target column name if necessary
X = df.drop(columns=['deposit'])  # Replace 'deposit' with the correct target column name
y = df['deposit']  # Use the correct target column name here

# Stratify ensures that the class distribution in the target column is preserved
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp)

# Display the shape of each split
print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")


Index(['age', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
       'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous',
       'poutcome', 'deposit', 'education_encoded', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'marital_encoded', 'age_binned',
       'balance_scaled', 'duration_scaled', 'campaign_scaled',
       'deposit_encoded'],
      dtype='object')
Train shape: (7813, 33), Validation shape: (1674, 33), Test shape: (1675, 33)
