# Data Screening

**Data Description:**

In [None]:
import pandas as pd
from termcolor import cprint


# Load the dataset; the file uses ';' as its field separator.
data = pd.read_csv('data/data_nasabah.csv', delimiter=';')

# Shape of the DataFrame as (rows, columns).
cprint(f"Data (rows, columns): {data.shape}", "green")

# Per-column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
cprint("\nInformation data types and encoding", "blue")
data.info()

# Descriptive statistics of the numerical columns.
cprint("\nData descriptive statistics for numerical data:", "blue")
print(data.describe())

# Number of distinct (non-null) values in each column.
for column, unique_count in data.nunique().items():
    print(f"{column}: {unique_count} unique values")

# Encode the categorical columns as integer codes.
# Series.map() with a dict turns every label that is missing from the dict
# into NaN without any error, so unexpected labels (different casing, stray
# whitespace, new categories) are reported before the column is overwritten.
# NOTE: once encoded, these columns are no longer of dtype 'object'.
category_codes = {
    'jenis_kelamin': {'Laki-Laki': 1, 'Perempuan': 2},
    'jenis_produk': {'tabungan': 1, 'kartu_kredit': 2, 'deposito': 3},
    'pengguna_mobile_banking': {'YA': 1, 'TIDAK': 2},
}
for column, codes in category_codes.items():
    raw_values = data[column]
    # Values that are already missing stay missing; only real labels count.
    unmapped = sorted(set(raw_values.dropna().unique()) - set(codes), key=str)
    if unmapped:
        cprint(
            f"Warning: values in '{column}' without a code become NaN: {unmapped}",
            "red",
        )
    data[column] = raw_values.map(codes)

# Display the first few rows of the encoded DataFrame.
cprint("\nShow DataFrame", "blue")
print(data.head())

**Data Exploration:**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the numerical data: one histogram with a KDE curve per
# numeric column, each drawn on its own 8x6 figure.
cprint('\nAnalisis Distribusi Data Numerik:', 'blue')
numerical_features = data.select_dtypes(include=['number']).columns
for numeric_column in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data[numeric_column], kde=True, ax=ax)
    ax.set_title(f"Distribusi {numeric_column}")
    plt.show()

# Correlation between the numerical variables, shown as an annotated heatmap
# with coefficients formatted to two decimals.
cprint('\nAnalisis Korelasi Antara Variabel Numerik:', 'blue')
numerical_data = data.select_dtypes(include=['number'])
correlation_matrix = numerical_data.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Matriks Korelasi')
plt.show()

# Categorical data analysis: a count plot for every column of dtype 'object',
# with the x tick labels rotated 45 degrees.
cprint('\nAnalisis Data Kategorikal:', 'blue')
categorical_features = data.select_dtypes(include=['object']).columns
for categorical_column in categorical_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=data, x=categorical_column, ax=ax)
    ax.set_title(f"Jumlah kemunculan {categorical_column}")
    plt.xticks(rotation=45)
    plt.show()

# Missing-value analysis: number of null entries in each column.
cprint('\nAnalisis Nilai yang Hilang:', 'blue')
missing_values = data.isnull().sum()
print('Jumlah nilai yang hilang untuk setiap kolom:', missing_values, sep='\n')

# Outlier analysis: one horizontal box plot per numeric column
# (reuses the `numerical_features` index computed earlier in this cell).
cprint('\nAnalisis Outlier:', 'blue')
for numeric_column in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x=data[numeric_column], ax=ax)
    ax.set_title(f"Deteksi Outlier pada {numeric_column}")
    plt.show()

# Distribution of a single column of interest ('umur'): histogram with KDE,
# plus explicit axis labels.
cprint('\nDistribusi Umur:', 'blue')
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data['umur'], kde=True, ax=ax)
ax.set_title('Distribusi Umur')
ax.set_xlabel('Umur')
ax.set_ylabel('Frekuensi')
plt.show()

# Relationship between 'umur' and 'pendapatan', drawn as a scatter plot.
cprint('\nAnalisis Hubungan antara Umur dan Pendapatan:', 'blue')
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=data, x='umur', y='pendapatan', ax=ax)
ax.set_title('Hubungan antara Umur dan Pendapatan')
ax.set_xlabel('Umur')
ax.set_ylabel('Pendapatan')
plt.show()

# Distribution of the 'jenis_kelamin' column: number of rows per value.
cprint('\nDistribusi Jenis Kelamin:', 'blue')
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='jenis_kelamin', ax=ax)
ax.set_title('Distribusi Jenis Kelamin')
ax.set_xlabel('Jenis Kelamin')
ax.set_ylabel('Frekuensi')
plt.show()

# Relationship between 'jenis_kelamin' and 'pendapatan': one box of
# 'pendapatan' values per 'jenis_kelamin' value.
cprint('\nHubungan antara Jenis Kelamin dan Pendapatan:', 'blue')
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data, x='jenis_kelamin', y='pendapatan', ax=ax)
ax.set_title('Hubungan antara Jenis Kelamin dan Pendapatan')
ax.set_xlabel('Jenis Kelamin')
ax.set_ylabel('Pendapatan')
plt.show()
