# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click the 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler

Matplotlib is building the font cache; this may take a moment.


About Data

In [5]:

# Load the semicolon-separated bank-additional CSV into `data`, the frame the
# following cells inspect and transform.
data = pd.read_csv("bank-additional.csv", sep=";")

# Print the column / dtype / non-null summary, then leave the descriptive
# statistics as the cell's displayed value.
data.info()
data.describe()

FileNotFoundError: [Errno 2] No such file or directory: 'bank-additional.csv'

1. Data Cleaning

In [None]:
# Column names to assign to the frame, in the order the columns appear in the file.
column_order = ["age", "job", "marital", "education", "default", "housing", "loan",
                "contact", "month", "day_of_week", "duration", "campaign", "pdays",
                "previous", "poutcome", "emp.var.rate", "cons.price.idx",
                "cons.conf.idx", "euribor3m", "nr.employed", "y"]

# Read the CSV file with the specified column order.
# Passing `names=` on its own makes pandas behave as if header=None, so the
# file's own header line would be loaded as the first data row (one extra row,
# every column parsed as object). header=0 consumes that line and replaces it
# with `column_order`.
bank_additional = pd.read_csv(
    "bank-additional.csv",
    delimiter=';',
    names=column_order,
    header=0,
)

In [None]:
# Print the (rows, columns) shape of the frame that was read with explicit column names.
print(bank_additional.shape)


In [None]:
# Preview the first five rows. Leaving the frame as the cell's last expression
# lets the notebook render it as a formatted table; print() would fall back to
# the plain-text repr, which wraps and truncates wide frames.
bank_additional.head()

In [None]:
# Print the number of missing values found in each column of `data`.
print(data.isnull().sum())


In [None]:
# Columns to be stored with the pandas 'category' dtype.
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]

# Cast every listed column to 'category' and write the converted columns back
# into `data`.
data[categorical_columns] = data[categorical_columns].astype(
    {name: 'category' for name in categorical_columns}
)

In [None]:
# Scatter each column in `numerical_columns` against 'age', one panel per column.
reference_column = 'age'
numerical_columns = [
    'duration', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
    'euribor3m', 'nr.employed',
]

# 3 x 4 grid (12 panels), flattened to a 1-D array so panels are addressed by position.
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(18, 12))
axes = axes.flatten()

# zip() stops at the shorter sequence, so only the first len(numerical_columns)
# panels are drawn on.
for panel, feature in zip(axes, numerical_columns):
    sns.scatterplot(data=data, x=reference_column, y=feature, ax=panel, color="purple")
    panel.set_title(f'{reference_column} vs {feature}', fontsize=12, color='darkblue')
    panel.set_xlabel(reference_column, fontsize=10)
    panel.set_ylabel(feature, fontsize=10)

# Remove the panels that were not used so the figure has no empty axes.
for unused in axes[len(numerical_columns):]:
    fig.delaxes(unused)

plt.tight_layout()
plt.show()

In [None]:

# Draw one box plot per column in `numerical_columns`.
numerical_columns = [
    'duration', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
    'euribor3m', 'nr.employed',
]

# 3 x 4 grid (12 panels), flattened to a 1-D array so panels are addressed by position.
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(18, 12))
axes = axes.flatten()

# Shared styling for the box and the median line of every panel.
box_style = dict(facecolor="purple", edgecolor="purple")
median_style = dict(color="darkblue", linewidth=2)

# zip() stops at the shorter sequence, so only the first len(numerical_columns)
# panels are drawn on.
for panel, feature in zip(axes, numerical_columns):
    sns.boxplot(data=data, y=feature, ax=panel,
                boxprops=box_style, medianprops=median_style)
    panel.set_title(f'Distribution of {feature}', fontsize=12, color='darkblue')
    panel.set_ylabel(feature, fontsize=10)
    panel.grid(axis='y', linestyle='--', alpha=0.7)

# Remove the panels that were not used so the figure has no empty axes.
for unused in axes[len(numerical_columns):]:
    fig.delaxes(unused)

plt.tight_layout()
plt.show()


In [None]:
# Cap the values in the 'campaign' and 'duration' columns at the 90th percentile:
# anything above the threshold is replaced by the threshold itself.
# NOTE(review): this comment previously said "95th percentile" while the code
# has always used 0.90; the comment now matches the code. Confirm which cut-off
# was intended.
for column in ['campaign', 'duration']:
    threshold = data[column].quantile(0.90)
    data[column] = np.where(data[column] > threshold, threshold, data[column])

# Keep only rows whose 'age' is at most 90 (rows with a missing age fail the
# comparison and are dropped too).
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
data = data[data['age'] <= 90].copy()

# Display the first few rows of the modified DataFrame to verify changes
print(data.head())


2. Data Preprocessing

In [None]:
# Columns to rescale (update as needed).
numerical_cols = [
    'age', 'duration', 'campaign',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
    'euribor3m', 'nr.employed',
]

# Fit a MinMaxScaler (default settings) on those columns, then overwrite them
# in `data` with the scaled values.
scaler = MinMaxScaler()
scaler.fit(data[numerical_cols])
data[numerical_cols] = scaler.transform(data[numerical_cols])

# Show the first rows so the transformed values can be checked.
data.head()

In [None]:
# Convert the 'pdays' column into a binary feature: 0 indicates not contacted
# (pdays == 999), and 1 indicates contacted (any other value).
# The comparison is vectorised over the whole column rather than calling a
# Python lambda once per row.
data['contacted_before'] = data['pdays'].ne(999).astype('int64')

# Remove the original 'pdays' column. Reassigning the result (instead of
# inplace=True) keeps the step a plain expression with no hidden mutation.
data = data.drop(columns='pdays')

# Display the first few rows of the new 'contacted_before' column to confirm the changes
print(data[['contacted_before']].head())

In [None]:
# Candidate columns for one-hot encoding.
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]

# Split the candidates into those `data` actually contains and those it lacks,
# keeping the order of `categorical_columns` in both lists.
existing_columns = [col for col in categorical_columns if col in data.columns]
missing_columns = [col for col in categorical_columns if col not in existing_columns]

# Report skipped columns to help with troubleshooting.
if missing_columns:
    print(f"The following columns are not found in the DataFrame and will be skipped: {missing_columns}")

# Encode whatever is available; if nothing is, say so and leave `data` untouched.
if not existing_columns:
    print("No valid categorical columns found for one-hot encoding.")
else:
    data = pd.get_dummies(data, columns=existing_columns)
    print("One-hot encoding has been successfully completed.")

# Show the resulting frame layout.
print(data.head())
