In [None]:
import sys, matplotlib, pandas, sklearn, seaborn

# Report the interpreter and library versions this notebook runs against,
# one "label: version" line each.
_versions = (
    ('Python', sys.version),
    ('matplotlib', matplotlib.__version__),
    ('pandas', pandas.__version__),
    ('sklearn', sklearn.__version__),
    ('seaborn', seaborn.__version__),
)
for _label, _version in _versions:
    print(f'{_label}: {_version}')

In [None]:
!pip install xgboost
!pip install lightgbm
!pip install imbalanced-learn

In [3]:
# Import libraries

# Data Manipulation
import numpy as np
import pandas as pd
from   pandas import DataFrame

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from   sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from   sklearn.impute import SimpleImputer
from   sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.metrics import RocCurveDisplay
from   sklearn.linear_model import LogisticRegression
from   sklearn.tree import DecisionTreeClassifier
from   sklearn.ensemble import RandomForestClassifier
from   xgboost import XGBClassifier
from   lightgbm import LGBMClassifier
from   imblearn.over_sampling import RandomOverSampler
import pickle

# Maths
import math

# Raise pandas' display limits (up to 800 rows and 500 columns) so wide or
# long frames are shown without truncation.
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 500)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Read Churn_Modelling_m.csv (path relative to the working directory) into `df`.
df = pd.read_csv("Churn_Modelling_m.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
!pip install datatile

In [None]:
from datatile.summary.df import DataFrameSummary

# Probe 1: can DataFrameSummary be built from just the first 100 rows?
try:
    dfs_small = DataFrameSummary(df.head(100))
except Exception as e:
    print("Error with a smaller subset:", str(e))
else:
    print("Success with a smaller subset")

# Probe 2: can it be built when only the numeric columns are passed?
try:
    dfs_numeric = DataFrameSummary(df.select_dtypes(include=[np.number]))
except Exception as e:
    print("Error with numeric columns:", str(e))
else:
    print("Success with numeric columns only")



In [None]:
# Convert every object-dtype column of df to the 'category' dtype (in place).
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype('category')

# No numeric-coercion pass follows: once the loop above has run, no column is
# left with dtype 'object', so a `pd.to_numeric(..., errors='coerce')` loop
# guarded by `dtype == 'object'` could never execute.

# Retry DataFrameSummary on the converted frame and report the outcome.
try:
    dfs = DataFrameSummary(df)
    print("Initialization successful after type conversion.")
except Exception as e:
    print("Still failing after type conversion:", str(e))


In [None]:
# Keep only the numeric columns; everything else is excluded by the dtype filter.
df_numeric = df.select_dtypes(include=[np.number])

# Try to build DataFrameSummary from the numeric-only frame and report the outcome.
try:
    dfs_numeric = DataFrameSummary(df_numeric)
except Exception as e:
    print("Error initializing with numeric columns:", e)
else:
    print("Successfully initialized DataFrameSummary with numeric columns only.")


In [None]:
# Cast 'Gender' and 'Geography' to the 'category' dtype, in place
# ('Geography' is assumed to be categorical as well).
for _cat_col in ('Gender', 'Geography'):
    df[_cat_col] = df[_cat_col].astype('category')

# Retry DataFrameSummary on the full frame and report the outcome.
try:
    dfs = DataFrameSummary(df)
except Exception as e:
    print("Error after converting to categorical:", e)
else:
    print("DataFrameSummary initialized successfully with categorical conversions.")


In [None]:
# Probe with two numeric columns plus one column explicitly cast to 'category'.
# .copy() makes test_df an independent frame, so the column assignment below
# does not write into a slice of df (which raises SettingWithCopyWarning).
test_df = df[['CreditScore', 'Age', 'Gender']].copy()
test_df['Gender'] = test_df['Gender'].astype('category')

try:
    dfs_test = DataFrameSummary(test_df)
    print("Initialization successful with a small subset of columns.")
except Exception as e:
    print("Failed with a small subset of columns:", e)


In [None]:
!pip install pandas_profiling



In [None]:
!conda install -c conda-forge pandas_profiling

In [None]:
!pip install pydantic-settings

In [None]:
!pip install --upgrade pip
!pip install pydantic --upgrade
!pip install pandas_profiling --upgrade

In [None]:
!pip install "pydantic<2.0"

In [None]:
# Check whether pandas_profiling can be imported in the current environment.
try:
    from pandas_profiling import ProfileReport
except Exception as e:
    print("Failed to import pandas_profiling after downgrading pydantic:", e)
else:
    print("pandas_profiling is ready to use.")


In [2]:
from ydata_profiling import ProfileReport

In [3]:
import numpy as np
import pandas as pd
from   pandas import DataFrame

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from   sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from   sklearn.impute import SimpleImputer
from   sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.metrics import RocCurveDisplay
from   sklearn.linear_model import LogisticRegression
from   sklearn.tree import DecisionTreeClassifier
from   sklearn.ensemble import RandomForestClassifier
from   xgboost import XGBClassifier
from   lightgbm import LGBMClassifier
from   imblearn.over_sampling import RandomOverSampler
import pickle

# Maths
import math

# Raise pandas' display limits (up to 800 rows and 500 columns) so wide or
# long frames are shown without truncation.
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 500)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Re-read the CSV, rebinding `df` to a fresh frame: dtype conversions applied
# to the previous `df` in earlier cells do not carry over.
df = pd.read_csv("Churn_Modelling_m.csv")

In [None]:
# Build an exploratory profiling report for `df`.
profile = ProfileReport(
    df,
    title='Data Profiling Report',
    explorative=True,
)
# Display the report as widgets inside the notebook (Jupyter or Google Colab).
profile.to_widgets()

In [7]:
# User-tunable settings for the rest of the notebook.
# NOTE(review): the option spellings quoted in the comments below are kept as
# originally documented; verify them against the code that consumes them.

# Input file name with path
input_file_name = 'Churn_Modelling_m.csv'

# Target class name
input_target_class = "Exited"

# Columns to be removed
input_drop_col = "CustomerId"

# Col datatype selection
input_datatype_selection = 'auto'  # use auto if you don't want to provide column names by data type else use 'manual'

# Categorical columns (each column listed exactly once)
input_cat_columns = ['Surname', 'Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'Exited']

# Numerical columns
input_num_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

# Encoding technique
input_encoding = 'LabelEncoder' # choose the encoding technique from 'LabelEncoder', 'OneHotEncoder', 'OrdinalEncoder' and 'FrequencyEncoder'

# Handle missing value
input_treat_missing_value = 'drop' # choose how to handle missing values from 'drop','inpute' and 'ignore'

# Machine learning algorithm
input_ml_algo = 'RandomForestClassifier' # choose the ML algorithm from 'LogisiticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier', 'XGBClassifier' and 'LGBMClassifier'

In [None]:
# Total memory used by df, in MiB (bytes / 1024**2), with deep=True passed to memory_usage.
df.memory_usage(deep=True).sum() / 1024**2

In [9]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched.

    * Integer columns are cast to the smallest of int8/int16/int32/int64 whose
      range contains the column's minimum and maximum (bounds inclusive).
    * Float columns are cast to float16 only when the range fits AND every
      value survives the conversion unchanged; otherwise to float32 when the
      range fits (values are rounded to float32 precision), else float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        values = df[col]
        c_min = values.min()
        c_max = values.max()
        if str(col_type)[:3] == 'int':  # for integers
            # Smallest-first search; inclusive bounds so that e.g. a max of
            # exactly 127 still qualifies for int8.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = values.astype(candidate)
                    break
        else:  # for floats
            half = np.finfo(np.float16)
            single = np.finfo(np.float32)
            target = np.float64
            if c_min > half.min and c_max < half.max:
                # float16 has very little precision (2049.0 becomes 2048.0), so
                # accept it only when the round trip reproduces every value;
                # otherwise use float32, whose range necessarily fits here.
                as_half = values.astype(np.float16)
                if as_half.astype(np.float64).equals(values.astype(np.float64)):
                    target = np.float16
                else:
                    target = np.float32
            elif c_min > single.min and c_max < single.max:
                target = np.float32
            df[col] = values.astype(target)
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, saved_pct))
    return df

In [None]:
# reduce_mem_usage downcasts df's columns in place and returns the frame it
# was given, so df_o and df refer to the same object.
df_o   = reduce_mem_usage(df)

In [None]:
# Column dtypes and non-null counts after the downcast.
df_o.info()

In [None]:
# Memory used by df, in MiB, after reduce_mem_usage has been applied to it.
df.memory_usage(deep=True).sum() / 1024**2

In [None]:
# Horizontal bar chart: number of rows in each target class.
target_ax = sns.countplot(data=df, y=df[input_target_class])
target_ax.set_xlabel("Count of each Target class")
target_ax.set_ylabel("Target classes")
plt.show()

In [None]:
# Row count per value of the 'Exited' column.
print(df['Exited'].value_counts())

In [None]:
# Check the distribution of all the features: one histogram panel per column.
df.hist(figsize=(15,12),bins = 15)
# suptitle labels the whole figure; plt.title would instead overwrite the
# title of the last panel only.
plt.suptitle("Features Distribution")
plt.show()

In [16]:
# Plot grid shape: a fixed number of columns and as many rows as are needed
# to give each entry of input_num_columns its own panel.
n_cols = 3
n_rows = -(-len(input_num_columns) // n_cols)  # ceiling division

In [17]:
 # Double seaborn's font scale (presumably so labels stay readable on the
 # 30x30 figure plotted next - confirm).
 sns.set(font_scale=2)

In [None]:
# Check the distribution of every numeric feature per target class:
# one box plot panel per entry of input_num_columns.
# squeeze=False keeps `ax` two-dimensional even when the grid has a single row.
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(30, 30), squeeze=False)
panels = ax.ravel()  # row-major order, so panels wrap after n_cols columns
for axes, feature in zip(panels, input_num_columns):
    sns.boxplot(x=df[input_target_class], y=df[feature], ax=axes)
# Hide grid cells left over when the feature count is not a multiple of n_cols.
for axes in panels[len(input_num_columns):]:
    axes.set_visible(False)
# Title the figure as a whole (plt.title would only label the last panel).
fig.suptitle("Individual Features by Class")
fig.tight_layout()
plt.show()

In [None]:
!pip install joypy

In [None]:
# Check joypy from inside the running kernel; a `python` found on PATH by a
# shell command may be a different interpreter than the one executing this notebook.
import joypy
print(joypy.__version__)

In [None]:
import joypy
varbls = ['Age','Tenure','CreditScore','Balance', 'EstimatedSalary']

# One ridgeline plot per feature, split by 'Exited'. joyplot is given its own
# figsize, so no plt.figure(...) is opened beforehand (it would only be left
# behind as an empty figure).
for var in varbls:
    joypy.joyplot(df, column=[var], by="Exited", ylim='own', figsize=(16,5), color=['tomato', 'purple'])
    plt.title(f"{var} by 'Exited'", fontsize=22)
plt.show()

In [22]:
# Set seaborn's font scale back to 1 (an earlier cell set it to 2).
sns.set(font_scale=1)

In [None]:
# Pairwise scatter plots of the selected features, coloured by 'Exited'.
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure(...) call precedes it (one would only leave an empty figure behind).
sns.pairplot(df.loc[:, ['Exited', 'CreditScore', 'Tenure', 'Age', 'Balance']],
             kind="scatter", hue="Exited", plot_kws=dict(s=80, edgecolor="white", linewidth=2.5))
plt.show()


In [None]:
# Pairwise plots with a regression fit per 'Exited' class.
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure(...) call precedes it (one would only leave an empty figure behind).
sns.pairplot(df.loc[:, ['Exited', 'CreditScore', 'Tenure', 'Age', 'Balance']],
             kind="reg", hue="Exited")
plt.show()

In [None]:
!pip install missingno

In [26]:
import missingno as msno # to visualize missing value

In [None]:
# Visualize where values are missing in df, row by row.
msno.matrix(df)

In [None]:
# missingno heatmap of df's missing values.
msno.heatmap(df)

In [None]:
# First rows among those where 'Age' is present.
df.loc[df['Age'].notna()].head()

In [None]:
# Approach 1: keep only the rows where both 'Age' and 'Balance' are present.
df_app1 = df.dropna(subset=['Age', 'Balance'])
df_app1.shape

In [None]:
# Add 'Balance_0': a copy of 'Balance' with missing values replaced by 0,
# then preview it next to the original column.
df["Balance_0"] = df["Balance"].fillna(value=0)
_preview_cols = ["Surname", "CreditScore", "Geography", "Gender", "Age", "Tenure", "Balance", "Balance_0"]
df[_preview_cols].head()

In [None]:
# Row count per 'Gender' value.
df['Gender'].value_counts()

In [None]:
# value_counts() orders by descending frequency, so its first index label is
# the most common 'Gender' value.
_gender_counts = df['Gender'].value_counts()
most_frequent = _gender_counts.index[0]
most_frequent

In [None]:
# Preview: first 10 'Gender' values with missing entries replaced by the most
# frequent value (df itself is not modified).
df['Gender'].head(10).fillna(most_frequent)

In [None]:
# Preview: 'Gender' with missing entries replaced by the literal "Empty"
# (the result is displayed only; df itself is not modified).
df['Gender'].fillna("Empty")

In [None]:
# Mean of the observed 'Balance' values, then a preview of the first 10
# balances with missing entries replaced by that mean (df is not modified).
mean_val = df['Balance'].mean()
df['Balance'].head(10).fillna(mean_val)

In [None]:
# Mean value by group: every row gets the mean 'Balance' of its 'Gender' group.
# The built-in 'mean' aggregation replaces a per-group Python lambda.
df['Balance_by_Class'] = df.groupby('Gender')['Balance'].transform('mean')
df[['Gender', 'Balance', 'Balance_by_Class']].head(10)

In [None]:
# 'Balance_imputed': the per-group mean where 'Balance' is missing, the
# observed balance everywhere else.
_balance_missing = df['Balance'].isna()
df['Balance_imputed'] = np.where(_balance_missing, df['Balance_by_Class'], df['Balance'])
df.head(10)