In [492]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

import warnings 
warnings.filterwarnings("ignore")

In [493]:
train = pd.read_csv('Train-Set.csv').iloc[:,1:]
test =  pd.read_csv('Test-Set.csv').iloc[:,1:]

In [494]:
train.shape

(54712, 18)

In [495]:
# Downsampling function
# Downsample the input 'DataFrame' by balancing the class distribution.
def returnbalanceData(data,target,min_class):
    
    classes = dict(data[target].value_counts())
    data.reindex()
    index = np.array([])
    for clas in classes:
        class_index = data[data[target] == clas].index
        sample=list(np.random.choice(class_index,classes[min_class],replace=False))
        
        index = np.concatenate((sample,index))
        
    return data.iloc[index,:]

In [496]:
train = returnbalanceData(train,'Target','yes')

In [497]:
train.Target.value_counts()

yes    6279
no     6279
Name: Target, dtype: int64

In [498]:
# Merge the datasets  
train['t_type'] = 'train'
test['t_type'] = 'test'

df = train.append(test, ignore_index=True)

In [499]:
df

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Target,t_type
0,66294,43,unemployed,married,primary,no,519.0,no,no,unknown,20,jun,973,2,-1,0,unknown,yes,train
1,46643,26,student,single,unknown,no,689.0,no,no,cellular,25,may,678,2,-1,0,unknown,yes,train
2,71520,33,admin.,married,secondary,no,616.0,no,no,cellular,6,sep,250,3,-1,0,unknown,yes,train
3,65327,45,blue-collar,divorced,secondary,no,-184.0,yes,no,cellular,7,may,878,3,170,5,failure,yes,train
4,24962,63,retired,married,secondary,no,474.0,no,no,cellular,25,jan,423,1,-1,0,unknown,yes,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36002,902,36,blue-collar,married,basic.6y,no,,no,no,telephone,may,fri,203,5,999,0,nonexistent,,test
36003,39745,44,services,married,secondary,no,133.0,no,no,cellular,29,jan,202,3,-1,0,unknown,,test
36004,40150,31,blue-collar,married,basic.9y,no,,yes,no,cellular,jul,wed,590,1,999,0,nonexistent,,test
36005,6012,43,management,divorced,high.school,no,,yes,no,cellular,nov,mon,488,2,999,0,nonexistent,,test


## Dealing with null values first


In [500]:
jobs_salary = dict(df.balance.groupby(df.job).mean())
jobs_salary.pop('unknown')
for job in jobs_salary:
    df.loc[df.job == job,'balance'] = df.loc[df.job == job,'balance'].fillna(jobs_salary[job])
df.balance.fillna(df.balance.mean(),inplace=True)

### Filling null values in balance column by take the mean for each job and fill it by its job.
----
# education


In [501]:
df.loc[df['education'] == 'professional.course', 'education'] = 'tertiary'
df.loc[df['education'] == 'basic.6y', 'education'] = 'secondary'
df.loc[df['education'] == 'basic.9y', 'education'] = 'secondary'
df.loc[df['education'] == 'basic.4y', 'education'] = 'primary'
df.loc[df['education'] == 'high.school', 'education'] = 'secondary'
df.loc[df['education'] == 'university.degree', 'education'] = 'tertiary'


# Education 

In [502]:
df.loc[(df.age <= 20) & (df.marital == 'unknown') ,'education'] = 'primary'
df.loc[(df.age <= 30) & (df.marital == 'unknown') ,'education'] = 'secondary'
df.loc[(df.age > 30) & (df.marital == 'unknown') ,'education'] = 'tertiary'

# marital 
### fill unknown values by:
 1- If age is less than **30** after that marital status will be **single**.
 
 2- If Age is greater than **30** and have housing is yes marital status will be **married**.
 
 3- else marital status will be **married**.

In [503]:
df.loc[(df.age < 30) & (df.marital == 'unknown'),'marital'] = 'single'
df.loc[(df.age > 30) & (df.housing == 'yes') & (df.marital == 'unknown') ,'marital'] = 'married'
df.loc[(df.marital == 'unknown'),'marital'] = 'married'

## Feature Engineering:

I do not care about if the customer has housing loan or other I am care about how many loan. so we can make feature engineering between two columns housing and loan as following:

1- if housing = yes and loan= no *or* housing = no and loan= yes *or* housing = yes and loan= unkown *or* housing = unknown and loan= yes  then number of loans are 1.

2- if housing = no and loan= no *or* housing = unknown and loan= no *or* housing = no and loan= unknown then number of loans are 0.

3- if housing = yes and loan= yes then number of loans are 2.

4- if housing = unkown and loan = unknown then number of loans are -1
# Housing & loans

In [504]:
df.loc[(df['housing'] == 'yes') & (df['loan'] == 'yes'), 'number_of_loan'] = 2
df.loc[(df['housing'] == 'yes') & (df['loan'] == 'no'), 'number_of_loan'] = 1
df.loc[(df['housing'] == 'yes') & (df['loan'] == 'unknown'), 'number_of_loan'] = 1
df.loc[(df['housing'] == 'no') & (df['loan'] == 'yes'), 'number_of_loan'] = 1
df.loc[(df['housing'] == 'no') & (df['loan'] == 'no'), 'number_of_loan'] = 0
df.loc[(df['housing'] == 'no') & (df['loan'] == 'unknown'), 'number_of_loan'] = 0
df.loc[(df['housing'] == 'yes') & (df['loan'] == 'unknown'), 'number_of_loan'] =1
df.loc[(df['housing'] == 'unknown') & (df['loan'] == 'no'), 'number_of_loan'] =0
df.loc[(df['housing'] == 'unknown') & (df['loan'] == 'unknown'), 'number_of_loan'] =-1

# 🚸<span style="color:red">Dangerous</span>.🚸

## 🚩 The following code should run one time and only one.
### the main point of this code is shift cells between month and day columns..

# Days & Month

In [505]:

outDays = [ x for x in df.day.unique() if not x.isdigit()]
for i in outDays:
    condition = df.day == i
    temp = df.loc[condition,'day']
    
    df.loc[condition,'day'] = df.loc[condition,'month']
    df.loc[condition,'month'] = temp

In [506]:
NotNumaricDays = [ x for x in df.day.unique() if not x.isdigit()]
print(NotNumaricDays)
for day in range(len(NotNumaricDays)):
    df.day.replace(NotNumaricDays[day],str(day+1),inplace=True)

['thu', 'fri', 'wed', 'mon', 'tue']


### Hint!!! when number of loans is -1 the majourty class is **NO**
# Age

In [507]:
def ageRanges(age):
    if age <= 24:
        return 'Teenager'
    elif age <=60:
        return 'middle-aged'
    return 'senior'

In [508]:
df['age'] = df.age.apply(lambda x:ageRanges(int(x)))

In [509]:
df.drop(labels=['housing','loan','duration','day'],axis=1,inplace=True)#duration

# pDays 
 - if pdays => 1 value will be called
 
 - if pdays == -1 value will be not-called

In [510]:
def pdayConverter(pday):
    if pday >0:
        return 'called'
    return 'not-called'
df['pdays'] =df.pdays.apply(lambda x:pdayConverter(x))

# Job
- replace each unknown cell with the major job in job column 

In [511]:
df.replace('unknown',pd.Series.mode(df.job),inplace=True)

# Modeling 

In [512]:
from sklearn.metrics import f1_score,classification_report,confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTETomek
import xgboost as xgb
#!pip install xgboost


In [513]:
# Encoder
for col in df.columns:
    L = LabelEncoder()
    if col not in ['id','balance','campaign','previous','Target','t_type','number_of_loan']:
        df[col] = L.fit_transform(df[col])

    

In [514]:
train = df[df.t_type == 'train'][['age','job','marital','education','default','balance','contact','month','campaign','pdays','previous','poutcome','number_of_loan','Target']]
test = df[df.t_type == 'test'][['id','age','job','marital','education','default','balance','contact','month','campaign','pdays','previous','poutcome','number_of_loan']]

In [515]:
# Separate the target variable from the features
X = train.drop(columns=['Target'])
y = train['Target']

y = y.map({'yes': 1, 'no': 0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Calculate class weights based on the class distribution
class_ratio = sum(y_train ==0) / sum(y_train == 1)


# Train a machine learning model (e.g., Random Forest) on the resampled data
xgb_model = xgb.XGBClassifier(n_estimators=100,scale_pos_weight=class_ratio, max_depth=15, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)



In [516]:
# Evaluate the model on the test set
f1 = f1_score(y_test, y_pred,pos_label=1)
print("F1-Score:", f1)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

F1-Score: 0.659400544959128
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.71      0.68      1848
           1       0.69      0.63      0.66      1920

    accuracy                           0.67      3768
   macro avg       0.67      0.67      0.67      3768
weighted avg       0.67      0.67      0.67      3768

Confusion Matrix:
[[1308  540]
 [ 710 1210]]


In [518]:
y_pred = xgb_model.predict(test.iloc[:,1:])
SampleSubmition=pd.read_csv("Sample_Submition.csv")
SampleSubmition['Target']=y_pred
SampleSubmition.to_csv('Sample_Submition2.1.csv', index=False)