In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv('marketing_data2.csv')
df.head()

Unnamed: 0,id,target,day,month,duration,contactId,age,gender,job,maritalStatus,...,creditFailure,accountBalance,house,credit,contactType,numberOfContacts,daySinceLastCampaign,numberOfContactsLastCampaign,lastCampaignResult,groups
0,432148809,no,27,may,166,623,30,female,worker,married,...,no,-202,no,no,unknown,2,-1.0,0,unknown,New Negative
1,432184318,no,26,oct,183,1992,42,female,manager,married,...,no,2463,no,no,cellPhone,2,-1.0,0,unknown,New Negative
2,432182482,no,5,jun,227,2778,26,female,services,single,...,no,2158,yes,yes,landline,1,-1.0,0,unknown,New Negative
3,432150520,no,2,jun,31,3070,34,male,unemployed,divorced,...,yes,75,yes,no,unknown,3,-1.0,0,unknown,New Negative
4,432145870,no,15,may,1231,6583,48,male,worker,married,...,no,559,yes,no,unknown,2,-1.0,0,unknown,New Negative


Now I will convert all the features into numerical values starting with target.

In [3]:
df['target'] = df['target'].replace({'no': 0, 'yes': 1})
df['target'].value_counts()

0    27777
1     3699
Name: target, dtype: int64

Convert months into numerical values with one hot encoding.

In [4]:
months_encoded = pd.get_dummies(df['month'])

df_encoded = pd.concat([df, months_encoded], axis=1)

print(df_encoded)

              id  target  day month  duration  contactId  age  gender  \
0      432148809       0   27   may       166        623   30  female   
1      432184318       0   26   oct       183       1992   42  female   
2      432182482       0    5   jun       227       2778   26  female   
3      432150520       0    2   jun        31       3070   34    male   
4      432145870       0   15   may      1231       6583   48    male   
...          ...     ...  ...   ...       ...        ...  ...     ...   
31471  432184725       1   30   nov      1628   69542367   58  female   
31472  432147139       0   21   may       173   69542565   40  female   
31473  432166958       0   17   nov       422   69543453   51  female   
31474  432166312       0   29   aug        69   69544121   30    male   
31475  432171709       0    2   feb       171   69546604   50    male   

              job maritalStatus  ... dec feb  jan jul jun mar  may  nov  oct  \
0          worker       married  ...   0   

In [5]:
df_encoded = df_encoded.drop('month', axis=1)
print(df_encoded)

              id  target  day  duration  contactId  age  gender         job  \
0      432148809       0   27       166        623   30  female      worker   
1      432184318       0   26       183       1992   42  female     manager   
2      432182482       0    5       227       2778   26  female    services   
3      432150520       0    2        31       3070   34    male  unemployed   
4      432145870       0   15      1231       6583   48    male      worker   
...          ...     ...  ...       ...        ...  ...     ...         ...   
31471  432184725       1   30      1628   69542367   58  female   technical   
31472  432147139       0   21       173   69542565   40  female     manager   
31473  432166958       0   17       422   69543453   51  female      worker   
31474  432166312       0   29        69   69544121   30    male   technical   
31475  432171709       0    2       171   69546604   50    male   technical   

      maritalStatus        education  ... dec  feb 

In [6]:
df_encoded = df_encoded.drop('contactId', axis=1)
print(df_encoded)

              id  target  day  duration  age  gender         job  \
0      432148809       0   27       166   30  female      worker   
1      432184318       0   26       183   42  female     manager   
2      432182482       0    5       227   26  female    services   
3      432150520       0    2        31   34    male  unemployed   
4      432145870       0   15      1231   48    male      worker   
...          ...     ...  ...       ...  ...     ...         ...   
31471  432184725       1   30      1628   58  female   technical   
31472  432147139       0   21       173   40  female     manager   
31473  432166958       0   17       422   51  female      worker   
31474  432166312       0   29        69   30    male   technical   
31475  432171709       0    2       171   50    male   technical   

      maritalStatus        education creditFailure  ...  dec feb jan jul  jun  \
0           married       highSchool            no  ...    0   0   0   0    0   
1           married  

In [7]:
df_encoded['gender'] = df_encoded['gender'].replace({'female': 0, 'male': 1})
df_encoded['gender'].value_counts()

1    15769
0    15707
Name: gender, dtype: int64

In [8]:
df_encoded['creditFailure'] = df_encoded['creditFailure'].replace({'no': 0, 'yes': 1})
df_encoded['creditFailure'].value_counts()

0    30921
1      555
Name: creditFailure, dtype: int64

In [9]:
df_encoded.head()

Unnamed: 0,id,target,day,duration,age,gender,job,maritalStatus,education,creditFailure,...,dec,feb,jan,jul,jun,mar,may,nov,oct,sep
0,432148809,0,27,166,30,0,worker,married,highSchool,0,...,0,0,0,0,0,0,1,0,0,0
1,432184318,0,26,183,42,0,manager,married,uniGraduated,0,...,0,0,0,0,0,0,0,0,1,0
2,432182482,0,5,227,26,0,services,single,highSchool,0,...,0,0,0,0,1,0,0,0,0,0
3,432150520,0,2,31,34,1,unemployed,divorced,uniGraduated,1,...,0,0,0,0,1,0,0,0,0,0
4,432145870,0,15,1231,48,1,worker,married,secondarySchool,0,...,0,0,0,0,0,0,1,0,0,0


In [10]:
df_encoded = df_encoded.drop('id', axis=1)
print(df_encoded)

       target  day  duration  age  gender         job maritalStatus  \
0           0   27       166   30       0      worker       married   
1           0   26       183   42       0     manager       married   
2           0    5       227   26       0    services        single   
3           0    2        31   34       1  unemployed      divorced   
4           0   15      1231   48       1      worker       married   
...       ...  ...       ...  ...     ...         ...           ...   
31471       1   30      1628   58       0   technical       married   
31472       0   21       173   40       0     manager        single   
31473       0   17       422   51       0      worker       married   
31474       0   29        69   30       1   technical       married   
31475       0    2       171   50       1   technical      divorced   

             education  creditFailure  accountBalance  ... dec feb jan  jul  \
0           highSchool              0            -202  ...   0   0  

In [11]:
df_encoded.columns

Index(['target', 'day', 'duration', 'age', 'gender', 'job', 'maritalStatus',
       'education', 'creditFailure', 'accountBalance', 'house', 'credit',
       'contactType', 'numberOfContacts', 'daySinceLastCampaign',
       'numberOfContactsLastCampaign', 'lastCampaignResult', 'groups', 'apr',
       'aug', 'dec', 'feb', 'jan', 'jul', 'jun', 'mar', 'may', 'nov', 'oct',
       'sep'],
      dtype='object')

In [12]:
df_encoded = df_encoded.drop('day', axis=1)
print(df_encoded)

       target  duration  age  gender         job maritalStatus  \
0           0       166   30       0      worker       married   
1           0       183   42       0     manager       married   
2           0       227   26       0    services        single   
3           0        31   34       1  unemployed      divorced   
4           0      1231   48       1      worker       married   
...       ...       ...  ...     ...         ...           ...   
31471       1      1628   58       0   technical       married   
31472       0       173   40       0     manager        single   
31473       0       422   51       0      worker       married   
31474       0        69   30       1   technical       married   
31475       0       171   50       1   technical      divorced   

             education  creditFailure  accountBalance house  ... dec feb  jan  \
0           highSchool              0            -202    no  ...   0   0    0   
1         uniGraduated              0        

In [13]:
df_encoded['credit'].value_counts()

no     26473
yes     5003
Name: credit, dtype: int64

In [14]:
df_encoded['credit'] = df_encoded['credit'].replace({'no': 0, 'yes': 1})
df_encoded['credit'].value_counts()

0    26473
1     5003
Name: credit, dtype: int64

In [15]:
df_encoded['house'].value_counts()

yes    17503
no     13973
Name: house, dtype: int64

In [16]:
df_encoded['house'] = df_encoded['house'].replace({'no': 0, 'yes': 1})
df_encoded['house'].value_counts()

1    17503
0    13973
Name: house, dtype: int64

In [17]:
df_encoded['lastCampaignResult'].value_counts()

unknown    25743
failure     3403
other       1294
success     1036
Name: lastCampaignResult, dtype: int64

In [18]:
lcr_encoded = pd.get_dummies(df_encoded['lastCampaignResult'])

df_encoded = pd.concat([df_encoded, lcr_encoded], axis=1)

print(df_encoded)

       target  duration  age  gender         job maritalStatus  \
0           0       166   30       0      worker       married   
1           0       183   42       0     manager       married   
2           0       227   26       0    services        single   
3           0        31   34       1  unemployed      divorced   
4           0      1231   48       1      worker       married   
...       ...       ...  ...     ...         ...           ...   
31471       1      1628   58       0   technical       married   
31472       0       173   40       0     manager        single   
31473       0       422   51       0      worker       married   
31474       0        69   30       1   technical       married   
31475       0       171   50       1   technical      divorced   

             education  creditFailure  accountBalance  house  ...  jun mar  \
0           highSchool              0            -202      0  ...    0   0   
1         uniGraduated              0            24

In [19]:
df_encoded = df_encoded.rename(columns={
    'failure': 'lcr_failure',
    'other': 'lcr_other',
    'success': 'lcr_success',
    'unknown': 'lcr_unknown'
})
df_encoded.head()

Unnamed: 0,target,duration,age,gender,job,maritalStatus,education,creditFailure,accountBalance,house,...,jun,mar,may,nov,oct,sep,lcr_failure,lcr_other,lcr_success,lcr_unknown
0,0,166,30,0,worker,married,highSchool,0,-202,0,...,0,0,1,0,0,0,0,0,0,1
1,0,183,42,0,manager,married,uniGraduated,0,2463,0,...,0,0,0,0,1,0,0,0,0,1
2,0,227,26,0,services,single,highSchool,0,2158,1,...,1,0,0,0,0,0,0,0,0,1
3,0,31,34,1,unemployed,divorced,uniGraduated,1,75,1,...,1,0,0,0,0,0,0,0,0,1
4,0,1231,48,1,worker,married,secondarySchool,0,559,1,...,0,0,1,0,0,0,0,0,0,1


In [20]:
df_encoded['job'].value_counts()

worker            6780
manager           6501
technical         5334
administrative    3602
services          2872
retired           1614
selfEmployed      1112
entrepreneur      1029
unemployed         921
houseWife          854
student            645
unknown            212
Name: job, dtype: int64

In [21]:
job_encoded = pd.get_dummies(df_encoded['job'])

df_encoded = pd.concat([df_encoded, job_encoded], axis=1)

print(df_encoded)

       target  duration  age  gender         job maritalStatus  \
0           0       166   30       0      worker       married   
1           0       183   42       0     manager       married   
2           0       227   26       0    services        single   
3           0        31   34       1  unemployed      divorced   
4           0      1231   48       1      worker       married   
...       ...       ...  ...     ...         ...           ...   
31471       1      1628   58       0   technical       married   
31472       0       173   40       0     manager        single   
31473       0       422   51       0      worker       married   
31474       0        69   30       1   technical       married   
31475       0       171   50       1   technical      divorced   

             education  creditFailure  accountBalance  house  ...  houseWife  \
0           highSchool              0            -202      0  ...          0   
1         uniGraduated              0          

In [22]:
df_encoded = df_encoded.rename(columns={
    'unknown': 'job_unknown',
})
df_encoded.head()

Unnamed: 0,target,duration,age,gender,job,maritalStatus,education,creditFailure,accountBalance,house,...,houseWife,manager,retired,selfEmployed,services,student,technical,unemployed,job_unknown,worker
0,0,166,30,0,worker,married,highSchool,0,-202,0,...,0,0,0,0,0,0,0,0,0,1
1,0,183,42,0,manager,married,uniGraduated,0,2463,0,...,0,1,0,0,0,0,0,0,0,0
2,0,227,26,0,services,single,highSchool,0,2158,1,...,0,0,0,0,1,0,0,0,0,0
3,0,31,34,1,unemployed,divorced,uniGraduated,1,75,1,...,0,0,0,0,0,0,0,1,0,0
4,0,1231,48,1,worker,married,secondarySchool,0,559,1,...,0,0,0,0,0,0,0,0,0,1


In [23]:
mars_encoded = pd.get_dummies(df_encoded['maritalStatus'])

df_encoded = pd.concat([df_encoded, mars_encoded], axis=1)

print(df_encoded)

       target  duration  age  gender         job maritalStatus  \
0           0       166   30       0      worker       married   
1           0       183   42       0     manager       married   
2           0       227   26       0    services        single   
3           0        31   34       1  unemployed      divorced   
4           0      1231   48       1      worker       married   
...       ...       ...  ...     ...         ...           ...   
31471       1      1628   58       0   technical       married   
31472       0       173   40       0     manager        single   
31473       0       422   51       0      worker       married   
31474       0        69   30       1   technical       married   
31475       0       171   50       1   technical      divorced   

             education  creditFailure  accountBalance  house  ...  \
0           highSchool              0            -202      0  ...   
1         uniGraduated              0            2463      0  ...   


In [24]:
df_encoded = df_encoded.drop('maritalStatus', axis=1)
print(df_encoded)

       target  duration  age  gender         job        education  \
0           0       166   30       0      worker       highSchool   
1           0       183   42       0     manager     uniGraduated   
2           0       227   26       0    services       highSchool   
3           0        31   34       1  unemployed     uniGraduated   
4           0      1231   48       1      worker  secondarySchool   
...       ...       ...  ...     ...         ...              ...   
31471       1      1628   58       0   technical       highSchool   
31472       0       173   40       0     manager  secondarySchool   
31473       0       422   51       0      worker       highSchool   
31474       0        69   30       1   technical     uniGraduated   
31475       0       171   50       1   technical       highSchool   

       creditFailure  accountBalance  house  credit  ... selfEmployed  \
0                  0            -202      0       0  ...            0   
1                  0     

In [25]:
df_encoded = df_encoded.drop('job', axis=1)
print(df_encoded)

       target  duration  age  gender        education  creditFailure  \
0           0       166   30       0       highSchool              0   
1           0       183   42       0     uniGraduated              0   
2           0       227   26       0       highSchool              0   
3           0        31   34       1     uniGraduated              1   
4           0      1231   48       1  secondarySchool              0   
...       ...       ...  ...     ...              ...            ...   
31471       1      1628   58       0       highSchool              0   
31472       0       173   40       0  secondarySchool              0   
31473       0       422   51       0       highSchool              0   
31474       0        69   30       1     uniGraduated              0   
31475       0       171   50       1       highSchool              0   

       accountBalance  house  credit contactType  ...  selfEmployed  services  \
0                -202      0       0     unknown  ... 

In [26]:
contact_encoded = pd.get_dummies(df_encoded['contactType'])

df_encoded = pd.concat([df_encoded, contact_encoded], axis=1)

print(df_encoded)

       target  duration  age  gender        education  creditFailure  \
0           0       166   30       0       highSchool              0   
1           0       183   42       0     uniGraduated              0   
2           0       227   26       0       highSchool              0   
3           0        31   34       1     uniGraduated              1   
4           0      1231   48       1  secondarySchool              0   
...       ...       ...  ...     ...              ...            ...   
31471       1      1628   58       0       highSchool              0   
31472       0       173   40       0  secondarySchool              0   
31473       0       422   51       0       highSchool              0   
31474       0        69   30       1     uniGraduated              0   
31475       0       171   50       1       highSchool              0   

       accountBalance  house  credit contactType  ...  technical  unemployed  \
0                -202      0       0     unknown  ...  

In [27]:
df_encoded = df_encoded.rename(columns={
    'unknown': 'contacttype_unknown',
})
df_encoded.head()

Unnamed: 0,target,duration,age,gender,education,creditFailure,accountBalance,house,credit,contactType,...,technical,unemployed,job_unknown,worker,divorced,married,single,cellPhone,landline,contacttype_unknown
0,0,166,30,0,highSchool,0,-202,0,0,unknown,...,0,0,0,1,0,1,0,0,0,1
1,0,183,42,0,uniGraduated,0,2463,0,0,cellPhone,...,0,0,0,0,0,1,0,1,0,0
2,0,227,26,0,highSchool,0,2158,1,1,landline,...,0,0,0,0,0,0,1,0,1,0
3,0,31,34,1,uniGraduated,1,75,1,0,unknown,...,0,1,0,0,1,0,0,0,0,1
4,0,1231,48,1,secondarySchool,0,559,1,0,unknown,...,0,0,0,1,0,1,0,0,0,1


In [32]:
df_encoded.head()

Unnamed: 0,target,duration,age,gender,education,creditFailure,accountBalance,house,credit,numberOfContacts,...,technical,unemployed,job_unknown,worker,divorced,married,single,cellPhone,landline,contacttype_unknown
0,0,166,30,0,highSchool,0,-202,0,0,2,...,0,0,0,1,0,1,0,0,0,1
1,0,183,42,0,uniGraduated,0,2463,0,0,2,...,0,0,0,0,0,1,0,1,0,0
2,0,227,26,0,highSchool,0,2158,1,1,1,...,0,0,0,0,0,0,1,0,1,0
3,0,31,34,1,uniGraduated,1,75,1,0,3,...,0,1,0,0,1,0,0,0,0,1
4,0,1231,48,1,secondarySchool,0,559,1,0,2,...,0,0,0,1,0,1,0,0,0,1


In [33]:
edu_encoded = pd.get_dummies(df_encoded['education'])

df_encoded = pd.concat([df_encoded, edu_encoded], axis=1)

print(df_encoded)

       target  duration  age  gender        education  creditFailure  \
0           0       166   30       0       highSchool              0   
1           0       183   42       0     uniGraduated              0   
2           0       227   26       0       highSchool              0   
3           0        31   34       1     uniGraduated              1   
4           0      1231   48       1  secondarySchool              0   
...       ...       ...  ...     ...              ...            ...   
31471       1      1628   58       0       highSchool              0   
31472       0       173   40       0  secondarySchool              0   
31473       0       422   51       0       highSchool              0   
31474       0        69   30       1     uniGraduated              0   
31475       0       171   50       1       highSchool              0   

       accountBalance  house  credit  numberOfContacts  ...  divorced  \
0                -202      0       0                 2  ...   

In [34]:
df_encoded = df_encoded.rename(columns={
    'unknown': 'edu_unknown',
})
df_encoded.head()

Unnamed: 0,target,duration,age,gender,education,creditFailure,accountBalance,house,credit,numberOfContacts,...,divorced,married,single,cellPhone,landline,contacttype_unknown,highSchool,secondarySchool,uniGraduated,edu_unknown
0,0,166,30,0,highSchool,0,-202,0,0,2,...,0,1,0,0,0,1,1,0,0,0
1,0,183,42,0,uniGraduated,0,2463,0,0,2,...,0,1,0,1,0,0,0,0,1,0
2,0,227,26,0,highSchool,0,2158,1,1,1,...,0,0,1,0,1,0,1,0,0,0
3,0,31,34,1,uniGraduated,1,75,1,0,3,...,1,0,0,0,0,1,0,0,1,0
4,0,1231,48,1,secondarySchool,0,559,1,0,2,...,0,1,0,0,0,1,0,1,0,0


In [35]:
df_encoded = df_encoded.drop('education', axis=1)
print(df_encoded)

       target  duration  age  gender  creditFailure  accountBalance  house  \
0           0       166   30       0              0            -202      0   
1           0       183   42       0              0            2463      0   
2           0       227   26       0              0            2158      1   
3           0        31   34       1              1              75      1   
4           0      1231   48       1              0             559      1   
...       ...       ...  ...     ...            ...             ...    ...   
31471       1      1628   58       0              0            3399      0   
31472       0       173   40       0              0             858      1   
31473       0       422   51       0              0            1414      1   
31474       0        69   30       1              0               1      0   
31475       0       171   50       1              0               8      0   

       credit  numberOfContacts  daySinceLastCampaign  ...  div

In [36]:
df_encoded.columns

Index(['target', 'duration', 'age', 'gender', 'creditFailure',
       'accountBalance', 'house', 'credit', 'numberOfContacts',
       'daySinceLastCampaign', 'numberOfContactsLastCampaign',
       'lastCampaignResult', 'groups', 'apr', 'aug', 'dec', 'feb', 'jan',
       'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep', 'lcr_failure',
       'lcr_other', 'lcr_success', 'lcr_unknown', 'administrative',
       'entrepreneur', 'houseWife', 'manager', 'retired', 'selfEmployed',
       'services', 'student', 'technical', 'unemployed', 'job_unknown',
       'worker', 'divorced', 'married', 'single', 'cellPhone', 'landline',
       'contacttype_unknown', 'highSchool', 'secondarySchool', 'uniGraduated',
       'edu_unknown'],
      dtype='object')

In [37]:
df_encoded = df_encoded.drop('lastCampaignResult', axis=1)
print(df_encoded)

       target  duration  age  gender  creditFailure  accountBalance  house  \
0           0       166   30       0              0            -202      0   
1           0       183   42       0              0            2463      0   
2           0       227   26       0              0            2158      1   
3           0        31   34       1              1              75      1   
4           0      1231   48       1              0             559      1   
...       ...       ...  ...     ...            ...             ...    ...   
31471       1      1628   58       0              0            3399      0   
31472       0       173   40       0              0             858      1   
31473       0       422   51       0              0            1414      1   
31474       0        69   30       1              0               1      0   
31475       0       171   50       1              0               8      0   

       credit  numberOfContacts  daySinceLastCampaign  ...  div

In [39]:
df_encoded.groups.value_counts()

New Negative            23369
Super Negative           2964
New Positive             2374
Super Positive            663
Negative to Positive      439
Positive to Negative      373
Name: groups, dtype: int64

In [40]:
df_encoded['groups'] = df_encoded['groups'].replace({
    'New Negative': 'new_neg',
    'Super Negative': 'super_neg',
    'New Positive': 'new_pos',
    'Super Positive': 'super_pos',
    'Negative to Positive': 'neg_to_pos',
    'Positive to Negative': 'pos_to_neg'
})
df_encoded.groups.value_counts()

new_neg       23369
super_neg      2964
new_pos        2374
super_pos       663
neg_to_pos      439
pos_to_neg      373
Name: groups, dtype: int64

In [41]:
grp_encoded = pd.get_dummies(df_encoded['groups'])

df_encoded = pd.concat([df_encoded, grp_encoded], axis=1)

print(df_encoded)

       target  duration  age  gender  creditFailure  accountBalance  house  \
0           0       166   30       0              0            -202      0   
1           0       183   42       0              0            2463      0   
2           0       227   26       0              0            2158      1   
3           0        31   34       1              1              75      1   
4           0      1231   48       1              0             559      1   
...       ...       ...  ...     ...            ...             ...    ...   
31471       1      1628   58       0              0            3399      0   
31472       0       173   40       0              0             858      1   
31473       0       422   51       0              0            1414      1   
31474       0        69   30       1              0               1      0   
31475       0       171   50       1              0               8      0   

       credit  numberOfContacts  daySinceLastCampaign  ...  hig

In [42]:
df_encoded = df_encoded.drop('groups', axis=1)
print(df_encoded)

       target  duration  age  gender  creditFailure  accountBalance  house  \
0           0       166   30       0              0            -202      0   
1           0       183   42       0              0            2463      0   
2           0       227   26       0              0            2158      1   
3           0        31   34       1              1              75      1   
4           0      1231   48       1              0             559      1   
...       ...       ...  ...     ...            ...             ...    ...   
31471       1      1628   58       0              0            3399      0   
31472       0       173   40       0              0             858      1   
31473       0       422   51       0              0            1414      1   
31474       0        69   30       1              0               1      0   
31475       0       171   50       1              0               8      0   

       credit  numberOfContacts  daySinceLastCampaign  ...  hig

In [44]:
df_encoded.columns

Index(['target', 'duration', 'age', 'gender', 'creditFailure',
       'accountBalance', 'house', 'credit', 'numberOfContacts',
       'daySinceLastCampaign', 'numberOfContactsLastCampaign', 'apr', 'aug',
       'dec', 'feb', 'jan', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep',
       'lcr_failure', 'lcr_other', 'lcr_success', 'lcr_unknown',
       'administrative', 'entrepreneur', 'houseWife', 'manager', 'retired',
       'selfEmployed', 'services', 'student', 'technical', 'unemployed',
       'job_unknown', 'worker', 'divorced', 'married', 'single', 'cellPhone',
       'landline', 'contacttype_unknown', 'highSchool', 'secondarySchool',
       'uniGraduated', 'edu_unknown', 'neg_to_pos', 'new_neg', 'new_pos',
       'pos_to_neg', 'super_neg', 'super_pos'],
      dtype='object')

In [45]:
#remove the groupings because we already have the target
#using a lot of one hot encoding so no need to standardize
#split data
#test data - the data that is more recent is better
#build model and train

In [46]:
df_encoded = df_encoded.drop(['neg_to_pos', 'new_neg', 'new_pos', 'pos_to_neg', 'super_neg', 'super_pos'], axis=1)
df_encoded.columns

Index(['target', 'duration', 'age', 'gender', 'creditFailure',
       'accountBalance', 'house', 'credit', 'numberOfContacts',
       'daySinceLastCampaign', 'numberOfContactsLastCampaign', 'apr', 'aug',
       'dec', 'feb', 'jan', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep',
       'lcr_failure', 'lcr_other', 'lcr_success', 'lcr_unknown',
       'administrative', 'entrepreneur', 'houseWife', 'manager', 'retired',
       'selfEmployed', 'services', 'student', 'technical', 'unemployed',
       'job_unknown', 'worker', 'divorced', 'married', 'single', 'cellPhone',
       'landline', 'contacttype_unknown', 'highSchool', 'secondarySchool',
       'uniGraduated', 'edu_unknown'],
      dtype='object')

In [48]:
from sklearn.model_selection import train_test_split

df_encoded_sorted = df_encoded.sort_values('daySinceLastCampaign')

X = df_encoded_sorted.drop('target', axis=1)
y = df_encoded_sorted['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (25180, 48) (25180,)
Testing set shape: (6296, 48) (6296,)
