In [1]:
# 1. Import the required libraries and modules that you would need.

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

# 2. Analizying the data into Python and call the dataframe donors.
donors = pd.read_csv('learningSet.txt')

# 3. Checking the datatypes of all the columns in the data.

data_types = donors.dtypes
print(data_types)

ODATEDW       int64
OSOURCE      object
TCODE         int64
STATE        object
ZIP          object
             ...   
MDMAUD_R     object
MDMAUD_F     object
MDMAUD_A     object
CLUSTER2    float64
GEOCODE2     object
Length: 481, dtype: object


In [2]:
# Cleaning Data

def cleaning (data):
    cols = []
    for i in range (len(data.columns)):
        cols.append(data.columns[i].lower().replace(' ','_'))
    
    data.columns = cols

    return data

cleaning(donors)

Unnamed: 0,odatedw,osource,tcode,state,zip,mailcode,pvastate,dob,noexch,recinhse,...,target_d,hphone_d,rfa_2r,rfa_2f,rfa_2a,mdmaud_r,mdmaud_f,mdmaud_a,cluster2,geocode2
0,8901,GRI,0,IL,61081,,,3712,0,,...,0.0,0,L,4,E,X,X,X,39.0,C
1,9401,BOA,1,CA,91326,,,5202,0,,...,0.0,0,L,2,G,X,X,X,1.0,A
2,9001,AMH,1,NC,27017,,,0,0,,...,0.0,1,L,4,E,X,X,X,60.0,C
3,8701,BRY,0,CA,95953,,,2801,0,,...,0.0,1,L,4,E,X,X,X,41.0,C
4,8601,,0,FL,33176,,,2001,0,X,...,0.0,1,L,2,F,X,X,X,26.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,9601,ASE,1,AK,99504,,,0,0,,...,0.0,0,L,1,G,X,X,X,12.0,C
95408,9601,DCD,1,TX,77379,,,5001,0,,...,0.0,1,L,1,F,X,X,X,2.0,A
95409,9501,MBC,1,MI,48910,,,3801,0,,...,0.0,1,L,3,E,X,X,X,34.0,B
95410,8601,PRV,0,CA,91320,,,4005,0,X,...,18.0,1,L,4,F,X,X,X,11.0,A


In [3]:
# Dealing with null values and then organize them in ascending order
nulls_percent_df= donors.isna().sum()/len(donors)
nulls_percent_df

# Inserting the null values in a dataframe
nulls_percent_df= pd.DataFrame(donors.isna().sum()/len(donors))
nulls_percent_df

# Taking out the index 
nulls_percent_df= pd.DataFrame(donors.isna().sum()/len(donors)).reset_index()
nulls_percent_df

# Updating the column names as per learned in the class
nulls_percent_df.columns = ['column_name', 'nulls_percentage']
nulls_percent_df

# Sorting the dataset
nulls_percent_df.sort_values(by = ['nulls_percentage'], ascending = False)

Unnamed: 0,column_name,nulls_percentage
414,rdate_5,0.999906
436,ramnt_5,0.999906
412,rdate_3,0.997464
434,ramnt_3,0.997464
413,rdate_4,0.997055
...,...,...
168,ethc3,0.000000
167,ethc2,0.000000
166,ethc1,0.000000
165,hhd12,0.000000


In [4]:
# droping the nulls values with a threshold 
threshold = 0.25 

# defining the condition
condition = nulls_percent_df['nulls_percentage']>threshold
columns_above_threshold = nulls_percent_df[condition]
columns_above_threshold

# Create a list with column names
drop_columns_list = list(columns_above_threshold['column_name'])
drop_columns_list

# Drop the columns in the columns drop list
donors = donors.drop(columns=drop_columns_list)
donors 

Unnamed: 0,odatedw,osource,tcode,state,zip,mailcode,pvastate,dob,noexch,recinhse,...,target_d,hphone_d,rfa_2r,rfa_2f,rfa_2a,mdmaud_r,mdmaud_f,mdmaud_a,cluster2,geocode2
0,8901,GRI,0,IL,61081,,,3712,0,,...,0.0,0,L,4,E,X,X,X,39.0,C
1,9401,BOA,1,CA,91326,,,5202,0,,...,0.0,0,L,2,G,X,X,X,1.0,A
2,9001,AMH,1,NC,27017,,,0,0,,...,0.0,1,L,4,E,X,X,X,60.0,C
3,8701,BRY,0,CA,95953,,,2801,0,,...,0.0,1,L,4,E,X,X,X,41.0,C
4,8601,,0,FL,33176,,,2001,0,X,...,0.0,1,L,2,F,X,X,X,26.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,9601,ASE,1,AK,99504,,,0,0,,...,0.0,0,L,1,G,X,X,X,12.0,C
95408,9601,DCD,1,TX,77379,,,5001,0,,...,0.0,1,L,1,F,X,X,X,2.0,A
95409,9501,MBC,1,MI,48910,,,3801,0,,...,0.0,1,L,3,E,X,X,X,34.0,B
95410,8601,PRV,0,CA,91320,,,4005,0,X,...,18.0,1,L,4,F,X,X,X,11.0,A


In [5]:
donors['zip']=pd.to_numeric(donors['zip'], errors='coerce')


In [6]:
# Spliting the data between numerical and categorical 

numerical = donors.select_dtypes(include = np.number)
categorical = donors.select_dtypes(object)
categorical

Unnamed: 0,osource,state,mailcode,pvastate,noexch,recinhse,recp3,recpgvg,recsweep,mdmaud,...,rfa_21,rfa_22,rfa_23,rfa_24,rfa_2r,rfa_2a,mdmaud_r,mdmaud_f,mdmaud_a,geocode2
0,GRI,IL,,,0,,,,,XXXX,...,S4E,S4E,S4E,S4E,L,E,X,X,X,C
1,BOA,CA,,,0,,,,,XXXX,...,N1E,N1E,,F1E,L,G,X,X,X,A
2,AMH,NC,,,0,,,,,XXXX,...,,S4D,S4D,S3D,L,E,X,X,X,C
3,BRY,CA,,,0,,,,,XXXX,...,A1D,A1D,,,L,E,X,X,X,C
4,,FL,,,0,X,X,,,XXXX,...,A3D,I4E,A3D,A3D,L,F,X,X,X,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,ASE,AK,,,0,,,,,XXXX,...,,,,,L,G,X,X,X,C
95408,DCD,TX,,,0,,,,,XXXX,...,,,,,L,F,X,X,X,A
95409,MBC,MI,,,0,,X,,,XXXX,...,P1D,P1D,,,L,E,X,X,X,B
95410,PRV,CA,,,0,X,,,,XXXX,...,A1F,A1F,S2F,S3F,L,F,X,X,X,A


In [7]:
# Concatenating bote datasets as X and define y has the target value 
donors = pd.concat([numerical, categorical], axis=1)
donors

X = donors.drop(['target_b', 'target_d'], axis=1)
y = donors['target_b']

In [8]:
# Proceeding with the train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Spliting into train_numerical and test_numerical and train_categorical and test_categorical

train_numerical = X_train.select_dtypes(include = np.number)
test_numerical = X_test.select_dtypes(include = np.number)
train_categorical = X_train.select_dtypes(object)
test_categorical = X_test.select_dtypes(object)

test_categorical

Unnamed: 0,osource,state,mailcode,pvastate,noexch,recinhse,recp3,recpgvg,recsweep,mdmaud,...,rfa_21,rfa_22,rfa_23,rfa_24,rfa_2r,rfa_2a,mdmaud_r,mdmaud_f,mdmaud_a,geocode2
84155,AML,ID,,,0,,,,,XXXX,...,,,,,L,G,X,X,X,C
75272,LEO,CA,,,0,,,,,XXXX,...,,S2D,,S2D,L,E,X,X,X,A
39719,USB,MI,,,0,,,,,XXXX,...,,N3D,,F1C,L,D,X,X,X,A
44288,SYN,CA,,,0,,,,,XXXX,...,,,,,L,F,X,X,X,A
35420,NPP,AZ,,,0,,,,,XXXX,...,A1E,A1E,A1E,A2E,L,G,X,X,X,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38584,L02,AL,,,0,,,,,XXXX,...,,,,,L,F,X,X,X,C
54025,SYN,TX,,,0,,,,,XXXX,...,F1D,F1D,,F1D,L,F,X,X,X,A
76819,ARG,LA,,,0,,,,,XXXX,...,F1D,F1D,,,L,F,X,X,X,B
2549,CRG,OR,,,0,,,,,XXXX,...,,,,,L,F,X,X,X,B


In [10]:
# Normalizing for train_numerical data
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(train_numerical)
normalized_data = pd.DataFrame(normalized_data, columns=train_numerical.columns)

In [12]:
# Normalizing for test_numerical data
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(test_numerical)
normalized_data = pd.DataFrame(normalized_data, columns=test_numerical.columns)

In [13]:
def frequent_values(df, n=3, replace_value='other'):

    for column in df.columns:
        value_counts = df[column].value_counts()
        top_n_values = value_counts.index[:n]

        df[column] = df[column].apply(lambda x: x if x in top_n_values else replace_value)

    return df

In [14]:
train_categorical = frequent_values(train_categorical, n=3, replace_value='other')
test_categorical = frequent_values(test_categorical, n=3, replace_value='other')

In [15]:
# One Hot Encoding

def one_hot_encode(df):
  
    for column in df:
        unique_values = df[column].unique()
        
        if len(unique_values):
            df = pd.get_dummies(df, columns=[column], drop_first=False, dtype=int)
              
    return df

In [16]:
train_categorical = one_hot_encode(train_categorical)
test_categorical = one_hot_encode(test_categorical)
train_categorical

Unnamed: 0,osource_AML,osource_MBC,osource_SYN,osource_other,state_CA,state_FL,state_TX,state_other,mailcode_,mailcode_B,...,mdmaud_f_X,mdmaud_f_other,mdmaud_a_C,mdmaud_a_L,mdmaud_a_X,mdmaud_a_other,geocode2_A,geocode2_B,geocode2_D,geocode2_other
85225,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,1,0,0,0
70004,0,0,0,1,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,1,0
88133,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,0,1,0
79106,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,1,0,0,0
35476,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0,0,0,1,1,0,0,0,1,0,...,1,0,0,0,1,0,1,0,0,0
54886,0,0,1,0,0,0,0,1,1,0,...,1,0,0,0,1,0,0,1,0,0
76820,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,1,0,0
860,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,0,1,0


In [17]:
X_train = pd.concat([train_numerical, train_categorical], axis=1)
X_train
X_test = pd.concat([test_numerical, test_categorical], axis=1)
X_test

Unnamed: 0,odatedw,tcode,zip,dob,age,income,hit,malemili,malevet,vietvets,...,mdmaud_f_X,mdmaud_f_other,mdmaud_a_C,mdmaud_a_M,mdmaud_a_X,mdmaud_a_other,geocode2_A,geocode2_B,geocode2_D,geocode2_other
84155,9601,1,83703.0,5801,40.0,6.0,11,0,28,33,...,1,0,0,0,1,0,0,0,0,1
75272,9101,0,91723.0,2303,75.0,,0,1,20,33,...,1,0,0,0,1,0,1,0,0,0
39719,9401,0,48203.0,4601,52.0,7.0,8,0,39,19,...,1,0,0,0,1,0,1,0,0,0
44288,9301,0,91006.0,4401,54.0,7.0,0,0,26,20,...,1,0,0,0,1,0,1,0,0,0
35420,9101,1,85076.0,4201,56.0,5.0,2,0,52,8,...,1,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38584,9401,2,35747.0,0,,,0,0,33,30,...,1,0,0,0,1,0,0,0,0,1
54025,9401,1,77029.0,3201,66.0,5.0,0,0,8,0,...,1,0,0,0,1,0,1,0,0,0
76819,9401,0,70114.0,4701,51.0,6.0,0,4,35,34,...,1,0,0,0,1,0,0,1,0,0
2549,9601,0,,5901,39.0,4.0,1,1,29,31,...,1,0,0,0,1,0,0,1,0,0


In [20]:
print("Columns in X_train:", X_train.columns)
print("Columns in X_test:", X_test.columns)

Columns in X_train: Index(['odatedw', 'tcode', 'zip', 'dob', 'age', 'income', 'hit', 'malemili',
       'malevet', 'vietvets',
       ...
       'mdmaud_f_X', 'mdmaud_f_other', 'mdmaud_a_C', 'mdmaud_a_L',
       'mdmaud_a_X', 'mdmaud_a_other', 'geocode2_A', 'geocode2_B',
       'geocode2_D', 'geocode2_other'],
      dtype='object', length=570)
Columns in X_test: Index(['odatedw', 'tcode', 'zip', 'dob', 'age', 'income', 'hit', 'malemili',
       'malevet', 'vietvets',
       ...
       'mdmaud_f_X', 'mdmaud_f_other', 'mdmaud_a_C', 'mdmaud_a_M',
       'mdmaud_a_X', 'mdmaud_a_other', 'geocode2_A', 'geocode2_B',
       'geocode2_D', 'geocode2_other'],
      dtype='object', length=569)


In [18]:
# Logistic regression
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)
LR.score(X_test, y_test)

# predictions
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report 

prediction = LR.predict(X_test)
prediction
print('Precision is:', precision_score(y_test, prediction))
print('Recall is:' , recall_score(y_test, prediction))
print('F1 is:' , f1_score(y_test, prediction))

print(classification_report(y_test, prediction))

# confusion matrix

from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, prediction)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [22]:
# Oversampling

donors['target_b'].value_counts()

numerical = donors.select_dtypes(include=np.number)
numerical = numerical.dropna(axis = 1)
numerical

Unnamed: 0,odatedw,tcode,dob,hit,malemili,malevet,vietvets,wwiivets,localgov,stategov,...,maxrdate,lastgift,lastdate,fistdate,avggift,controln,target_b,target_d,hphone_d,rfa_2f
0,8901,0,3712,0,0,39,34,18,10,2,...,9402,10.0,9512,8911,7.741935,95515,0,0.0,0,4
1,9401,1,5202,16,0,15,55,11,6,2,...,9512,25.0,9512,9310,15.666667,148535,0,0.0,0,2
2,9001,1,0,2,0,20,29,33,6,8,...,9207,5.0,9512,9001,7.481481,15078,0,0.0,1,4
3,8701,0,2801,2,0,23,14,31,3,0,...,9411,10.0,9512,8702,6.812500,172556,0,0.0,1,4
4,8601,0,2001,60,1,28,9,53,26,3,...,9601,15.0,9601,7903,6.864865,7112,0,0.0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,9601,1,0,0,14,36,47,11,7,8,...,9602,25.0,9602,9602,25.000000,184568,0,0.0,0,1
95408,9601,1,5001,1,0,31,43,19,4,1,...,9603,20.0,9603,9603,20.000000,122706,0,0.0,1,1
95409,9501,1,3801,0,0,18,46,20,7,23,...,9501,10.0,9610,9410,8.285714,189641,0,0.0,1,3
95410,8601,0,4005,0,0,28,35,20,9,1,...,9608,18.0,9701,8612,12.146341,4693,1,18.0,1,4


In [23]:
# Smote

smote = SMOTE()
X = numerical.drop("target_b", axis = 1)
y = numerical["target_b"]
x_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

0    90569
1    90569
Name: target_b, dtype: int64

In [24]:
# Logistic Regression

data_smote = pd.concat([x_sm, y_sm], axis = 1)
data_smote


Unnamed: 0,odatedw,tcode,dob,hit,malemili,malevet,vietvets,wwiivets,localgov,stategov,...,maxrdate,lastgift,lastdate,fistdate,avggift,controln,target_d,hphone_d,rfa_2f,target_b
0,8901,0,3712,0,0,39,34,18,10,2,...,9402,10.000000,9512,8911,7.741935,95515,0.000000,0,4,0
1,9401,1,5202,16,0,15,55,11,6,2,...,9512,25.000000,9512,9310,15.666667,148535,0.000000,0,2,0
2,9001,1,0,2,0,20,29,33,6,8,...,9207,5.000000,9512,9001,7.481481,15078,0.000000,1,4,0
3,8701,0,2801,2,0,23,14,31,3,0,...,9411,10.000000,9512,8702,6.812500,172556,0.000000,1,4,0
4,8601,0,2001,60,1,28,9,53,26,3,...,9601,15.000000,9601,7903,6.864865,7112,0.000000,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181133,8795,866,1731,0,0,32,4,63,4,4,...,9512,21.348194,9512,8805,10.937484,162476,30.785550,0,1,1
181134,8724,1,2425,13,0,23,30,33,4,3,...,9438,11.546004,9537,8733,9.585577,38588,13.618402,1,1,1
181135,9085,0,0,0,0,26,31,34,6,8,...,9296,7.304807,9508,9089,6.126099,21177,7.016706,0,3,1
181136,8998,0,17,0,0,40,48,22,4,4,...,9207,8.028496,9602,9006,7.409017,139563,8.014248,0,4,1


In [26]:
X = data_smote.drop('target_b',axis = 1)
y = data_smote['target_b']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = 0.2)
LR = LogisticRegression()
LR.fit(X_train, y_train)
pred = LR.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.57      0.67      0.61     18054
           1       0.60      0.50      0.54     18174

    accuracy                           0.58     36228
   macro avg       0.58      0.58      0.58     36228
weighted avg       0.58      0.58      0.58     36228



In [28]:
# Downsampling

# Tomeklinks

category_0 = numerical[numerical['target_b']==0]
category_1 = numerical[numerical['target_b']==1]

tl = TomekLinks()
X = numerical.drop('target_b', axis=1)
y = numerical['target_b']

X_tl, y_tl = tl.fit_resample(X,y)
y_tl.value_counts()

0    88286
1     4843
Name: target_b, dtype: int64

In [29]:
# Logistic Regression

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = 0.2)
LR = LogisticRegression()
LR.fit(X_train, y_train)
pred = LR.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97     18105
           1       0.00      0.00      0.00       978

    accuracy                           0.95     19083
   macro avg       0.47      0.50      0.49     19083
weighted avg       0.90      0.95      0.92     19083

