In [37]:
# Libaries

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.feature_selection import RFE
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

# 1. Read datasets

In [2]:
numerical = pd.read_csv(r"C:\Users\ljant\Desktop\Ironhack\15_Revision_and_Hypothesis-Testing\numerical.csv")
targets = pd.read_csv(r"C:\Users\ljant\Desktop\Ironhack\15_Revision_and_Hypothesis-Testing\target.csv")
donors = pd.concat([numerical, targets], axis = 1)
donors

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B,TARGET_D
0,0,60.000000,5,9,0,0,39,34,18,10,...,12.0,10.0,4,7.741935,95515,0,4,39,0,0.0
1,1,46.000000,6,9,16,0,15,55,11,6,...,25.0,25.0,18,15.666667,148535,0,2,1,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,16.0,5.0,12,7.481481,15078,1,4,60,0,0.0
3,0,70.000000,1,4,2,0,23,14,31,3,...,11.0,10.0,9,6.812500,172556,1,4,41,0,0.0
4,0,78.000000,3,2,60,1,28,9,53,26,...,15.0,15.0,14,6.864865,7112,1,2,26,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,...,25.0,25.0,9,25.000000,184568,0,1,12,0,0.0
95408,1,48.000000,7,9,1,0,31,43,19,4,...,20.0,20.0,9,20.000000,122706,1,1,2,0,0.0
95409,1,60.000000,5,9,0,0,18,46,20,7,...,10.0,10.0,3,8.285714,189641,1,3,34,0,0.0
95410,0,58.000000,7,9,0,0,28,35,20,9,...,21.0,18.0,4,12.146341,4693,1,4,11,1,18.0


In [3]:
def header_function (data):
    cols = []
    for i in range (len(data.columns)):
        cols.append(data.columns[i].lower().replace(' ','_'))
    
    data.columns = cols

    return data

header_function(donors)

Unnamed: 0,tcode,age,income,wealth1,hit,malemili,malevet,vietvets,wwiivets,localgov,...,maxramnt,lastgift,timelag,avggift,controln,hphone_d,rfa_2f,cluster2,target_b,target_d
0,0,60.000000,5,9,0,0,39,34,18,10,...,12.0,10.0,4,7.741935,95515,0,4,39,0,0.0
1,1,46.000000,6,9,16,0,15,55,11,6,...,25.0,25.0,18,15.666667,148535,0,2,1,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,16.0,5.0,12,7.481481,15078,1,4,60,0,0.0
3,0,70.000000,1,4,2,0,23,14,31,3,...,11.0,10.0,9,6.812500,172556,1,4,41,0,0.0
4,0,78.000000,3,2,60,1,28,9,53,26,...,15.0,15.0,14,6.864865,7112,1,2,26,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,...,25.0,25.0,9,25.000000,184568,0,1,12,0,0.0
95408,1,48.000000,7,9,1,0,31,43,19,4,...,20.0,20.0,9,20.000000,122706,1,1,2,0,0.0
95409,1,60.000000,5,9,0,0,18,46,20,7,...,10.0,10.0,3,8.285714,189641,1,3,34,0,0.0
95410,0,58.000000,7,9,0,0,28,35,20,9,...,21.0,18.0,4,12.146341,4693,1,4,11,1,18.0


# 2. Clean data

2.1 Get numerical data

In [4]:
data_types = donors.dtypes
print (data_types)

cat_columns = data_types[data_types == 'object']
print("Non-numeric columns:\n", cat_columns)

# Result: No non-numeric columns

tcode         int64
age         float64
income        int64
wealth1       int64
hit           int64
             ...   
hphone_d      int64
rfa_2f        int64
cluster2      int64
target_b      int64
target_d    float64
Length: 317, dtype: object
Non-numeric columns:
 Series([], dtype: object)


2.2. Dealing with NaNs

In [5]:
sum_na = donors.isna().sum()
print (sum_na)

nan_columns = sum_na[sum_na != 0]
print('Columns with NaN values:\n', nan_columns)

# Result: No columns with NaN values

tcode       0
age         0
income      0
wealth1     0
hit         0
           ..
hphone_d    0
rfa_2f      0
cluster2    0
target_b    0
target_d    0
Length: 317, dtype: int64
Columns with NaN values:
 Series([], dtype: int64)


# 3. Train-Test-Split

In [66]:
X = donors.drop(['target_b', 'target_d'], axis = 1)
Y = donors['target_b']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# 4. Logistic regression model

In [23]:
model_b = LogisticRegression()
model_b.fit(X_train, Y_train)
result = model_b.score(X_train, Y_train)
print("The accuracy of the model is = ", round(result,2))

# Accuracy of the model is 95 %

The accuracy of the model is =  0.95


In [35]:
# Selecting only relevant columns (RFE)

lm = linear_model.LinearRegression()
rfe = RFE(lm, n_features_to_select = 10)
rfe.fit(X_train,Y_train)

# After running it, it labels the top features as 1
df = pd.DataFrame(rfe.ranking_, columns=['Rank'])
df['Column_name'] = X_train.columns
df[df['Rank']==1]

Unnamed: 0,Rank,Column_name
3,1,income
20,1,pop90c4
21,1,pop90c5
31,1,eth10
77,1,dw3
79,1,dw5
81,1,dw7
82,1,dw8
83,1,dw9
97,1,hhd5


In [38]:
# Assigning the select columns from the RFE to the model
top_features = df[df['Rank'] == 1]['Column_name']

X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

In [40]:
# Running the model again
model = LogisticRegression()
model.fit(X_train, Y_train)
result = model.score(X_train, Y_train)
print("The accuracy of the model is = ", round(result,2))

# did not improve the model - keeping the original model

0,1,2,3
Dep. Variable:,target_b,R-squared (uncentered):,0.056
Model:,OLS,Adj. R-squared (uncentered):,0.056
Method:,Least Squares,F-statistic:,228.5
Date:,"Fri, 19 Jan 2024",Prob (F-statistic):,0.0
Time:,19:31:12,Log-Likelihood:,7761.4
No. Observations:,76329,AIC:,-15480.0
Df Residuals:,76309,BIC:,-15300.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
income,0.0026,0.000,5.699,0.000,0.002,0.004
pop90c4,0.0120,0.024,0.501,0.616,-0.035,0.059
pop90c5,0.0119,0.024,0.498,0.619,-0.035,0.059
eth10,0.0028,0.001,3.500,0.000,0.001,0.004
dw3,-0.0034,0.002,-2.096,0.036,-0.007,-0.000
dw5,-0.0034,0.002,-2.115,0.034,-0.007,-0.000
dw7,0.0024,0.005,0.484,0.628,-0.007,0.012
dw8,-0.0030,0.005,-0.613,0.540,-0.013,0.007
dw9,-0.0026,0.005,-0.528,0.597,-0.012,0.007

0,1,2,3
Omnibus:,61299.833,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,889842.59
Skew:,4.061,Prob(JB):,0.0
Kurtosis:,17.623,Cond. No.,6880.0


# 4. Predictions

Predictions for 'target_b'

In [24]:
predictions_target_b = model_b.predict(X_test)

print('Accuracy is: ',  accuracy_score(Y_test, predictions_target_b))
print('Precision is: ', precision_score(Y_test, predictions_target_b))
print('Recall is: ',  recall_score(Y_test, predictions_target_b))
print('F1 is: ',  f1_score(Y_test, predictions_target_b))

print(classification_report(Y_test, predictions_target_b))

Accuracy is:  0.9476497406068228
Precision is:  0.043478260869565216
Recall is:  0.0010224948875255625
F1 is:  0.001998001998001998
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     18105
           1       0.04      0.00      0.00       978

    accuracy                           0.95     19083
   macro avg       0.50      0.50      0.49     19083
weighted avg       0.90      0.95      0.92     19083



In [None]:
'''
Interpretation:
Eventhough accuracy is hight, due to the high imblance in the data, precision and recall is very low.
I need to handle the imbalance to improve the data.
'''

# 5. Model for 'target_d'
Train Data

In [31]:
# 1. Make predictions on the train set
predictions_target_b_train = model_b.predict(X_train)

# 2. Filter the data for donors
X_train_d = X_train[predictions_target_b_train == 1]
Y_train_d = Y_train[predictions_target_b_train == 1]

Test Data

In [33]:
# 1. Make predictions on the test set
predictions_target_b_test = model_b.predict(X_test)

# 2. Filter the data for donors
X_test_d = X_test[predictions_target_b_test == 1]
Y_test_d = Y_test[predictions_target_b_test == 1]

In [34]:
# 2. Train Regression Model for 'target_d' (linear regression)
model_d = sm.OLS(Y_train_d, X_train_d).fit()
model_d.summary()

0,1,2,3
Dep. Variable:,target_b,R-squared (uncentered):,0.483
Model:,OLS,Adj. R-squared (uncentered):,0.2
Method:,Least Squares,F-statistic:,1.707
Date:,"Sat, 20 Jan 2024",Prob (F-statistic):,0.0452
Time:,15:15:26,Log-Likelihood:,25.382
No. Observations:,82,AIC:,7.236
Df Residuals:,53,BIC:,77.03
Df Model:,29,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
tcode,-9.861e-06,1.94e-05,-0.508,0.614,-4.88e-05,2.91e-05
age,0.0012,0.006,0.208,0.836,-0.010,0.013
income,0.1279,0.057,2.250,0.029,0.014,0.242
wealth1,-0.0612,0.028,-2.201,0.032,-0.117,-0.005
hit,0.0067,0.039,0.173,0.863,-0.071,0.084
malemili,0.0033,0.004,0.848,0.400,-0.005,0.011
malevet,0.0058,0.007,0.793,0.431,-0.009,0.021
vietvets,0.0003,0.006,0.057,0.954,-0.012,0.012
wwiivets,0.0011,0.007,0.173,0.863,-0.012,0.014

0,1,2,3
Omnibus:,69.085,Durbin-Watson:,1.951
Prob(Omnibus):,0.0,Jarque-Bera (JB):,423.018
Skew:,2.638,Prob(JB):,1.39e-92
Kurtosis:,12.797,Cond. No.,1e+16


In [40]:
predictions_target_d = model_d.predict(X_test_d)

In [41]:
mse = mean_squared_error(Y_test_d, predictions_target_d)
mae = mean_absolute_error(Y_test_d, predictions_target_d)
print("R2 value is = ",round(r2_score(Y_test_d, predictions_target_d),2))
print("The mse of the model is = ", round(mse,2))
print("The root mse of the model is = ",round(np.sqrt(mse),2))
print("The mean absolute error of the model is = ",round(mae,2))

R2 value is =  -0.8
The mse of the model is =  0.07
The root mse of the model is =  0.27
The mean absolute error of the model is =  0.14


# 6. Improve Model by dealing with Imbalenced Data

- SMOTE (Oversampling)

In [None]:
X = donors.drop(['target_b', 'target_d'], axis = 1)
Y = donors['target_b']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [67]:
sm = SMOTE(random_state = 0, sampling_strategy = 1.0)
X_train_SMOTE, Y_train_SMOTE = sm.fit_resample(X_train, Y_train)
X_train_SMOTE

# Build model
model = LogisticRegression(max_iter = 1000)
model.fit(X_train_SMOTE, Y_train_SMOTE)

In [68]:
result = model.score(X_train, Y_train)
print("The accuracy of the model is = ", round(result,2))

The accuracy of the model is =  0.59


In [69]:
predictions = model.predict(X_train_SMOTE)

print('Accuracy is: ',  accuracy_score(Y_train_SMOTE, predictions))
print('Precision is: ', precision_score(Y_train_SMOTE, predictions))
print('Recall is: ',  recall_score(Y_train_SMOTE, predictions))
print('F1 is: ',  f1_score(Y_train_SMOTE, predictions))

print(classification_report(Y_train_SMOTE, predictions))

Accuracy is:  0.6267042945462574
Precision is:  0.6182284088128871
Recall is:  0.6625496798410245
F1 is:  0.6396221764819515
              precision    recall  f1-score   support

           0       0.64      0.59      0.61     72464
           1       0.62      0.66      0.64     72464

    accuracy                           0.63    144928
   macro avg       0.63      0.63      0.63    144928
weighted avg       0.63      0.63      0.63    144928



- TomekLinks (undersampling)

In [72]:
X = donors.drop(['target_b', 'target_d'], axis = 1)
Y = donors['target_b']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [76]:
Y_train_TL.value_counts()

0    70377
1     3865
Name: target_b, dtype: int64

In [73]:
tl = TomekLinks()
X_train_TL, Y_train_TL = tl.fit_resample(X_train, Y_train)

# Build model
model = LogisticRegression(max_iter = 1000)
model.fit(X_train_TL, Y_train_TL)

In [74]:
predictions = model.predict(X_train_TL)

print('Accuracy is: ',  accuracy_score(Y_train_TL, predictions))
print('Precision is: ', precision_score(Y_train_TL, predictions))
print('Recall is: ',  recall_score(Y_train_TL, predictions))
print('F1 is: ',  f1_score(Y_train_TL, predictions))

print(classification_report(Y_train_TL, predictions))

Accuracy is:  0.9479001104496108
Precision is:  0.0
Recall is:  0.0
F1 is:  0.0
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     70377
           1       0.00      0.00      0.00      3865

    accuracy                           0.95     74242
   macro avg       0.47      0.50      0.49     74242
weighted avg       0.90      0.95      0.92     74242



In [77]:
Y_train_TL.value_counts()

0    70377
1     3865
Name: target_b, dtype: int64

In [78]:
'''
The explanation for the poor improvement of the model lays in this case in the inefficient TomekLinks method.
An imbalance of the data is still given and influences the model greatly.
'''

''