# Predict Blood Donations

## Libaries

In [74]:
import pandas as pd
import numpy as np
import plotly.express as px

In [76]:
transfusion = pd.read_csv('data/transfusion.data')
transfusion = transfusion.rename(columns={'whether he/she donated blood in March 2007': 'Target'})
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Recency (months)       748 non-null    int64
 1   Frequency (times)      748 non-null    int64
 2   Monetary (c.c. blood)  748 non-null    int64
 3   Time (months)          748 non-null    int64
 4   Target                 748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [78]:
datatype_chg = ['Recency (months)','Frequency (times)','Time (months)']
transfusion[datatype_chg] = transfusion[datatype_chg].astype('int8')
transfusion['Monetary (c.c. blood)'] = transfusion['Monetary (c.c. blood)'].astype('int16')

In [79]:
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Recency (months)       748 non-null    int8 
 1   Frequency (times)      748 non-null    int8 
 2   Monetary (c.c. blood)  748 non-null    int16
 3   Time (months)          748 non-null    int8 
 4   Target                 748 non-null    int64
dtypes: int16(1), int64(1), int8(3)
memory usage: 9.6 KB


In [80]:
transfusion.describe()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),Target
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


In [82]:
px.imshow(transfusion.corr())

## Splitting and Scaling Data

In [83]:
X = transfusion.iloc[ : ,: -1]
y = transfusion.iloc[ : , -1]

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [85]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Logistic Regression

In [90]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

In [91]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[131,   1],
       [ 50,   5]])

In [92]:
accuracy_score(y_test, y_pred)

0.7272727272727273

## SVM Linear + RBF

In [93]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train, y_train)

In [94]:
y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[132,   0],
       [ 55,   0]])

In [95]:
accuracy_score(y_test, y_pred)

0.7058823529411765

In [96]:
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

In [97]:
y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[131,   1],
       [ 54,   1]])

In [98]:
accuracy_score(y_test, y_pred)

0.7058823529411765

## Random Forest

In [100]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)

In [101]:
y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[110,  22],
       [ 39,  16]])

In [102]:
accuracy_score(y_test, y_pred)

0.6737967914438503

In [111]:
np.sum(transfusion['Target']) / transfusion['Target'].count()

0.23796791443850268