<p><b>Analysis of who is more likely to donate, in order to target them for donations. The data is explored in depth and cleaned.
Four machine learning classifiers are trained and evaluated on accuracy and training time.</b></p>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6 and the old
# names were removed later, so use whichever spelling this installation provides.
paper_style = next(
    (name for name in ('seaborn-v0_8-paper', 'seaborn-paper') if name in plt.style.available),
    'default',
)
plt.style.use(paper_style)

In [None]:
# relative path to the raw data file
file_name = 'data/adult.data'
# first pass: read the file with pandas' default header handling to inspect its layout
donor_df = pd.read_csv(filepath_or_buffer=file_name)


## EDA - Exploratory Data Analysis

In [None]:
donor_df.head()

In [None]:
# The dataset loaded above has no column names, so they are supplied explicitly on re-read.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'Income',
]
donor_df = pd.read_csv(file_name, names=column_names)

In [None]:
donor_df.shape # number of rows, columns

In [None]:
donor_df.info()

In [None]:
donor_df.head()

In [None]:
# count the rows that contain at least one null value
rows_with_nulls = donor_df.isnull().any(axis=1)
rows_with_nulls.sum()

In [None]:
# strip leading/trailing whitespace from the Income labels
# (later cells match on the exact strings "<=50K" and ">50K")
donor_df['Income'] = donor_df['Income'].str.strip()

In [None]:
# bar plot to visualize the number of donors per Income category
# The axes is created explicitly and handed to pandas: DataFrame.plot.bar() opens its own
# figure when no `ax` is given, which left the plt.figure(figsize=...) figure empty and its
# size unused.
fig, ax = plt.subplots(figsize=(10,8))
pd.DataFrame(donor_df.Income.value_counts()).plot.bar(ax=ax)
ax.set_title("Donors by Income");

In [None]:
# split the column names into categorical (object dtype) and numerical (everything else)
is_categorical = (donor_df.dtypes == 'object').values
cat_cols = list(donor_df.columns[is_categorical])
num_cols = list(donor_df.columns[~is_categorical])

In [None]:
print(f'There are {len(cat_cols)} Categorical features:\n{cat_cols}')
print(f'\nThere are {len(num_cols)} Numerical features:\n{num_cols}')

In [None]:
# Income breakdown per category. The last two categorical columns are left out:
# native-country (it has many unique values) and the Income target itself.
breakdown_cols = cat_cols[:-2]
income_breakdown = pd.concat(
    [pd.crosstab(donor_df[name], donor_df.Income) for name in breakdown_cols],
    keys=breakdown_cols,
)

In [None]:
income_breakdown.columns = ['less_50K','more_50K']

In [None]:
income_breakdown

In [None]:
# Adding a column with the percentage of each category that earns more than 50K
category_totals = income_breakdown['less_50K'] + income_breakdown['more_50K']
share_more_50K = income_breakdown['more_50K'] / category_totals
income_breakdown['pct_more_50K'] = (100 * share_more_50K).round(2)

In [None]:
income_breakdown

In [None]:
# income split for people working at least 40 hours per week versus fewer
works_40_plus = donor_df['hours-per-week'] >= 40
hours_income = pd.crosstab(works_40_plus, donor_df.Income)
hours_income.columns = ['less_50K','more_50K']
group_totals = hours_income['less_50K'] + hours_income['more_50K']
hours_income['pct_more50K'] = (100 * (hours_income['more_50K'] / group_totals)).round(2)
print(f'Percentage of people making more than 50K by higher or lower than 40 hours per week: \n\n{hours_income}')

In [None]:
# marital status breakdown in focus
income_breakdown.loc['marital-status']

In [None]:
## correlation matrix
def corr_matrix(df):
    """Plot the lower triangle of a dataframe's correlation matrix as an annotated heatmap.

    Parameters:
        df: Input dataframe. Only its numeric (and boolean) columns are
            correlated; columns of any other dtype are left out.
    Output:
        Correlation Matrix Plot, drawn on a new 12x8 figure. Nothing is returned.

    """
    # Select the numeric columns explicitly: DataFrame.corr() no longer drops non-numeric
    # columns silently in recent pandas versions and raises on string data instead.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr_mat = numeric_df.corr()
    plt.figure(figsize=(12,8))
    # Boolean mask hiding the upper triangle (diagonal included), which mirrors the lower one
    mask = np.zeros_like(corr_mat, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    sns.heatmap(corr_mat,annot=True,mask=mask)
    plt.title("Correlation Matrix",fontsize =17)
    plt.xticks(rotation=45,fontsize =12)
    plt.yticks(rotation=45,fontsize =12)
    plt.tight_layout()


In [None]:
corr_matrix(donor_df)

In [None]:
help(corr_matrix)

In [None]:
# Visualizing the distribution of the age column
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# histplot/displot when upgrading.
age_hist_style = dict(color='b', edgecolor="k", linewidth=1, alpha=0.3)

plt.figure(figsize=(8,6))
plt.grid(False)
sns.distplot(donor_df.age, bins=20, hist_kws=age_hist_style)
plt.title("Age Distribution Plot", fontsize=16)
plt.xlabel(xlabel="Age", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()

In [None]:
sns.distplot(donor_df.fnlwgt) # the fnlwgt column will be dropped

In [None]:
# testing skewness and collecting the skewed column names; fnlwgt is excluded as that column will be dropped.
skewed_cols = [] # names of the numerical columns considered skewed
for col in num_cols:
    col_skew = donor_df[col].skew()
    print(f'{col} is skewed by {col_skew:.2f}')
    # a column counts as skewed when its skewness is <= -1 or >= 1
    if col != 'fnlwgt' and abs(col_skew) >= 1:
        skewed_cols.append(col)

print(f'Skewed colums: {skewed_cols}')

In [None]:
# Histograms of the capital-gain and capital-loss columns before any transformation.
# Both panels are created up front and the heading is set with fig.suptitle(): calling
# plt.title() before plt.subplot() attaches the title to an extra, implicitly created axes
# instead of the figure.
fig, (gain_ax, loss_ax) = plt.subplots(1, 2, figsize=(14,8))
fig.suptitle('Skewed Gain & Loss columns before transformation')
sns.distplot(donor_df['capital-gain'], kde=False, ax=gain_ax)
sns.distplot(donor_df['capital-loss'], kde=False, ax=loss_ax);

In [None]:
# compare which transformation reduces the skewness of the skewed columns the most
log_skew = np.log(donor_df[skewed_cols] + 1).skew()
print(f"Log Transformation gives: \n{log_skew}")
# using sqrt
sqrt_skew = np.sqrt(donor_df[skewed_cols]).skew()
print(f"\n\nSqrt Transformation gives: \n{sqrt_skew}")


In [None]:
# the better transformation seems to be the log transformation
# NOTE: this overwrites the skewed columns in place, so running the cell a second time
# applies log(x+1) again to the already transformed values
donor_df[skewed_cols] = donor_df[skewed_cols].apply(lambda x: np.log(x+1))

In [None]:
donor_df.head()

In [None]:
# donor potentials: age <20 or age>70
outside_age_band = (donor_df.age < 20) | (donor_df.age > 70)
print('Potential donors age <20 or age>70:\n{}'.format(len(donor_df[outside_age_band])))

In [None]:
stage1_df = donor_df.copy()

### About the dataset:
<ol>
    <li>marital-status column: Married-AF-spouse and Married-civ-spouse could be combined into one value</li>
    <li>the workclass column contains the placeholder value '?', which needs to be replaced or removed</li>
    <li>there seems to be weak to no correlation between the features</li>
    <li>amend the target column 'Income' with 0 & 1? </li>
    <li>There are 9 Category columns & 6 numerical</li>
    <li>remove the fnlwgt column</li>
    <li>use log transformation for capital gain and loss columns</li> 

    
</ol>

In [None]:
stage1_df.head()

### Preprocessing

In [None]:
# drop the rows whose occupation is the ' ?' placeholder
has_occupation = donor_df['occupation'] != ' ?'
donor_df = donor_df[has_occupation]

In [None]:
# removing the ~500 rows whose native-country is the ' ?' placeholder
has_country = donor_df['native-country'] != ' ?'
donor_df = donor_df[has_country]

In [None]:
# removing fnlwgt column
# Reassign instead of using inplace=True: donor_df is a filtered selection at this point,
# and with errors='ignore' the cell can be re-run without raising a KeyError.
donor_df = donor_df.drop(columns=['fnlwgt'], errors='ignore')

In [None]:
donor_df.columns

In [None]:
donor_df.head()

### Preparing for Modeling

In [None]:
# using MinMaxScaler to transform numerical data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# creating a new dataframe for transformation
# .copy() is required for that: pd.DataFrame(data=donor_df) does not copy the underlying
# data, so later column assignments could write through to donor_df.
transformed_df = donor_df.copy()
transformed_df.head()

In [None]:
# num_cols was collected before 'fnlwgt' was dropped, so only the numerical columns that
# are still present are scaled; selecting the full list would raise a KeyError.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split.
cols_to_scale = [col for col in num_cols if col in transformed_df.columns]
transformed_df[cols_to_scale] = scaler.fit_transform(transformed_df[cols_to_scale])

In [None]:
transformed_df.head()

In [None]:
# columns that hold exactly two distinct values
unique_counts = transformed_df.nunique()
cols_binary = unique_counts.index[(unique_counts == 2).values].tolist()
cols_binary

In [None]:
# transform binary columns into 1 for male, 0 for female
# An explicit mapping yields an integer column directly; replace() on an object column
# depends on implicit downcasting, which newer pandas versions deprecate.
transformed_df['sex'] = transformed_df['sex'].str.strip().map({"Male": 1, "Female": 0})

In [None]:
# transform income 1 for more than 50K, and 0 for less than 50K
# An explicit mapping yields an integer target directly; replace() on an object column
# depends on implicit downcasting, which newer pandas versions deprecate.
transformed_df["Income"] = transformed_df["Income"].map({"<=50K": 0, ">50K": 1})

In [None]:
transformed_df.head()

In [None]:
# check one more time for null values
transformed_df.isnull().sum()

In [None]:
# separate the target column from the features
income_target = transformed_df.Income

In [None]:
# keep only the feature columns now that the target is stored separately
transformed_df = transformed_df.drop(columns=['Income'])

In [None]:
transformed_df = pd.get_dummies(transformed_df)

In [None]:
transformed_df.head()

## Modeling

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# splitting the data into train and test sets
# A fixed random_state makes the split, and therefore the reported scores, repeatable across runs.
X_train,X_test,y_train,y_test = train_test_split(transformed_df,income_target,test_size=0.3,random_state=42)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# fixed random_state so the fitted tree and its scores are repeatable across runs
dt_classifier = DecisionTreeClassifier(random_state=42)

In [None]:
dt_classifier.fit(X_train,y_train)

In [None]:
predictions = dt_classifier.predict(X_test)

In [None]:
# accuracy 
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
accuracy_score(y_test,predictions)

In [None]:
dt_cm = confusion_matrix(y_test,predictions)
dt_cm

In [None]:
def cm_plot(model_cm,model_name):
    """Confusion Matrix Plotting

    INPUT:
        model_cm: the model's 2x2 confusion matrix, read as model_cm[true][predicted]
            with the negative class first and the positive class second
        model_name: the model's name which could be retrieved by using type(model).__name__

    OUTPUT:
        a plot with the model's name in the title of the plot (displayed with
        plt.show(); nothing is returned)

    """
    # No plt.clf() before creating the figure: with no figure open it would first create
    # an empty one, which then appears as a blank extra output next to the real plot.
    plt.figure(figsize=(10,8))
    plt.imshow(model_cm, interpolation='nearest', cmap=plt.cm.coolwarm,alpha=0.3)
    classNames = ['Negative','Positive']
    plt.title(f'{model_name} Confusion Matrix',fontsize=16)
    plt.ylabel('True label',fontsize=14)
    plt.xlabel('Predicted label',fontsize=14)
    tick_marks = np.arange(len(classNames))
    plt.xticks(tick_marks, classNames, rotation=45,fontsize=12)
    plt.yticks(tick_marks, classNames,fontsize=12)
    # annotation for each cell: rows are the true class, columns the predicted class
    cell_labels = [['TN','FP'], ['FN', 'TP']]
    for i, label_row in enumerate(cell_labels):
        for j, label in enumerate(label_row):
            # centre the annotation inside its cell instead of starting it at the cell centre
            plt.text(j, i, f'{label} = {model_cm[i][j]}', fontsize=12, color='black',
                     ha='center', va='center')
    plt.tight_layout()
    plt.show()


In [None]:
cm_plot(dt_cm,type(dt_classifier).__name__);

In [None]:
print(classification_report(y_test,predictions)) # classification report

In [None]:
import time # to measure time required for training the data
def model_accuracy(model, X_train=None,X_test=None,y_train=None,y_test=None):
    """Fit the model, predict on the test set and print its evaluation.

    INPUT:
        model: estimator exposing fit(X, y) and predict(X)
        X_train, X_test, y_train, y_test: datasets to train and evaluate on; any
            argument left as None is taken from the notebook-level variable of
            the same name at call time

    OUTPUT:
        [model_cm, model_name]:
            model_cm: model confusion matrix
            model_name: the class name of the input model
        prints:
            - time taken to train the data for the input model
            - accuracy score
            - classification report"""
    # Resolve omitted datasets when the function is called. Using the globals as default
    # values would freeze whichever split existed when this cell was executed, so a later
    # re-split would silently be ignored.
    if X_train is None:
        X_train = globals()['X_train']
    if X_test is None:
        X_test = globals()['X_test']
    if y_train is None:
        y_train = globals()['y_train']
    if y_test is None:
        y_test = globals()['y_test']

    # perf_counter is a monotonic, high-resolution clock intended for measuring durations
    start = time.perf_counter() # start time of training data
    model.fit(X_train,y_train) # fit the model
    stop = time.perf_counter() # end time of training data
    y_predict = model.predict(X_test) # use the model to predict
    model_name = type(model).__name__ # obtain the model's name
    accuracy = accuracy_score(y_test, y_predict) # assess accuracy
    model_cm = confusion_matrix(y_test,y_predict)# generate confusion matrix
    report = classification_report(y_test,y_predict)

    # print out results
    print(f'{model_name} trained in {stop-start:.3f} seconds')
    print(f'\n\n{model}\n\n accuracy: {accuracy*100:.2f}%\n')
    print(f'Classification Report:\n\n {report}\n\n')

    return [model_cm, model_name]

In [None]:
print(model_accuracy.__doc__)

In [None]:
# using different classifiers and assessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier


# The estimators that use randomness get a fixed random_state so the accuracies
# are repeatable between runs. SVC is left at its defaults.
decision_trees = DecisionTreeClassifier(random_state=42)
svc_classifier = SVC()
adaboost_classifier = AdaBoostClassifier(random_state=42)
random_forest_classifier = RandomForestClassifier(random_state=42)

# evaluation order: decision tree, SVC, AdaBoost, random forest
classifiers = [
    decision_trees,
    svc_classifier,
    adaboost_classifier,
    random_forest_classifier,
]

classifiers

In [None]:
# train and evaluate every classifier, then plot its confusion matrix
for model in classifiers:
    model_cm, model_name = model_accuracy(model)
    cm_plot(model_cm, model_name)
    print('*' * 80)
    

# Model Evaluation:
<ul style="color:blue;font-size:25px;">
    <li>AdaBoost Classifier performed with the highest accuracy (85.20%) and the second shortest training time.</li>
    <li>Decision Trees performed the worst with 81.21% accuracy but had the shortest training time.</li>
    <li>SVC performed relatively well in terms of accuracy but had the longest training time, at more than 38s.</li>
    <li>It is therefore recommended to go with AdaBoost for this particular project.</li>
    <li>To improve the model:
        <ul>
            <li>Unify some of the values under marriage column</li>
            <li>Remove the extremes in the age category (suggested thresholds: age less than 20 or higher than 70)</li>
        </ul>
    </li>


</ul>