# Introduction

This notebook answers the following question(s):  

- What is the probability of volunteer making a donation later without volunteering ?
- What is the correlation between donating and volunteering ?


by Fred Etter - December, 2019

In [1]:
# Import modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import linear_model
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import sklearn
from sklearn.feature_selection import SelectFromModel
from datetime import datetime
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import seaborn as sns

In [2]:
# Read in the data
df = pd.read_csv('file1_12_3.csv', low_memory=False)

In [3]:
# show the first 5 lines of the data
df.head()

Unnamed: 0,ContactId,Year,PledgeTotal,VolType,VolunteerActivityCnt,BirthYear,Gender
0,874ddbce-11cd-e111-941f-00259073dc22,2007,52.0,,,1977.0,female
1,9a4ddbce-11cd-e111-941f-00259073dc22,2007,80.0,,,1968.0,female
2,5e4edbce-11cd-e111-941f-00259073dc22,2007,10000.0,,,1958.0,male
3,c150dbce-11cd-e111-941f-00259073dc22,2007,120.0,,,,male
4,0b53dbce-11cd-e111-941f-00259073dc22,2007,500.16,,,1947.0,female


In [4]:
# show the number of rows and columns of the original data
df.shape

(385722, 7)

In [5]:
# drop Council, Committe, Board members
df = df.drop(df[df.VolType == 'Council, Committe or Board'].index).fillna(0)

In [6]:
df = df.fillna(0)

In [7]:
df.sample(10)

Unnamed: 0,ContactId,Year,PledgeTotal,VolType,VolunteerActivityCnt,BirthYear,Gender
135262,98b73991-702e-4027-99ea-31a9713c7d5a,2012,52.0,0,0.0,0.0,female
322854,86320cce-11cd-e111-941f-00259073dc22,2013,1091.31,0,0.0,0.0,male
344935,bc3cfab3-ecaf-e511-a59d-26d4160798d6,2015,130.0,0,0.0,1987.0,female
103699,e469b7d0-11cd-e111-941f-00259073dc22,2008,240.0,0,0.0,1959.0,female
369499,7f0653fb-4789-e911-812e-0050569e2a4f,2018,25.0,0,0.0,0.0,female
373211,5657e8cf-11cd-e111-941f-00259073dc22,2019,858.0,0,0.0,0.0,male
249385,3c3915d2-11cd-e111-941f-00259073dc22,2014,550.0,0,0.0,0.0,male
277375,cbab9504-ac89-e611-a7f7-26d4160798d6,2018,24.0,0,0.0,0.0,female
57447,03c555d0-11cd-e111-941f-00259073dc22,2014,50.0,0,0.0,0.0,female
192763,dbcc55d0-11cd-e111-941f-00259073dc22,2007,130.0,0,0.0,0.0,male


In [8]:
# df_non_only = df.drop(df[df.VolType == 'Standard'].index).fillna(0)
df_std_only = df.drop(df[df.VolType == 0].index).fillna(0)

In [9]:
# create a series that captures the earliest year that a non-volunteer donated
a = df_std_only.groupby('ContactId')['Year'].transform('min')

In [10]:
# create a new column and populate it:
df_std_only['min_year'] = a

In [11]:
# display a random sample of 10 rows of the dataframe
df_std_only.sample(10)

Unnamed: 0,ContactId,Year,PledgeTotal,VolType,VolunteerActivityCnt,BirthYear,Gender,min_year
383750,3dd755d0-11cd-e111-941f-00259073dc22,2013,1352.0,Standard,1.0,1984.0,female,2010
383325,a1fd81ac-fc25-4ea7-b7c2-ca3d8f39f486,2017,347.88,Standard,1.0,0.0,female,2017
376435,4bfa078e-ab77-e211-a0e0-4040184c1c1a,2012,0.0,Standard,1.0,0.0,female,2012
381533,d7d255d0-11cd-e111-941f-00259073dc22,2013,504.0,Standard,1.0,0.0,male,2013
384702,bb23a584-3475-e211-a0e0-4040184c1c1a,2013,0.0,Standard,2.0,0.0,male,2012
376000,8f55b1d1-11cd-e111-941f-00259073dc22,2011,0.0,Standard,1.0,0.0,female,2011
384522,fdbc3c6c-4456-e511-97d1-26d4160798d6,2016,0.0,Standard,2.0,0.0,female,2015
380193,1e5f22ab-149e-e211-a0e0-4040184c1c1a,2012,0.0,Standard,1.0,1969.0,female,2012
375652,e34cb1d1-11cd-e111-941f-00259073dc22,2011,0.0,Standard,1.0,0.0,female,2011
383295,e1fdf2d4-62f7-439e-a9c8-d49a22a64d89,2017,130.0,Standard,1.0,0.0,female,2017


In [12]:
# drop rows where Year != min_year
df_std_only = df_std_only.drop(df_std_only[df_std_only.Year != df_std_only.min_year].index)

In [13]:
df_std_only.sample(10)

Unnamed: 0,ContactId,Year,PledgeTotal,VolType,VolunteerActivityCnt,BirthYear,Gender,min_year
376750,593f15d2-11cd-e111-941f-00259073dc22,2011,0.0,Standard,1.0,0.0,male,2011
383599,25ef9166-3132-4cc5-ac2d-6f3f3ceeb89f,2014,130.0,Standard,1.0,0.0,male,2014
382553,1b6cf804-3a75-e211-a0e0-4040184c1c1a,2012,0.0,Standard,1.0,0.0,male,2012
381485,20b962d1-11cd-e111-941f-00259073dc22,2013,50.0,Standard,1.0,0.0,male,2013
382548,905ffb29-8ea3-e211-a0e0-4040184c1c1a,2012,0.0,Standard,1.0,0.0,female,2012
378736,9adfc3a4-7224-e711-98e2-005056975e92,2017,0.0,Standard,1.0,0.0,female,2017
379800,23334c7c-a231-e711-b10c-005056975e92,2017,0.0,Standard,1.0,0.0,male,2017
376342,245ce8cf-11cd-e111-941f-00259073dc22,2012,0.0,Standard,1.0,0.0,female,2012
377983,c345e8cf-11cd-e111-941f-00259073dc22,2016,0.0,Standard,1.0,1967.0,female,2016
376102,4827ede2-76f0-e211-a0e0-4040184c1c1a,2011,0.0,Standard,1.0,0.0,female,2011


In [14]:
df_std_only.shape

(6147, 8)

In [15]:
df_std_only = df_std_only.drop_duplicates('ContactId')

In [16]:
df_std_only.shape

(6147, 8)

In [17]:
df = df.drop(df[df.VolType == 'Standard'].index).fillna(0)
df = df.drop_duplicates('ContactId')

In [18]:
df.shape

(115554, 7)

In [19]:
# create a new dataframe
new_df = pd.merge(df_std_only, df, how='left', left_on=['ContactId'], right_on = ['ContactId'])

In [20]:
new_df.shape

(6147, 14)

In [27]:
new_df.sample(5)

Unnamed: 0,ContactId,Year_x,PledgeTotal_x,VolType_x,VolunteerActivityCnt_x,BirthYear_x,Gender_x,min_year,Year_y,PledgeTotal_y,VolType_y,VolunteerActivityCnt_y,BirthYear_y,Gender_y
3481,a41f8a92-e3e3-e211-a0e0-4040184c1c1a,2013,0.0,Standard,1.0,0.0,female,2013,,,,,,
4164,bc50e8cf-11cd-e111-941f-00259073dc22,2012,120.0,Standard,1.0,1950.0,female,2012,2007.0,50.0,0.0,0.0,1950.0,female
3848,f09b654f-7e96-e411-b065-26d4160798d6,2018,0.0,Standard,1.0,1966.0,female,2018,2014.0,450.0,0.0,0.0,1966.0,female
2323,d2e261ab-e86b-e311-a454-4040184c1c1a,2013,0.0,Standard,1.0,0.0,female,2013,,,,,,
2157,a548e8cf-11cd-e111-941f-00259073dc22,2013,0.0,Standard,1.0,1958.0,female,2013,2007.0,100.0,0.0,0.0,1958.0,female


In [28]:
new_df_final = new_df[new_df.Year_x <= new_df.Year_y]

In [29]:
new_df_final.shape

(501, 14)

In [30]:
new_df_final.sample(5)

Unnamed: 0,ContactId,Year_x,PledgeTotal_x,VolType_x,VolunteerActivityCnt_x,BirthYear_x,Gender_x,min_year,Year_y,PledgeTotal_y,VolType_y,VolunteerActivityCnt_y,BirthYear_y,Gender_y
4652,e1a59537-3ec3-e811-8101-0050569e2a4f,2018,0.0,Standard,1.0,0.0,0,2018,2019.0,600.0,0.0,0.0,0.0,0
1843,6053b1d1-11cd-e111-941f-00259073dc22,2010,0.0,Standard,1.0,1981.0,female,2010,2013.0,208.0,0.0,0.0,1981.0,female
5847,34ca3f5e-d3dc-e211-a0e0-4040184c1c1a,2013,1000.0,Standard,2.0,0.0,female,2013,2014.0,1000.0,0.0,0.0,0.0,female
4406,b826039d-86a3-e211-a0e0-4040184c1c1a,2012,0.0,Standard,1.0,0.0,female,2012,2017.0,1040.0,0.0,0.0,0.0,female
1074,25712864-6f87-e311-a454-4040184c1c1a,2014,1020.0,Standard,1.0,1987.0,female,2014,2015.0,1000.0,0.0,0.0,1987.0,female


In [31]:
new_df_final.shape[0]/new_df.shape[0]

0.08150317227916057

# Conclusion:

So, the probability of a volunteer making a donation in a later year without volunteering is about **8.2%**.

One caveat: The supporter may make a donation later in 2019 or after 2019.

In a previous notebook, it was established that a non-volunteer is about **2.2%** likely to volunteer later.  What do these 2 facts say about the correlation between volunteering and non-volunteering (but making a donation) ?  

In short, not much.  

It is fair to say that a volunteer is much more likely to make a donation than someone in the general population because presumably, the percentage of donators in the general population is much less than 8.2%.  

It is also fair to say that a non-volunteer who makes a donation is more likely to volunteer than someone in the general population because 2.2% would be higher than the volunteer rate of the total population.  
