# Introduction

This notebook answers the following question(s):  

- What is the probability that a volunteer makes a donation within 5 years of volunteering?


by Fred Etter - December, 2019

In [1]:
# Import modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import linear_model
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import sklearn
from sklearn.feature_selection import SelectFromModel
from datetime import datetime
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import seaborn as sns

In [2]:
# Read the source data into a DataFrame.
# low_memory=False has pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
df = pd.read_csv('file1_12_3.csv', low_memory=False)

In [3]:
# Preview the first 5 rows of the raw data
df.head()

Unnamed: 0,ContactId,Year,PledgeTotal,VolType,VolunteerActivityCnt,BirthYear,Gender
0,874ddbce-11cd-e111-941f-00259073dc22,2007,52.0,,,1977.0,female
1,9a4ddbce-11cd-e111-941f-00259073dc22,2007,80.0,,,1968.0,female
2,5e4edbce-11cd-e111-941f-00259073dc22,2007,10000.0,,,1958.0,male
3,c150dbce-11cd-e111-941f-00259073dc22,2007,120.0,,,,male
4,0b53dbce-11cd-e111-941f-00259073dc22,2007,500.16,,,1947.0,female


In [4]:
# (rows, columns) of the raw data as loaded
df.shape

(385722, 7)

In [5]:
# Exclude council / committee / board members, then replace every missing
# value (in all columns) with 0.
# NOTE(review): the literal below is spelled 'Committe'; presumably it mirrors
# the spelling used in the data -- confirm before changing it.
is_board_member = df['VolType'] == 'Council, Committe or Board'
df = df.loc[~is_board_member].fillna(0)

In [6]:
# Replace any remaining missing values with 0.
# (The previous cell already applies fillna(0) to the same frame.)
df = df.fillna(0)

In [7]:
# Spot-check 10 random rows after filling missing values (no random_state is
# set, so the rows shown differ from run to run)
df.sample(10)

Unnamed: 0,ContactId,Year,PledgeTotal,VolType,VolunteerActivityCnt,BirthYear,Gender
167791,8f56dbce-11cd-e111-941f-00259073dc22,2016,52.0,0,0.0,0.0,male
338380,0f62dbce-11cd-e111-941f-00259073dc22,2015,155.0,0,0.0,1966.0,female
319482,883915d2-11cd-e111-941f-00259073dc22,2011,208.0,0,0.0,1956.0,female
300435,8f6ab7d0-11cd-e111-941f-00259073dc22,2009,241.2,0,0.0,0.0,male
80572,cc4cb1d1-11cd-e111-941f-00259073dc22,2018,300.0,0,0.0,0.0,female
265154,c85a5704-a719-4a19-93d1-d048b6c86c47,2016,25.0,0,0.0,0.0,female
374992,7926d4d9-406e-4a60-b7da-ae35043381a2,2018,130.0,0,0.0,0.0,female
190905,d061b7d0-11cd-e111-941f-00259073dc22,2007,50.0,0,0.0,1980.0,female
249580,2405dcd9-68eb-4b0f-a0fc-35fbcbdaa6bb,2014,52.0,0,0.0,0.0,female
34336,3f51b7d0-11cd-e111-941f-00259073dc22,2011,52.0,0,0.0,0.0,female


In [8]:
# Build the volunteer-only frame: keep rows whose VolType is not 0.
# Missing VolType values were replaced by 0 earlier, so 0 presumably marks
# rows without a volunteer record.
has_vol_type = df['VolType'] != 0
df_std_only = df.loc[has_vol_type].fillna(0)

In [9]:
# For every row, look up the earliest Year among that contact's rows in
# df_std_only (transform keeps the result aligned with the original rows).
# NOTE(review): since df_std_only holds volunteer rows, this is presumably the
# first year the contact volunteered rather than donated -- confirm.
a = df_std_only.groupby('ContactId')['Year'].transform('min')

In [10]:
# Attach each contact's earliest year to its rows as the 'min_year' column
df_std_only = df_std_only.assign(min_year=a)

In [11]:
# Spot-check 10 random rows, now including the min_year column
df_std_only.sample(10)

Unnamed: 0,ContactId,Year,PledgeTotal,VolType,VolunteerActivityCnt,BirthYear,Gender,min_year
381489,148603d1-11cd-e111-941f-00259073dc22,2014,52.0,Standard,1.0,1984.0,male,2014
383905,896ab7d0-11cd-e111-941f-00259073dc22,2012,720.0,Standard,1.0,0.0,female,2011
378774,dd9ef15d-b7dd-e211-a0e0-4040184c1c1a,2013,0.0,Standard,1.0,0.0,male,2013
376427,5d3eb1d1-11cd-e111-941f-00259073dc22,2013,0.0,Standard,1.0,0.0,female,2013
376201,f63e15d2-11cd-e111-941f-00259073dc22,2011,0.0,Standard,1.0,0.0,male,2011
382545,b3d735cf-11cd-e111-941f-00259073dc22,2011,0.0,Standard,1.0,0.0,female,2011
379506,6053b1d1-11cd-e111-941f-00259073dc22,2012,182.0,Standard,1.0,1981.0,female,2010
383566,794615d2-11cd-e111-941f-00259073dc22,2014,52.0,Standard,1.0,1954.0,male,2014
377160,fe0369a8-7ec3-4c57-ade7-0f3ce30ce32d,2013,25.0,Standard,1.0,0.0,female,2013
384228,01b15282-a231-e711-b10c-005056975e92,2017,0.0,Standard,2.0,0.0,female,2017


In [12]:
# Keep only the rows from each contact's earliest year
# (i.e. rows where Year equals min_year)
is_first_year = df_std_only['Year'] == df_std_only['min_year']
df_std_only = df_std_only.loc[is_first_year]

In [13]:
# Spot-check 10 random rows after keeping first-year rows only
df_std_only.sample(10)

Unnamed: 0,ContactId,Year,PledgeTotal,VolType,VolunteerActivityCnt,BirthYear,Gender,min_year
383945,b2c055d0-11cd-e111-941f-00259073dc22,2012,520.0,Standard,1.0,0.0,female,2012
379646,2049b1d1-11cd-e111-941f-00259073dc22,2011,52.0,Standard,1.0,0.0,female,2011
381540,6495264d-9923-e211-a46f-404056a2f946,2013,52.0,Standard,1.0,1961.0,female,2013
377663,6244e8cf-11cd-e111-941f-00259073dc22,2012,0.0,Standard,1.0,0.0,female,2012
380541,630aeda8-b0fe-e211-95aa-4040184c1c1a,2013,0.0,Standard,1.0,0.0,female,2013
380293,9882620c-51e2-e211-a0e0-4040184c1c1a,2013,0.0,Standard,1.0,0.0,female,2013
383753,72ac62d1-11cd-e111-941f-00259073dc22,2012,50.0,Standard,1.0,1978.0,female,2012
377898,1389a5c2-0d5e-e411-ac49-26d4160798d6,2014,0.0,Standard,1.0,0.0,0,2014
382907,a2acac88-f825-e311-a975-4040184c1c1a,2013,0.0,Standard,1.0,0.0,female,2013
378576,e9a0ca86-5e24-e311-95aa-4040184c1c1a,2013,0.0,Standard,1.0,0.0,female,2013


In [14]:
# (rows, columns) after keeping first-year rows only
df_std_only.shape

(6147, 8)

In [15]:
# Guarantee one row per contact by keeping the first row for each ContactId
df_std_only = df_std_only.drop_duplicates(subset=['ContactId'], keep='first')

In [16]:
# (rows, columns) after de-duplicating on ContactId
df_std_only.shape

(6147, 8)

In [17]:
# Remove the 'Standard' volunteer rows from df, then keep a single row per
# ContactId. This overwrites df, so cells above no longer see the full data
# if they are re-run after this point.
# NOTE(review): drop_duplicates keeps only the first row found for each
# contact, so only one Year per contact survives for the year comparison
# further down. If a contact has rows for several years, the later ones are
# discarded here -- confirm this is intended, as it may undercount donors.
df = df.drop(df[df.VolType == 'Standard'].index).fillna(0)
df = df.drop_duplicates('ContactId')

In [18]:
# (rows, columns) of the remaining one-row-per-contact frame
df.shape

(115554, 7)

In [19]:
# Left-join the volunteer rows to the remaining rows on ContactId. Columns
# present in both frames get the default _x (volunteer side) and _y suffixes;
# volunteers without a match keep NaN in the _y columns.
new_df = df_std_only.merge(df, how='left', on='ContactId')

In [20]:
# (rows, columns) of the merged frame
new_df.shape

(6147, 14)

In [21]:
# Spot-check 5 random rows of the merged frame
new_df.sample(5)

Unnamed: 0,ContactId,Year_x,PledgeTotal_x,VolType_x,VolunteerActivityCnt_x,BirthYear_x,Gender_x,min_year,Year_y,PledgeTotal_y,VolType_y,VolunteerActivityCnt_y,BirthYear_y,Gender_y
4151,2cd755d0-11cd-e111-941f-00259073dc22,2012,52.0,Standard,1.0,0.0,female,2012,2008.0,52.0,0.0,0.0,0.0,female
146,3cb62dbd-e86b-e311-a454-4040184c1c1a,2013,0.0,Standard,1.0,0.0,female,2013,,,,,,
3067,50460dfc-6a79-e811-80ea-0050569e2a4f,2018,0.0,Standard,1.0,0.0,female,2018,,,,,,
2732,024ce8cf-11cd-e111-941f-00259073dc22,2012,52.0,Standard,1.0,1973.0,female,2012,2007.0,52.0,0.0,0.0,1973.0,female
116,b1d7f470-f662-e511-9bbb-26d4160798d6,2015,0.0,Standard,1.0,0.0,female,2015,,,,,,


In [22]:
# Keep volunteers whose matched year (Year_y) is not earlier than their first
# volunteer-row year (Year_x). Unmatched rows have NaN in Year_y and drop out
# because comparisons with NaN evaluate to False.
on_or_after_start = new_df['Year_y'] >= new_df['Year_x']
new_df_final = new_df.loc[on_or_after_start]

In [23]:
# (rows, columns) after the "on or after first year" filter
new_df_final.shape

(501, 14)

In [24]:
# Keep only matches that fall within 5 years of the first volunteer-row year.
# The mask is built from new_df_final itself: a mask taken from new_df has a
# different index, which makes pandas reindex it and emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
within_window = new_df_final['Year_y'] <= new_df_final['Year_x'] + 5
new_df_final = new_df_final[within_window]

  """Entry point for launching an IPython kernel.


In [25]:
# (rows, columns) after the 5-year window filter
new_df_final.shape

(476, 14)

In [26]:
# Spot-check 5 random rows of the final filtered frame
new_df_final.sample(5)

Unnamed: 0,ContactId,Year_x,PledgeTotal_x,VolType_x,VolunteerActivityCnt_x,BirthYear_x,Gender_x,min_year,Year_y,PledgeTotal_y,VolType_y,VolunteerActivityCnt_y,BirthYear_y,Gender_y
3707,abe96b93-54c9-e211-a0e0-4040184c1c1a,2012,0.0,Standard,1.0,0.0,female,2012,2014.0,102.76,0.0,0.0,0.0,female
2594,23cb5e0a-2680-e211-a0e0-4040184c1c1a,2012,120.0,Standard,1.0,0.0,female,2012,2013.0,120.0,0.0,0.0,0.0,female
2646,bdcf55f6-bd01-4ba5-98d2-4809c07cceab,2012,52.0,Standard,1.0,0.0,female,2012,2013.0,52.0,0.0,0.0,0.0,female
3987,11204a91-9e68-e611-9fe2-26d4160798d6,2016,250.0,Standard,1.0,1971.0,female,2016,2018.0,267.28,0.0,0.0,1971.0,female
1735,e7d235cf-11cd-e111-941f-00259073dc22,2012,0.0,Standard,1.0,0.0,female,2012,2016.0,52.0,0.0,0.0,0.0,female


In [27]:
# Share of volunteers with a qualifying match: rows left after both filters
# divided by all rows in the merged volunteer frame
len(new_df_final) / len(new_df)

0.0774361477143322

# Conclusion:

So, the probability of a volunteer making a donation within 5 years is about **7.7%**.

One caveat: a supporter may still make a donation later in 2019 or after 2019, which this analysis would not capture, so the true probability may be higher.
