In [1]:
# Remember: library imports are ALWAYS at the top of the script, no exceptions!
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
import warnings
warnings.filterwarnings('ignore')
import dtale

from itertools import product
from pandas_profiling import ProfileReport


%config InlineBackend.figure_format = 'retina' # optionally, you can change 'svg' to 'retina'

# Seeting seaborn style
sns.set()


from datetime import datetime
import calendar

In [67]:
data = pd.read_csv ("/Users/vsa/Documents/DM/data/donors.csv", low_memory=False)

In [54]:
#data = alldata[['DOB', 'ODATEDW', 'FISTDATE', 'LASTDATE', 'ADATE_2', 'TCODE',
#               'GENDER', 'NGIFTALL']]

## Calculating age, recency etc

In [68]:
# AGE
# Here we calculate age as it was discussed during Q&A session:
# Calculate age on the moment of ADATE_2, paying attention that ADATE_2 has 2 different values in a column:
# 2017-06-01 (95399 instances), 2017-04-01 (13 instances)

data.DOB = pd.to_datetime(data.DOB)
data.ADATE_2 = pd.to_datetime(data.ADATE_2)
data["AGE"] = (data.ADATE_2 - data.DOB).astype('timedelta64[Y]')    #.astype('int')

data.AGE

0        59.0
1        45.0
2         NaN
3        69.0
4        77.0
         ... 
95407     NaN
95408    47.0
95409    59.0
95410    57.0
95411    79.0
Name: AGE, Length: 95412, dtype: float64

In [35]:
# Next I will create and fill out column 'RECENCY' based on information from metadata file
# to find inconsistencies as per forum all donors should be lapsed

# F=FIRST TIME DONOR Anyone who has made their first donation in the last 6 months and has
# made just one donation.
    
# N=NEW DONOR Anyone who has made their first donation in the last 12 months and is not a First time donor. 
# This is everyone who made their first donation 7-12 months ago, or
# people who made their first donation between 0-6 months ago and have made 2 or more donations.
    
# A=ACTIVE DONOR Anyone who made their first donation more than 12 months ago and has made
# a donation in the last 12 months.
    
# L=LAPSING DONOR A previous donor who made their last donation between 13-24 months ago.
    
# I=INACTIVE DONOR A previous donor who has not made a donation in the last 24 months. 
# It is people who made a donation 25+ months ago.
    
# S=STAR DONOR STAR Donors are individuals who have given to 3 consecutive card mailings.


Let's find out all these groups of donors and create 'RECENCY' column

In [69]:
# Calculate amount of months from the last day LASTDATE (Date associated with the most recent gift)
# and ADATE_2 - Date the latest promotion was sent 
# (all the timelags between dates are calculating on the moment of ADATE_2 as was discussed during Q&A)

# From metadafata file - ODATEDW and FISTDATE have the same meaning:
# ODATEDW Date of donor's first gift to PVA YYMM format (Year/Month) and FISTDATE Date of first gift

data.LASTDATE = pd.to_datetime(data.LASTDATE)
data ['LASTDATE_MONTHS'] = (data.ADATE_2 - data.LASTDATE).astype('timedelta64[M]') 

# Convert ODATEDW to datetime
data.ODATEDW = pd.to_datetime(data.ODATEDW)
data ['ODATEDW_MONTHS'] = (data.ADATE_2 - data.ODATEDW).astype('timedelta64[M]')

# Convert FISTDATE to datetime
data.FISTDATE = pd.to_datetime(data.FISTDATE)
data ['FIRSTDATE_MONTHS'] = (data.ADATE_2 - data.FISTDATE).astype('timedelta64[M]')

In [70]:
# Checking for inconsistencies between LASTDATE_MONTHS and FIRSTDATE_MONTHS
inconsist = (len(data [data ['LASTDATE_MONTHS'] > data['FIRSTDATE_MONTHS']]))
print('Number of inconsistencies between LASTDATE_MONTHS and FIRSTDATE_MONTHS:', inconsist)

Number of inconsistencies between LASTDATE_MONTHS and FIRSTDATE_MONTHS: 0


In [71]:
# Checking for inconsistencies between LASTDATE_MONTHS and ODATEDW_MONTHS
inconsist = (len(data [data ['LASTDATE_MONTHS'] > data['ODATEDW_MONTHS']]))
print('Number of inconsistencies between LASTDATE_MONTHS and ODATEDW_MONTHS:', inconsist)
print('Percentage of inconsistencies between LASTDATE_MONTHS and ODATEDW_MONTHS:', inconsist/len(data)*100)

# The percentage of inconsistencies in ODATEDW_MONTHS is quite high - 3.8 %
# So we have two columns with the same meaning:
# ODATEDW (Date of donor's first gift to PVA YYMM format (Year/Month) and FISTDATE (Date of first gift)

# and one of them (ODATEDW) is inconsistent with others - I suggest to not use this column


Number of inconsistencies between LASTDATE_MONTHS and ODATEDW_MONTHS: 3627
Percentage of inconsistencies between LASTDATE_MONTHS and ODATEDW_MONTHS: 3.8014086278455537


#### Create column with the Recency, using an abreviation from metadata file (F, N, A, L, I donors)

From the Project Forum:
- You can assume RFA_2R is the correct one since the donors in the dataset are supposed to be lapsed ones. The lapsed donors in this dataset are determined according to the date the last promotion (17NK) was emailed to each one of them and the date of their most recent gift. Inconsistent observations are the ones which have an interval between these two dates smaller than 13 months
- This dataset is supposed to be composed only by lapsed donors. Maybe we weren't very explicit on this but PVA is particularly interested on this segment of customers, hence the data being only from this segment. The observations you found are actually inconsistencies. You should deal with them somehow and provide an explanation on how you did it.
https://elearning.novaims.unl.pt/moodle/mod/forum/discuss.php?d=6304


In [72]:
data['RECENCY']= ''
# Print Lapsed donours (that made their last donation to PVA 13 to 24 months ago )

print('Number of lapsed donors donors:')
print(len(data [(data ['LASTDATE_MONTHS'] >=13.0)& (data ['LASTDATE_MONTHS'] <= 24.0)]))
data.loc[(data['LASTDATE_MONTHS'] >= 13.0)& (data['LASTDATE_MONTHS'] <= 24.0), 'RECENCY'] = 'L'

Number of lapsed donors donors:
84132


In [73]:
#  Print number of FIRST TIME DONOR donours (F=FIRST TIME DONOR Anyone who has made their first donation 
# in the last 6 months and has and has made just one donation.)

print('Number of first time donors donors:')
print(len(data [(data['FIRSTDATE_MONTHS'] <= 6.0) & ((data['NGIFTALL'] == 1))]))
data.loc[(data['FIRSTDATE_MONTHS'] <= 6.0) & (data['NGIFTALL'] == 1), 'RECENCY'] = 'F'

Number of first time donors donors:
0


In [74]:
# N = NEW DONOR Anyone who has made their first donation in the last 12 months and is not a First time donor.  
# This is everyone who made their first donation 7-12 months ago, or
# people who made their first donation between 0-6 months ago and have made 2 or more donations.

print('Number of new donors:')
print(len(data.loc[(data['FIRSTDATE_MONTHS'] <= 12.0) & (data['RECENCY'] != 'F') ]))

data.loc[(data['FIRSTDATE_MONTHS'] <= 12.0) & (data['RECENCY'] != 'F') ] = 'N'


Number of new donors:
0


In [75]:
# I=INACTIVE DONOR A previous donor who has not made a donation in the last 24 months.  
# It is people who made a donation 25+ months ago.
print('Number of inactive donors:')
print(len(data.loc[(data['LASTDATE_MONTHS'] >= 25.0)&(data['FIRSTDATE_MONTHS'] >= 25.0)]))

data.loc[(data['LASTDATE_MONTHS'] >= 25.0)&(data['FIRSTDATE_MONTHS'] >= 25.0), 'RECENCY'] = 'I'

Number of inactive donors:
7315


In [63]:
# A=ACTIVE DONOR Anyone who made their first donation more than 12 months ago 
# and has made a donation in the last 12 months.

print('Number of active donors:')
print(len(data.loc[(data['FIRSTDATE_MONTHS'] >= 12.0) & (data['LASTDATE_MONTHS'] < 12.0)]))

data.loc[(data['FIRSTDATE_MONTHS'] >= 12.0) & (data['LASTDATE_MONTHS'] < 12.0), 'RECENCY'] = 'A'

Number of active donors:
3963


In [76]:
# There are 2 rows left without RECENCY label - after checking FISTDATE,
# we see that they correspond to 2 NaNs values in FISTDATE column
# We will need impute them or drop

data.loc[data.RECENCY == '']

Unnamed: 0.1,Unnamed: 0,ODATEDW,OSOURCE,TCODE,STATE,ZIP,MAILCODE,PVASTATE,DOB,NOEXCH,...,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,GEOCODE2,AGE,LASTDATE_MONTHS,ODATEDW_MONTHS,FIRSTDATE_MONTHS,RECENCY
13,13,2012-01-01 00:00:00,HCC,1,LA,70791,,,NaT,0,...,G,X,X,X,B,,5,64,55,
44,44,2012-01-01 00:00:00,IMA,0,WI,53221,,,1966-11-01 00:00:00,0,...,F,X,X,X,B,50,3,64,56,
50,50,2008-01-01 00:00:00,DNA,0,IL,61273,,,1934-11-01 00:00:00,0,...,F,X,X,X,B,82,3,112,103,
66,66,2012-01-01 00:00:00,L01,1,FL,34231,,,1938-05-01 00:00:00,0,...,G,X,X,X,B,79,4,64,56,
74,74,2008-01-01 00:00:00,APP,0,CA,92163,,,1938-01-01 00:00:00,0,...,E,X,X,X,A,79,4,112,104,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95372,95372,2014-01-01 00:00:00,ESN,0,FL,33418,,P,1940-01-01 00:00:00,0,...,G,X,X,X,B,77,3,40,40,
95396,95396,2011-01-01 00:00:00,PBL,0,CA,94305,,,1934-01-01 00:00:00,0,...,G,L,2,C,A,83,9,76,76,
95409,95409,2015-01-01 00:00:00,MBC,1,MI,48910,,,1958-01-01 00:00:00,0,...,E,X,X,X,B,59,7,28,32,
95410,95410,2006-01-01 00:00:00,PRV,0,CA,91320,,,1960-05-01 00:00:00,0,...,F,X,X,X,A,57,4,136,125,


In [77]:
# As we have columns with the same purpose we can impute these NaNs in FISTDATE using ODATEDW
data.loc[(data.RECENCY == ''), 'FISTDATE'] = data.ODATEDW
data.loc[(data.RECENCY == ''), 'FIRSTDATE_MONTHS'] = data.ODATEDW_MONTHS

# And fill out their Recency
data.loc[(data['FIRSTDATE_MONTHS'] >= 12.0) & (data['LASTDATE_MONTHS'] < 12.0), 'RECENCY'] = 'A'

# Drop ODATEDW as not needed anymore
#data.drop(columns = ['ODATEDW', 'ODATEDW_MONTHS'])

Despite fact that we should have only Lapsed donors in our dataset
we have only 84132 lapsed, inactive - 7315  and active 3965.

So we have 11.6 % of inconsistancies - donors that should be lapsed because of RFA_2
but are inactive or active

In [30]:
(95412 - 84312)/95412

0.11633756760155956

In [78]:
dtale.show(data.loc[data.RECENCY != 'L'])



#### Checking for incontsistancies between date of first gift of a donor and his/her birhday date

In [31]:
# Date of donor's first gift to PVA
# cannot be earlier than his/her Date of Birthday (DOB)

# From metadafata file - ODATEDW and FISTDATE have the same meaning:
# ODATEDW Date of donor's first gift to PVA YYMM format (Year/Month) and FISTDATE Date of first gift

# I use strict '<' not '<=' , thinking that some parents in the day of birth of his child could do first donation
# from this child's name to PVA
print ('Number of inconsistencies between ODATEDW and DOB', len(data[data.ODATEDW < data.DOB]))

# As we see ODATEDW is not consistent with DOB

Number of inconsistencies between ODATEDW and DOB 304


In [32]:
# Same checking for FISTDATE
print ('Number of inconsistencies between FISTDATE and DOB', len(data[data.FISTDATE < data.DOB]))
print ('Percentage of inconsistencies between FISTDATE and DOB', 
       round(len(data[data.FISTDATE < data.DOB])/len(data)*100, 2))
# With such small percent - I suggest to drop these rows
# and drop column ODATEDW

Number of inconsistencies between FISTDATE and DOB 278
Percentage of inconsistencies between FISTDATE and DOB 0.29


##### CHECKING FOR INCONSISTENCIES IN TCODE

In [18]:
dictTCODE = {
    "0" : "_",
    "1" : "MR.",
    "1001" : "MESSRS.",
    "1002" : "MR. & MRS.",
    "2" : "MRS.",
    "2002" : "MESDAMES",
    "3" : "MISS",
    "3003" : "MISSES",
    "4" : "DR.",
    "4002" : "DR. & MRS.",
    "4004" : "DOCTORS",
    "5" : "MADAME",
    "6" : "SERGEANT",
    "9" : "RABBI",
    "10" : "PROFESSOR",
    "10002" : "PROFESSOR & MRS.",
    "10010" : "PROFESSORS",
    "11" : "ADMIRAL",
    "11002" : "ADMIRAL & MRS.",
    "12" : "GENERAL",
    "12002" : "GENERAL & MRS.",
    "13" : "COLONEL",
    "13002" : "COLONEL & MRS.",
    "14" : "CAPTAIN",
    "14002" : "CAPTAIN & MRS.",
    "15" : "COMMANDER",
    "15002" : "COMMANDER & MRS.",
    "16" : "DEAN",
    "17" : "JUDGE",
    "17002" : "JUDGE & MRS.",
    "18" : "MAJOR",
    "18002" : "MAJOR & MRS.",
    "19" : "SENATOR",
    "20" : "GOVERNOR",
    "21002" : "SERGEANT & MRS.",
    "22002" : "COLONEL & MRS.", #mistype in COLONEL, was changed here
    "24002" : "LIEUTENANT & MRS.", #XX002 code is used for someone & MRS.
    "24" : "LIEUTENANT",
    "26" : "MONSIGNOR",
    "27" : "REVEREND",
    "28" : "MS.",
    "28028" : "MSS.",
    "29" : "BISHOP",
    "31" : "AMBASSADOR",
    "31002" : "AMBASSADOR & MRS.",
    "33" : "CANTOR",
    "36" : "BROTHER",
    "37" : "SIR",
    "38" : "COMMODORE",
    "40" : "FATHER",
    "42" : "SISTER",
    "43" : "PRESIDENT",
    "44" : "MASTER",
    "46" : "MOTHER",
    "47" : "CHAPLAIN",
    "48" : "CORPORAL",
    "50" : "ELDER",
    "56" : "MAYOR",
    "59002" : "LIEUTENANT & MRS.",
    "62" : "LORD",
    "63" : "CARDINAL",
    "64" : "FRIEND",
    "65" : "FRIENDS",
    "68" : "ARCHDEACON",
    "69" : "CANON",
    "70" : "BISHOP",
    "72002" : "REVEREND & MRS.",
    "73" : "PASTOR",
    "75" : "ARCHBISHOP",
    "85" : "SPECIALIST",
    "87" : "PRIVATE",
    "89" : "SEAMAN", 
    "90" : "AIRMAN",
    "91" : "JUSTICE",
    "92" : "MR. JUSTICE",
    "100" : "M.",
    "103" : "MLLE.",
    "104" : "CHANCELLOR",
    "106" : "REPRESENTATIVE",
    "107" : "SECRETARY",
    "108" : "LT. GOVERNOR",
    "109" : "LIC.",
    "111" : "SA.",
    "114" : "DA.",
    "116" : "MR.", #Changed SR ro MR
    "117" : "MRS.", #Changed SRA ro MRS
    "118" : "MISS", #Cnanged SRTA. to MISS
    "120" : "YOUR MAJESTY",
    "122" : "HIS HIGHNESS",
    "123" : "HER HIGHNESS",
    "124" : "COUNT",
    "125" : "LADY",
    "126" : "PRINCE",
    "127" : "PRINCESS",
    "128" : "CHIEF",
    "129" : "BARON",
    "130" : "SHEIK",
    "131" : "PRINCE AND PRINCESS",
    "132" : "YOUR IMPERIAL MAJEST",
    "135" : "M. ET MME.",
    "210" : "PROF.",
    '72': 'REVEREND', #because code 72002 REVEREND AND MRS, I think it was 072 and 027 for REVEREND 
    #because some systematic mistake (mistype), and in the end someone changed it in system
    '22': "COLONEL & MRS.", #just a mistype,
    
}

In [19]:
data['TCODE'] = data['TCODE'].apply(str)

data["cat_TCODE"] = data.TCODE.replace(to_replace=dictTCODE)

CHECKING FOR INCONSISTENCIES IN TCODE
- wrote down in doc all summary

In [20]:
# CHECKING FOR INCONSISTENCIES IN TCODE
print('Number of Men(GENDER M) with title MRS.(Mrs - womans title of married woman)', 
      len(data[(data.GENDER == 'M')&(data.cat_TCODE == 'MRS.')]))
print('Number of men with title MISS (title of young or unmarried woman)', 
      len(data[(data.GENDER == 'M')&(data.cat_TCODE == 'MISS')]))
print('Number of men with title MS. (title of married or not married woman)', 
      len(data[(data.GENDER == 'M')&(data.cat_TCODE == 'MS.')]))
print('Number of women with title MR. (stays for mister, mans title)', 
      len(data[(data.GENDER == 'F')&(data.cat_TCODE == 'MR.')]))
print('Number of men with title Sister', 
      len(data[(data.GENDER == 'M')&(data.cat_TCODE == 'SISTER')]))
print('Number of women with title Brother', 
      len(data[(data.GENDER == 'F')&(data.cat_TCODE == 'BROTHER')]))
#women (F) that has code 'MR. & MRS.' that implies that they should be me
print('Number of women with title MR. & MRS., that implies that they should be men',
      len(data[(data.GENDER == 'F')&(data.cat_TCODE == 'MR. & MRS.')]))

Number of Men(GENDER M) with title MRS.(Mrs - womans title of married woman) 3412
Number of men with title MISS (title of young or unmarried woman) 34
Number of men with title MS. (title of married or not married woman) 216
Number of women with title MR. (stays for mister, mans title) 255
Number of men with title Sister 1
Number of women with title Brother 0
Number of women with title MR. & MRS., that implies that they should be men 46
