In [1]:
# Remember: library imports are ALWAYS at the top of the script, no exceptions!
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

import dtale

from itertools import product
from pandas_profiling import ProfileReport


%config InlineBackend.figure_format = 'retina' # high-resolution inline figures; 'svg' is a valid alternative

# Apply seaborn's default plot styling
sns.set()


from datetime import datetime
import calendar

ModuleNotFoundError: No module named 'dtale'

In [2]:
# Location of the donors CSV. Defaults to the original author's local path;
# set the DONORS_CSV environment variable to run the notebook on another machine.
DONORS_CSV_PATH = os.environ.get("DONORS_CSV", "/Users/vsa/Documents/DM/data/donors.csv")

# low_memory=False: parse the file in one pass rather than in chunks, so each
# column gets a single inferred dtype instead of possibly mixed types.
data = pd.read_csv(DONORS_CSV_PATH, low_memory=False)

In [3]:
# Low-variance features add little information to an ML model and can be left out.
# Columns that looked low-variance in dtale:
# 'MBCOLECT', 'PUBPHOTO', 'ETH12', 'TPE5', 'TPE6', 'AFC3', 'HC15'
#
# "HC" in the notes below stands for "highly correlated".

# Columns of interest, in the order they should appear in col_df
columns_of_interest = [
    'STATE', 'ZIP', 'MAILCODE', 'TCODE', 'PVASTATE', 'DOB', 'DOMAIN',
    'HOMEOWNR', 'NUMCHLD', 'INCOME', 'GENDER',
    'WEALTH1',
    'LOCALGOV', 'STATEGOV', 'FEDGOV', 'SOLIH',
    'WEALTH2',  # HC with WEALTH1 (0.86) - could be used to fill NaNs, then drop one of the two
    'VETERANS', 'AGE901',
    # 'HHP1' left out: HC with RHP3 (0.97)
    'HV1', 'HV3',
    'RHP1', 'RHP3', 'RHP4',
    'MSA', 'DMA',
    'IC1', 'IC2',
    'IC15', 'IC16', 'IC17', 'IC18', 'IC19', 'IC20', 'IC21', 'IC22', 'IC23',
    # 'OEDC1' left out: HC with LOCALGOV (0.82)
    # 'OEDC2' left out: HC with STATEGOV (0.85)
    # 'OEDC3' left out: HC with FEDGOV (0.86)
    'OEDC4', 'OEDC5', 'OEDC6', 'OEDC7',
    'EC1', 'AC1', 'AC2',
    'RFA_2R', 'RFA_2F', 'RFA_2A',
    'RFA_3',  # needs to be split into 3 columns like RFA_2; has additional info about status from 2016
    'MDMAUD_R', 'MDMAUD_F', 'MDMAUD_A',
    'CONTROLN',
    'CARDPROM',
    'NUMPROM',  # HC with CARDPROM (0.95)
    'RAMNTALL',
    'NGIFTALL',  # HC with CARDGIFT; the count of all gifts is not needed when exploring card response
    'CARDGIFT', 'LASTGIFT', 'AVGGIFT',
    'CARDPM12', 'NUMPRM12',
    'PEPSTRFL',
]

col_df = data[columns_of_interest]

In [4]:
# Independent working copy of the first 15,000 rows (all columns kept)
df = data.head(15000).copy()

In [5]:
# Dimensions of the working sample `df` as (number of rows, number of columns)
df.shape

(15000, 476)

In [6]:
# True when at least one row of the full dataset is an exact duplicate of an earlier row
bool(data.duplicated().any())

False

In [7]:
# Number of rows whose DOB is missing
int(data["DOB"].isna().sum())

23883

In [8]:

# df.DOB  = df.DOB.map(lambda x: datetime.strptime(x, '%Y-%m-%d'))
# df['YOB']  = df.DOB.map(lambda x: x.year)

## Coherence Checks
The date of a donor's first gift to PVA (`ODATEDW`)
cannot be earlier than the donor's date of birth (`DOB`).

In [14]:
# Number of rows where ODATEDW is on or before DOB
odatedw_not_after_dob = data["ODATEDW"] <= data["DOB"]
int(odatedw_not_after_dob.sum())

318

In [12]:

# AGE RELATED
# Year against which ages are computed.
REFERENCE_YEAR = 2020

# Parse DOB once; missing dates of birth become NaT.
data["DOB"] = pd.to_datetime(data["DOB"])

# Age in years, kept as float so that rows without a DOB are NaN.
# The subtraction already yields NaN for NaT, so there is no cast of NaN to
# int16 (an undefined result in NumPy) and no need to treat 0 as a
# "missing" sentinel, which would also erase genuine ages of 0.
data["AGE"] = (REFERENCE_YEAR - data["DOB"].dt.year).astype("float64")

In [13]:
# Lookup table from donor title code (TCODE, as a string) to the title text.
# Keys are strings; the "0" code maps to the placeholder "_".
dictTCODE = {
    "0": "_",
    "1": "MR.",
    "1001": "MESSRS.",
    "1002": "MR. & MRS.",
    "2": "MRS.",
    "2002": "MESDAMES",
    "3": "MISS",
    "3003": "MISSES",
    "4": "DR.",
    "4002": "DR. & MRS.",
    "4004": "DOCTORS",
    "5": "MADAME",
    "6": "SERGEANT",
    "9": "RABBI",
    "10": "PROFESSOR",
    "10002": "PROFESSOR & MRS.",
    "10010": "PROFESSORS",
    "11": "ADMIRAL",
    "11002": "ADMIRAL & MRS.",
    "12": "GENERAL",
    "12002": "GENERAL & MRS.",
    "13": "COLONEL",
    "13002": "COLONEL & MRS.",
    "14": "CAPTAIN",
    "14002": "CAPTAIN & MRS.",
    "15": "COMMANDER",
    "15002": "COMMANDER & MRS.",
    "16": "DEAN",
    "17": "JUDGE",
    "17002": "JUDGE & MRS.",
    "18": "MAJOR",
    "18002": "MAJOR & MRS.",
    "19": "SENATOR",
    "20": "GOVERNOR",
    "21002": "SERGEANT & MRS.",
    "22002": "COLONEL & MRS.",  # the misspelling of COLONEL was corrected here
    "24002": "LIEUTENANT & MRS.",  # XX002 codes are used for "<title> & MRS."
    "24": "LIEUTENANT",
    "26": "MONSIGNOR",
    "27": "REVEREND",
    "28": "MS.",
    "28028": "MSS.",
    "29": "BISHOP",
    "31": "AMBASSADOR",
    "31002": "AMBASSADOR & MRS.",
    "33": "CANTOR",
    "36": "BROTHER",
    "37": "SIR",
    "38": "COMMODORE",
    "40": "FATHER",
    "42": "SISTER",
    "43": "PRESIDENT",
    "44": "MASTER",
    "46": "MOTHER",
    "47": "CHAPLAIN",
    "48": "CORPORAL",
    "50": "ELDER",
    "56": "MAYOR",
    "59002": "LIEUTENANT & MRS.",
    "62": "LORD",
    "63": "CARDINAL",
    "64": "FRIEND",
    "65": "FRIENDS",
    "68": "ARCHDEACON",
    "69": "CANON",
    "70": "BISHOP",
    "72002": "REVEREND & MRS.",
    "73": "PASTOR",
    "75": "ARCHBISHOP",
    "85": "SPECIALIST",
    "87": "PRIVATE",
    "89": "SEAMAN",
    "90": "AIRMAN",
    "91": "JUSTICE",
    "92": "MR. JUSTICE",
    "100": "M.",
    "103": "MLLE.",
    "104": "CHANCELLOR",
    "106": "REPRESENTATIVE",
    "107": "SECRETARY",
    "108": "LT. GOVERNOR",
    "109": "LIC.",
    "111": "SA.",
    "114": "DA.",
    "116": "MR.",  # changed from SR. to MR.
    "117": "MRS.",  # changed from SRA. to MRS.
    "118": "MISS",  # changed from SRTA. to MISS
    "120": "YOUR MAJESTY",
    "122": "HIS HIGHNESS",
    "123": "HER HIGHNESS",
    "124": "COUNT",
    "125": "LADY",
    "126": "PRINCE",
    "127": "PRINCESS",
    "128": "CHIEF",
    "129": "BARON",
    "130": "SHEIK",
    "131": "PRINCE AND PRINCESS",
    "132": "YOUR IMPERIAL MAJEST",
    "135": "M. ET MME.",
    "210": "PROF.",
    # Author's assumption: since 72002 is REVEREND & MRS., codes 072 and 027 were
    # probably both used for REVEREND because of a systematic typing mistake
    # that was later changed in the system.
    "72": "REVEREND",
    "22": "COLONEL & MRS.",  # treated as a mistyped code
}

In [14]:
# Turn the title codes into strings so they can be matched against the string keys of dictTCODE
data["TCODE"] = data["TCODE"].map(str)

# Title text for each donor; codes that are not keys of dictTCODE are left as they are
data["cat_TCODE"] = data["TCODE"].replace(dictTCODE)

CHECKING FOR INCONSISTENCIES IN TCODE
- The full summary of the findings is written up in the accompanying document.

In [28]:
# CHECKING FOR INCONSISTENCIES IN TCODE
# Each entry is (message to print, GENDER value, title that contradicts that gender).
gender_title_checks = [
    ('Number of Men(GENDER M) with title MRS.(Mrs - womans title of married woman)', 'M', 'MRS.'),
    ('Number of men with title MISS (title of young or unmarried woman)', 'M', 'MISS'),
    ('Number of men with title MS. (title of married or not married woman)', 'M', 'MS.'),
    ('Number of women with title MR. (stays for mister, mans title)', 'F', 'MR.'),
    ('Number of men with title Sister', 'M', 'SISTER'),
    ('Number of women with title Brother', 'F', 'BROTHER'),
    # women (F) with the code 'MR. & MRS.', which implies that they should be men
    ('Number of women with title MR. & MRS., that implies that they should be men', 'F', 'MR. & MRS.'),
]

for message, gender, title in gender_title_checks:
    # Rows where the recorded gender and the decoded title disagree
    mismatch = (data.GENDER == gender) & (data.cat_TCODE == title)
    print(message, int(mismatch.sum()))

Number of Men(GENDER M) with title MRS.(Mrs - womans title of married woman) 3412
Number of men with title MISS (title of young or unmarried woman) 34
Number of men with title MS. (title of married or not married woman) 216
Number of women with title MR. (stays for mister, mans title) 255
Number of men with title Sister 1
Number of women with title Brother 0
Number of women with title MR. & MRS., that implies that they should be men 46


In [21]:
# Donors that have a real title (anything but the '_' placeholder) while GENDER is an empty string.
# NOTE(review): pandas.read_csv turns empty fields into NaN by default, so `== ''` may match
# no rows - confirm how a missing GENDER is encoded in the CSV.
has_title = data["cat_TCODE"] != '_'
gender_blank = data["GENDER"] == ''
p = data.loc[has_title & gender_blank, ['GENDER', 'AGE', 'cat_TCODE']]

30710

In [20]:
# Open the full dataset in D-Tale for interactive exploration.
# Requires the `dtale` package; the import cell fails with ModuleNotFoundError when it is not installed.
dtale.show(data)

