In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas_profiling import ProfileReport

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # high-resolution raster output; optionally, you can change 'retina' to 'svg'


In [None]:
# Load the raw donors table; the path is relative to the notebook's working directory.
donors = pd.read_csv(filepath_or_buffer='data/donors.csv')

In [None]:
# Only the last expression of a cell is rendered automatically, so the preview
# is shown explicitly (display is provided by the IPython kernel).
display(donors.head())

# Create the output folder up front so the report can be written on a fresh checkout.
Path('explorations').mkdir(parents=True, exist_ok=True)

# Build an HTML profile of the raw data. minimal=True switches off the more
# expensive computations of the profiler.
profile = ProfileReport(
    donors,
    title='Donors Raw',
    minimal=True
)
profile.to_file('explorations/profile_donorsRaw.html')

In [None]:
# Independent copy of the columns at positions 9 and 412.
# NOTE(review): presumably the positions flagged for mixed dtypes when the CSV was read — confirm.
mixed_dtypes = donors.iloc[:, [9, 412]].copy(deep=True)

# Mid-cell expressions are not rendered automatically, so each result is displayed explicitly.
display(mixed_dtypes.dtypes)

# Iterate over the actual column labels instead of hard-coding attribute names,
# so the cell works whatever the two selected columns are called.
for column_name in mixed_dtypes.columns:
    display(mixed_dtypes[column_name].value_counts())

In [None]:
# Number of missing MAILCODE entries (displayed explicitly: only the last
# expression of a cell is rendered automatically).
display(donors.MAILCODE.isna().sum())

# First value by position; .iloc avoids depending on a row labelled 0 in the index.
display(donors.MAILCODE.iloc[0])

# All distinct MAILCODE values, missing values included.
donors.MAILCODE.unique()

In [None]:
# Independent (deep) copy of the first 20 columns, then a summary of their
# dtypes and non-null counts.
test = donors.iloc[:, :20].copy()
test.info()

In [None]:
# Number of distinct values in every column. dropna=False counts a missing
# value as one distinct value, matching len(column.unique()).
uniq_vals_per_col = donors.nunique(dropna=False)
uniq_vals_per_col

In [None]:
# Columns holding at most one distinct value, i.e. constant columns.
is_constant = uniq_vals_per_col.le(1)
uniq_vals_per_col.loc[is_constant]

In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple.
donors.shape

In [None]:
# Frequency of each distinct RECINHSE value (missing values are left out by default).
donors['RECINHSE'].value_counts()

In [None]:
# How many columns there are of each dtype.
dtype_counts = donors.dtypes.value_counts()
dtype_counts


In [None]:
# Restrict the frame to the columns stored with the generic 'object' dtype.
is_object_col = donors.dtypes == 'object'
donors.loc[:, is_object_col]

In [None]:
# Inspect the raw MDMAUD column.
donors['MDMAUD']

In [None]:
# value_counts() orders its result by frequency, so plotting it directly joins
# the points in frequency order and the line zig-zags across the x-axis.
# Sorting by the MALEMILI value gives a readable left-to-right curve.
malemili_counts = donors.MALEMILI.value_counts().sort_index()

fig, ax = plt.subplots()
ax.plot(malemili_counts.index, malemili_counts.values)
ax.set(
    title='Frequency of MALEMILI values',
    xlabel='MALEMILI',
    ylabel='Number of rows',
)
plt.show()

In [None]:
# Frequency of each distinct MALEVET value (missing values are left out by default).
donors['MALEVET'].value_counts()

In [None]:
# Frequency of each GENDER value, with missing values reported as their own bucket.
donors['GENDER'].value_counts(dropna=False)

In [None]:
# Labels of every column whose dtype is not 'object'; used below as the set of
# metric features.
is_not_object = (donors.dtypes != 'object').to_numpy()
metric_features = donors.columns[is_not_object]
metric_features

In [None]:
# Create the output folder up front so the figure can be saved on a fresh checkout.
Path('explorations').mkdir(parents=True, exist_ok=True)

# Prepare figure with an explicit Axes, so the heatmap is drawn on this figure
# rather than on whichever Axes pyplot currently considers active.
fig, ax = plt.subplots(figsize=(10, 8))

# Pairwise Pearson correlation of the metric features, rounded to 2 decimals.
corr = donors[metric_features].corr(method="pearson").round(decimals=2)

# Build annotation matrix (values at or above |0.5| will appear annotated in the plot).
# np.where keeps the correlation value where the mask is True and an empty
# string elsewhere. Mixing floats with strings yields a string array, which is
# why the heatmap below is drawn with fmt='s'.
mask_annot = np.absolute(corr.values) >= 0.5
annot = np.where(mask_annot, corr.values, np.full(corr.shape, ""))

# Plot heatmap of the correlation matrix
sns.heatmap(data=corr, annot=annot, cmap=sns.diverging_palette(220, 10, as_cmap=True),
            fmt='s', vmin=-1, vmax=1, center=0, square=True, linewidths=.5, ax=ax)

# Layout
fig.subplots_adjust(top=0.95)
fig.suptitle("Correlation Matrix", fontsize=20)

fig.savefig('explorations/corrMat_donorsRaw', dpi=200)

plt.show()

In [None]:
# List the feature pairs whose correlation exceeds 0.9 instead of displaying the
# full boolean matrix. Only the strict upper triangle is kept, so each pair
# appears once and the diagonal (a feature with itself) is excluded.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
high_corr_pairs = corr.where(upper_triangle).stack()
high_corr_pairs[high_corr_pairs > 0.9].sort_values(ascending=False)

In [None]:
# Column labels containing 'rfa', matched case-insensitively.
has_rfa = donors.columns.str.contains('rfa', case=False)
donors.columns[has_rfa]

In [None]:
# Independent copy of the columns needed to check whether RFA_2 is just the
# concatenation of its three component columns.
rfa_2_parts = ['RFA_2R', 'RFA_2F', 'RFA_2A']
check_rfa = donors[['MDMAUD', 'RFA_2'] + rfa_2_parts].copy(deep=True)

# Cast the components to str once and concatenate them column-wise. This is the
# vectorized equivalent of joining the values row by row with apply(axis=1).
parts_as_str = check_rfa[rfa_2_parts].astype(str)
check_rfa['RFA_2_build'] = (
    parts_as_str['RFA_2R'] + parts_as_str['RFA_2F'] + parts_as_str['RFA_2A']
)


In [None]:
# True only if the rebuilt code equals the stored RFA_2 on every row.
check_rfa['RFA_2'].eq(check_rfa['RFA_2_build']).all()

In [None]:
# Show the comparison frame, including the rebuilt RFA_2_build column.
check_rfa

In [None]:
# Frequency of each distinct MDMAUD_A value (missing values are left out by default).
donors['MDMAUD_A'].value_counts()

In [None]:
# Frequency of each TCODE value, with missing values reported as their own bucket.
donors['TCODE'].value_counts(dropna=False)

In [None]:
# Number of rows where WEALTH1 and WEALTH2 differ. Element-wise, a missing value
# never compares equal to anything, so rows missing in both columns are counted
# as differing too.
donors['WEALTH1'].ne(donors['WEALTH2']).sum()

In [None]:
# Rows where the two wealth columns differ, restricted to those two columns.
wealth_differs = donors['WEALTH1'] != donors['WEALTH2']
donors.loc[wealth_differs, ['WEALTH1', 'WEALTH2']]

In [None]:
# Share of rows in which both WEALTH1 and WEALTH2 are missing.
both_missing = donors['WEALTH1'].isna() & donors['WEALTH2'].isna()
both_missing.sum() / len(donors)

In [None]:
# Running list of columns earmarked for removal; each entry notes the reason.
cols_to_drop = []
cols_to_drop.append('TCODE')  # duplicate with gender
cols_to_drop
