In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from datetime import datetime
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from itertools import product
from pandas_profiling import ProfileReport
import pickle
import os

In [None]:
# Load the raw donors dataset; 'donors.csv' is resolved relative to the current working directory.
donors = pd.read_csv('donors.csv')

In [None]:
# Print a structural summary of the raw frame (pandas DataFrame.info).
donors.info()

In [None]:
# Work on a copy so that the raw `donors` frame is left untouched by later cells.
data = donors.copy()

In [None]:
# Spearman rank correlation for pairs of columns that appear to describe the same quantity.
# Where the correlation is high, one column of the pair is redundant and can be dropped.
suspected_duplicate_pairs = [
    ('MALEMILI', 'AFC2'),
    ('MALEVET', 'AFC5'),
    ('VIETVETS', 'VC1'),
    ('WWIIVETS', 'VC3'),
    ('LOCALGOV', 'OEDC1'),
    ('STATEGOV', 'OEDC2'),
    ('FEDGOV', 'OEDC3'),
]
for first_col, second_col in suspected_duplicate_pairs:
    print(spearmanr(data[first_col], data[second_col]))

In [None]:
# Pairwise Spearman rank correlations between the three POP90x columns.
population_pairs = [('POP901', 'POP902'), ('POP901', 'POP903'), ('POP902', 'POP903')]
for first_col, second_col in population_pairs:
    print(spearmanr(data[first_col], data[second_col]))

In [None]:
# Sub-frame holding only the numeric (np.number) columns of `data`.
metric_data = data.select_dtypes(include=np.number)

In [None]:
# Column-name lists: numeric columns, and every remaining column of `data`.
metric_features = data.select_dtypes(include=np.number).columns.tolist()
non_metric_features = data.columns.drop(metric_features).to_list()

In [None]:
# How many columns fall into each of the two lists.
print(len(metric_features))
print(len(non_metric_features))

## Neighbourhood data

In [None]:
# Column names that delimit the two neighbourhood column ranges, plus the CONTROLN column.
col_name_1 = "POP901"
col_name_2 = "ADATE_2"
col_name_3 = "MALEMILI"
col_name_4 = "SOLP3"
col_name_5 = "CONTROLN"
  
# Positional index of each of those columns in `data`.
index_no_1 = data.columns.get_loc(col_name_1)
index_no_2 = data.columns.get_loc(col_name_2)
index_no_3 = data.columns.get_loc(col_name_3)
index_no_4 = data.columns.get_loc(col_name_4)
index_no_5 = data.columns.get_loc(col_name_5)

# Neighbourhood data comes from two positional column ranges. iloc slices exclude the stop
# position, so ADATE_2 and SOLP3 themselves are not part of the ranges.
nbh_data = data.iloc[:, index_no_1:index_no_2]
nbh_data_2 = data.iloc[:,index_no_3:index_no_4]
# Single CONTROLN column (a Series), joined onto the feature frames further down.
df_control = data.iloc[:,index_no_5]

# Combine both ranges (joined on the row index), then append CONTROLN.
df_neighbours = nbh_data.join(nbh_data_2)
ndf = df_neighbours.join(df_control)

In [None]:
ndf.drop(columns=['MSA','ADI','DMA'],inplace=True)

In [None]:
#nbh_corrMatrix = ndf.corr()

In [None]:
#list of metric features 
nbh_metric_features = ndf.columns.to_list()

In [None]:
ndf.shape[1]/len(metric_features)*100

## Sectioning neighbourhood data

In [None]:
def _columns_between(first, stop):
    """Return the columns of ``ndf`` from ``first`` up to, but not including, ``stop``."""
    names = ndf.columns
    return ndf.iloc[:, names.get_loc(first):names.get_loc(stop)]


# Each thematic group is one contiguous run of columns in `ndf`; groups whose columns are
# split across several runs get numbered parts that are joined together in the next cell.
Veterans1 = _columns_between('MALEMILI', 'LOCALGOV')
Veterans2 = _columns_between('AFC1', 'ANC1')
Population = _columns_between('POP901', 'ETH1')
Age1 = _columns_between('AGE901', 'HHAGE1')
Age2 = _columns_between('AC1', 'MALEMILI')
Movement = _columns_between('MC1', 'TPE1')
Ethnicity1 = _columns_between('ETH1', 'AGE901')
Ethnicity2 = _columns_between('ETHC1', 'HVP1')
Ancestry = _columns_between('ANC1', 'VOC1')
Household1 = _columns_between('HHAGE1', 'ETHC1')
Household2 = _columns_between('HVP1', 'IC1')
Household3 = _columns_between('VOC1', 'AC1')
Income = _columns_between('IC1', 'MC1')
Transportation = _columns_between('TPE1', 'LFC1')
Employment1 = _columns_between('OCC1', 'EC1')
Employment2 = _columns_between('LOCALGOV', 'CONTROLN')
LabourForce = _columns_between('LFC1', 'OCC1')
Education = _columns_between('EC1', 'AFC1')

In [None]:
# Assemble each thematic group from its parts and append the CONTROLN column.
# Population, Ancestry, Income, Transportation, LabourForce and Education are rebound to their
# joined versions, so re-running this cell without re-running the sectioning cell above joins
# CONTROLN onto frames that already contain it.
Veterans = Veterans1.join(Veterans2).join(df_control)
Population = Population.join(Movement).join(df_control)
Age = Age1.join(Age2).join(df_control)
Ethnicity = Ethnicity1.join(Ethnicity2).join(df_control)
Ancestry = Ancestry.join(df_control)
Household = Household1.join(Household2).join(Household3).join(df_control)
Income = Income.join(df_control)
Transportation = Transportation.join(df_control)
Employment = Employment1.join(Employment2).join(df_control)
LabourForce = LabourForce.join(df_control)
Education = Education.join(df_control)

## Imputation

In [None]:
# Columns in which impute_values treats a value of 0 as missing before imputing.
columns_impute = ['POP90C4','POP90C5','AGE901','AGE902','AGE903', 'AGE904','AGE905','AGE906','AGE907','CHIL1','CHIL2','CHIL3','AGEC1','AGEC2','AGEC3','AGEC4','AGEC5','CHILC1','CHILC2','CHILC3','CHILC4','CHILC5','HHN1','HHN2','HHN3','HHN4','MARR1','HHP1','HHP2','HU1','HU3','HHD1','HHD2','HHD3','HHD4','HHD5','ETHC1','ETHC2','RHP1','RHP2','RP4','IC1','IC2','IC3','IC4','HHAS3','MC1','MC2','TPE1','TPE13','LFC1','LFC2','LFC3','LFC4','LFC5','OEDC5','EC4','AFC5','VC1','VC2','VC3','VC4','POBC2','VOC2','MHUC1','MHUC2','MALEVET','VIETVETS','WWIIVETS']
# All thematic feature groups built above.
dataframes = [Veterans,Population,Age,Ethnicity,Ancestry,Household,Income,Transportation,Employment,LabourForce,Education]

In [None]:
def impute_values(df):
    """Treat zeros in the ``columns_impute`` columns as missing and fill them by KNN imputation.

    Zeros are first replaced by NaN in every column of ``df`` that is listed in the
    module-level ``columns_impute``; a single ``KNNImputer`` is then fitted on the whole
    frame, so every missing value is estimated from the original, un-imputed neighbours.
    (Fitting a new imputer after each individual column, as was done before, multiplied the
    run time and imputed later columns from already-imputed data.)

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; modified in place. Frames without any ``columns_impute`` column are
        left untouched.

    Returns
    -------
    None
    """
    all_columns = df.columns.to_list()
    targets = [column for column in all_columns if column in columns_impute]
    if not targets:
        return

    # Assign the replaced block back explicitly: `df[col].replace(..., inplace=True)` acts on a
    # temporary object and is not guaranteed to write through to `df`.
    df[targets] = df[targets].replace(0, np.nan)

    # KNNImputer or SimpleImputer
    # NOTE(review): per the scikit-learn docs, KNNImputer drops columns that are entirely NaN,
    # in which case the assignment below fails on a shape mismatch.
    imputer = KNNImputer()
    df[all_columns] = imputer.fit_transform(df[all_columns])
            
def treat_outliers(df, m):
    """Keep only the rows of ``df`` whose values all lie within ``m`` IQRs of the quartiles.

    For every column the accepted range is ``[q25 - m * iqr, q75 + m * iqr]`` (bounds
    included). A row is kept only if it is inside the range in every column. The share of
    rows kept is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric columns only. It is not modified.
    m : float
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass the filter in every column.
    """
    q25 = df.quantile(.25)
    q75 = df.quantile(.75)
    iqr = (q75 - q25)

    upper_lim = q75 + m * iqr
    lower_lim = q25 - m * iqr

    # Series.between includes both bounds by default; the boolean spelling `inclusive=True`
    # is no longer accepted by recent pandas releases, so the default is relied on instead.
    filters = [
        df[metric].between(lower_lim[metric], upper_lim[metric])
        for metric in df.columns.to_list()
    ]

    df_2 = df[np.all(filters, 0)]
    print('Percentage of data kept after removing outliers:', np.round(df_2.shape[0] / df.shape[0], 4))
    # Hand the filtered frame back so callers can actually use it.
    return df_2
    
def plot_dist(df):
    """Draw one histogram per column of ``df`` on a two-row grid inside a single figure."""
    sns.set()

    features = df.columns.to_list()
    n_grid_cols = ceil(len(features) / 2)

    # Two rows of axes; each column of `df` gets the next free axis.
    fig, axes = plt.subplots(2, n_grid_cols, figsize=(20, 11))
    for axis, feature in zip(axes.flatten(), features):
        axis.hist(df[feature])
        # Place the column name just below its histogram.
        axis.set_title(feature, y=-0.13)

    # Centered figure-level title.
    plt.suptitle("Numeric Variables' Histograms")
    plt.show()

In [None]:
# Feature groups that are imputed and later reduced to principal components for clustering.
dataframes_cluster = [Population, Age, Income, Transportation, Education, Ethnicity, Veterans]

In [None]:
# Move CONTROLN from a regular column into the row index of every feature group (in place).
for df in dataframes:
    df.set_index('CONTROLN', inplace = True)

In [None]:
# Impute the clustering groups in place (zeros in the `columns_impute` columns count as missing).
for df in dataframes_cluster:
    impute_values(df) 

In [None]:
Household.set_index('CONTROLN', inplace = True)
Employment.set_index('CONTROLN', inplace = True)
LabourForce.set_index('CONTROLN', inplace = True)
Ancestry.set_index('CONTROLN', inplace = True)
Population.set_index('CONTROLN', inplace = True)
Age.set_index('CONTROLN', inplace = True)
Education.set_index('CONTROLN', inplace = True)
Income.set_index('CONTROLN', inplace = True)
Transportation.set_index('CONTROLN', inplace = True)
Ethnicity.set_index('CONTROLN', inplace = True)
Veterans.set_index('CONTROLN', inplace = True)

In [None]:
data_imputed = Population.join(Age).join(Income).join(Transportation).join(Education).join(Ethnicity).join(Veterans)                                                                                    

## PCA

In [None]:
# Columns that are not percentages. prep_data min-max scales these and divides every other
# column by 100.
non_percentage = ['POP901','POP902','POP903','AGE901','AGE902','AGE903','AGE904','AGE905','AGE906','HHP1','HHP2','HV1','HV2','HV3','HV4','RHP1','RHP2','RHP3','RHP4','MHUC1','MHUC2','IC1','IC2','IC3','IC4','IC5','TPE10','TPE11','EC1']

In [None]:
def prep_data(df):
    """Rescale every column of ``df`` in place.

    Columns listed in the module-level ``non_percentage`` are min-max scaled together;
    every other column is divided by 100. Nothing is returned. The operation is not
    idempotent: calling it twice on the same frame divides the percentage columns twice.
    """
    columns = df.columns.to_list()
    percentage_cols = [name for name in columns if name not in non_percentage]
    absolute_cols = [name for name in columns if name in non_percentage]

    for name in percentage_cols:
        df[name] = df[name] / 100

    if absolute_cols:
        df[absolute_cols] = MinMaxScaler().fit_transform(df[absolute_cols])

def pca_analysis(df):
    """Fit a PCA with all components on every column of ``df`` and report the explained variance.

    Prints the first 25 rows of the eigenvalue / proportion / cumulative-proportion table and
    shows a scree plot next to an explained-variance plot. Nothing is returned and ``df`` is
    not modified.
    """
    pca = PCA()
    pca.fit(df[df.columns.to_list()])

    eigenvalues = pca.explained_variance_
    proportions = pca.explained_variance_ratio_
    cumulative = np.cumsum(proportions)
    n_components = pca.n_components_

    # Table used to decide how many principal components to retain.
    pca_result = pd.DataFrame(
        {
            "Eigenvalue": eigenvalues,
            "Difference": np.insert(np.diff(eigenvalues), 0, 0),
            "Proportion": proportions,
            "Cumulative": cumulative,
        },
        index=range(1, n_components + 1),
    )
    print(pca_result.head(25))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    ax1.plot(eigenvalues, marker=".", markersize=12)
    ax2.plot(proportions, marker=".", markersize=12, label="Proportion")
    ax2.plot(cumulative, marker=".", markersize=12, linestyle="--", label="Cumulative")
    ax2.legend()

    # Both panels share the same x-axis treatment: a tick on every second component,
    # labelled 1-based although the plotted positions are 0-based.
    panels = ((ax1, "Scree Plot", "Eigenvalue"), (ax2, "Variance Explained", "Proportion"))
    for axis, title, y_label in panels:
        axis.set_title(title, fontsize=14)
        axis.set_ylabel(y_label)
        axis.set_xlabel("Components")
        axis.set_xticks(range(0, n_components, 2))
        axis.set_xticklabels(range(1, n_components + 1, 2))

    plt.show()
    
#interpreting each principal component
def _color_red_or_green(val):
    if val < -0.45:
        color = 'background-color: red'
    elif val > 0.45:
        color = 'background-color: green'
    else:
        color = ''
    return color

# Re-fit PCA with the number of principal components to retain and show the loadings
def pca_describe(df,n_components):
    """Return a styled table of correlations between the columns of ``df`` and its principal components.

    A PCA with ``n_components`` components is fitted on all columns of ``df``. The result has
    one row per original column and one column per component (``PC0``, ``PC1``, ...); cells
    below -0.45 are shaded red and cells above 0.45 green. ``df`` is not modified.
    """
    features = df.copy()
    feature_names = features.columns.to_list()

    model = PCA(n_components)
    scores = model.fit_transform(features[feature_names])
    component_names = [f"PC{idx}" for idx in range(model.n_components_)]
    # Reuse the input index so that scores line up with the original rows.
    scores_df = pd.DataFrame(scores, index=features.index, columns=component_names)

    combined = pd.concat([features, scores_df], axis=1)
    correlations = combined[feature_names + component_names].corr()
    loadings = correlations.loc[feature_names, component_names]
    return loadings.style.applymap(_color_red_or_green)

def pca_apply(df,n_components):
    """Return the scores of ``df`` on its first ``n_components`` principal components.

    The PCA is fitted on all columns of ``df``. The returned frame has the same index as
    ``df`` and columns ``PC0``, ``PC1``, ...; ``df`` itself is not modified.
    """
    source = df.copy()
    model = PCA(n_components)
    scores = model.fit_transform(source[source.columns.to_list()])
    component_names = [f"PC{idx}" for idx in range(model.n_components_)]
    return pd.DataFrame(scores, index=source.index, columns=component_names)

In [None]:
# Scale every feature group in place. prep_data returns None, so it is called in a plain loop
# rather than a list comprehension that would rebind `dataframes` to a list of None values.
# Not idempotent: re-running this cell rescales the already-scaled frames.
dataframes = [Veterans,Population,Age,Ethnicity,Ancestry,Household,Income,Transportation,Employment,LabourForce,Education]
for df in dataframes:
    prep_data(df)

In [None]:
# Feature groups and their display names, in matching order.
dataframes = [Veterans,Population,Age,Ethnicity,Ancestry,Household,Income,Transportation,Employment,LabourForce,Education]
dataframes_names = ['Veterans','Population','Age','Ethnicity','Ancestry','Household','Income','Transportation','Employment','LabourForce','Education']

In [None]:
# Print each group's name, then its explained-variance table and plots.
for df, name in zip(dataframes, dataframes_names):
    df.name = name 
    print(df.name)
    pca_analysis(df)

In [None]:
dataframes = [Veterans,Population,Age,Ethnicity,Ancestry,Household,Income,Transportation,Employment,LabourForce,Education]

In [None]:
pca_describe(Veterans,6) # positively correlated with WWII veterans

In [None]:
pca_describe(Population,6) # PC0 positively correlated with rural area

In [None]:
pca_describe(Age,6) # positively correlated with higher age of the population

In [None]:
pca_describe(Ethnicity,6) # negatively correlated with white population, positively correlated with black population

In [None]:
pca_describe(Ancestry,6) # the PCs are difficult to interpret

In [None]:
pca_describe(Household,6) # the PCs are difficult to interpret

In [None]:
pca_describe(Income,6) # positively correlated with higher income

In [None]:
pca_describe(Transportation,6) # positively correlated with time to work

In [None]:
pca_describe(Employment,6) # the PCs are difficult to interpret

In [None]:
pca_describe(LabourForce,6) # the PCs are difficult to interpret

In [None]:
pca_describe(Education,6) # positively correlated with education level

In [None]:
Pop_PC = pca_apply(Population,1)
Pop_PC.rename(columns={"PC0": "Population"},inplace = True)

In [None]:
Age_PC = pca_apply(Age,1) #check the correlation with age 
Age_PC.rename(columns={"PC0": "Age"},inplace = True)

In [None]:
Education_PC = pca_apply(Education,1) # Median year of school completed, % 25+ with bachelors/graduate degree (- correlated with higher education)
Education_PC.rename(columns={"PC0": "Education"},inplace = True)

In [None]:
Income_PC = pca_apply(Income,1) # household income (PC0 pn[ 0+ correlated with higher income)
Income_PC.rename(columns={"PC0": "Income"},inplace = True)

In [None]:
Transportation_PC = pca_apply(Transportation,1) #time to work
Transportation_PC.rename(columns={"PC0": "Transport"},inplace = True)

In [None]:
Ethnicity_PC = pca_apply(Ethnicity,1) #time to work
Ethnicity_PC.rename(columns={"PC0": "Ethnicity"},inplace = True)

In [None]:
Veterans_PC = pca_apply(Veterans,1)
Veterans_PC.rename(columns={"PC0": "Veterans"},inplace = True)

In [None]:
data_cluster = Pop_PC.join(Age_PC).join(Education_PC).join(Income_PC).join(Transportation_PC).join(Ethnicity_PC).join(Veterans_PC)

In [None]:
# Display the first-component frame used as clustering input.
data_cluster

In [None]:
# Underlying features of the clustering groups, joined on the index. These frames were already
# modified in place by impute_values and prep_data above, so the values are imputed and scaled.
raw_feat_cluster = Population.join(Age).join(Education).join(Income).join(Transportation).join(Veterans).join(Ethnicity)

In [None]:
# Pairwise correlations between the per-group components.
data_cluster.corr()

In [None]:
treat_outliers(data_cluster,4)

In [None]:
data_cluster_2.shape[0]/data_cluster.shape[0]

In [None]:
outliers = data_cluster[~data_cluster.isin(data_cluster_2)].dropna()

In [None]:
# Feature groups that are not part of the clustering input, joined on the index.
other = Household.join(Employment).join(LabourForce).join(Ancestry)

In [None]:
# Explained-variance table and plots for the combined remaining groups.
pca_analysis(other)

In [None]:
# Scores on the first six components; the result is only displayed, not stored.
pca_apply(other,6) 

## Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# First K-means run with 5 clusters and a fixed seed.
# NOTE(review): this is fitted on data_cluster, not on the outlier-filtered frame — confirm intended.
kmeans = KMeans(n_clusters=5, random_state=0).fit(data_cluster)

In [None]:
# Cluster label of every row of data_cluster (displayed only).
kmeans.predict(data_cluster)

In [None]:
# Candidate numbers of clusters: 1 to 10.
range_clusters = range(1, 11)

In [None]:
# Fit one K-means solution per candidate and record its inertia (within-cluster sum of squares).
inertia = []
for n_clus in range_clusters:  # iterate over desired ncluster range
    kmclust = KMeans(n_clusters=n_clus, init='k-means++', n_init=15, random_state=1)
    kmclust.fit(data_cluster)
    inertia.append(kmclust.inertia_)  # save the inertia of the given cluster solution

In [None]:
# The inertia plot.
# Plot against the actual numbers of clusters: with the inertia list alone the x-axis would show
# list positions (0-9) under the label "Number of clusters".
cluster_counts = list(range_clusters)
plt.figure(figsize=(9,5))
plt.plot(cluster_counts, inertia)
plt.xticks(cluster_counts)
plt.ylabel("Inertia: SSw")
plt.xlabel("Number of clusters")
plt.title("Inertia plot over clusters", size=15)
plt.show()

In [None]:
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.cm as cm

In [None]:
# Adapted from:
# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py

# Storing average silhouette metric
avg_silhouette = []
for nclus in range_clusters:
    # Skip nclus == 1
    if nclus == 1:
        continue
    
    # Create a figure
    fig = plt.figure(figsize=(13, 7))

    # Initialize the KMeans object with n_clusters value and a random generator
    # seed of 1 for reproducibility.
    kmclust = KMeans(n_clusters=nclus, init='k-means++', n_init=15, random_state=1)
    cluster_labels = kmclust.fit_predict(data_cluster)

    # Compute the silhouette value of every sample once. The average silhouette score is the
    # mean of these values, so a separate silhouette_score call (a second pass over all
    # pairwise distances) is not needed.
    sample_silhouette_values = silhouette_samples(data_cluster, cluster_labels)

    # The average gives a perspective into the density and separation of the formed clusters
    silhouette_avg = sample_silhouette_values.mean()
    avg_silhouette.append(silhouette_avg)
    print(f"For n_clusters = {nclus}, the average silhouette_score is : {silhouette_avg}")

    y_lower = 10
    for i in range(nclus):
        # Aggregate the silhouette scores for samples belonging to cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        
        # Get y_upper to demarcate silhouette y range size
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        
        # Filling the silhouette
        color = cm.nipy_spectral(float(i) / nclus)
        plt.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    plt.title("The silhouette plot for the various clusters.")
    plt.xlabel("The silhouette coefficient values")
    plt.ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    plt.axvline(x=silhouette_avg, color="red", linestyle="--")
    
    # The silhouette coefficient can range from -1, 1
    xmin, xmax = np.round(sample_silhouette_values.min() -0.1, 2), np.round(sample_silhouette_values.max() + 0.1, 2)
    plt.xlim([xmin, xmax])
    
    # The (nclus+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    plt.ylim([0, len(data_cluster) + (nclus + 1) * 10])

    plt.yticks([])  # Clear the yaxis labels / ticks
    plt.xticks(np.arange(xmin, xmax, 0.1))

    # Render this figure now instead of keeping every figure open until the loop finishes.
    plt.show()

In [None]:
# The average silhouette plot.
# avg_silhouette holds one value per candidate except nclus == 1 (skipped in the loop above), so
# plot it against those cluster counts rather than against list positions.
silhouette_cluster_counts = [n_candidate for n_candidate in range_clusters if n_candidate != 1]
plt.figure(figsize=(9,5))
plt.plot(silhouette_cluster_counts, avg_silhouette)
plt.xticks(silhouette_cluster_counts)
plt.ylabel("Average silhouette")
plt.xlabel("Number of clusters")
plt.title("Average silhouette plot over clusters", size=15)
plt.show()

In [None]:
#clustering_names = ['MiniBatchKMeans', 'AffinityPropagation', 'MeanShift','SpectralClustering', 'Ward', 'AgglomerativeClustering','DBSCAN', 'Birch']

## PCA on all neighbourhood

In [None]:
# Rescale the full neighbourhood frame in place.
# NOTE(review): ndf still carries CONTROLN as a regular column (it was joined on, never moved to
# the index), and CONTROLN is not in `non_percentage`, so prep_data divides it by 100 as well.
# Not idempotent: re-running this cell rescales again.
prep_data(ndf)

In [None]:
df_pca=ndf.copy()
# Use PCA to reduce dimensionality of data
pca = PCA()
pca_feat = pca.fit_transform(df_pca[nbh_metric_features])
pca_feat

In [None]:
# Obtaining the projected observations on the principal components axes (linear combinations)
# NOTE(review): scikit-learn's PCA centres the data before projecting; this product does not
# subtract pca.mean_, so the values differ from `pca_feat` by a constant offset per component.
pd.DataFrame(df_pca[nbh_metric_features].values @ pca.components_.T, 
             index=df_pca.index,
             columns=[f"PC{i}" for i in range(pca.n_components_)])

In [None]:
# How many principal components to retain?
# Per-component eigenvalue, change from the previous eigenvalue (0 for the first component),
# proportion of variance explained and its cumulative sum; rows are numbered from 1.
pca_result = pd.DataFrame(
    {"Eigenvalue": pca.explained_variance_,
     "Difference": np.insert(np.diff(pca.explained_variance_), 0, 0),
     "Proportion": pca.explained_variance_ratio_,
     "Cumulative": np.cumsum(pca.explained_variance_ratio_)},
    index=range(1, pca.n_components_ + 1)
)

In [None]:
pca_result

In [None]:
# Scree plot (left) and explained variance (right); same figure as pca_analysis draws.
# figure and axes
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# draw plots
ax1.plot(pca.explained_variance_, marker=".", markersize=12)
ax2.plot(pca.explained_variance_ratio_, marker=".", markersize=12, label="Proportion")
ax2.plot(np.cumsum(pca.explained_variance_ratio_), marker=".", markersize=12, linestyle="--", label="Cumulative")

# customizations
ax2.legend()
ax1.set_title("Scree Plot", fontsize=14)
ax2.set_title("Variance Explained", fontsize=14)
ax1.set_ylabel("Eigenvalue")
ax2.set_ylabel("Proportion")
ax1.set_xlabel("Components")
ax2.set_xlabel("Components")

# A tick on every second component, labelled 1-based although plotted positions are 0-based.
ax1.set_xticks(range(0, pca.n_components_, 2))
ax1.set_xticklabels(range(1, pca.n_components_ + 1, 2))
ax2.set_xticks(range(0, pca.n_components_, 2))
ax2.set_xticklabels(range(1, pca.n_components_ + 1, 2))

plt.show()

In [None]:
# Perform PCA again with the number of principal components you want to retain (here: 1).
# This rebinds `pca`, `pca_feat` and creates `pca_df`, indexed like df_pca.
pca = PCA(n_components=1)
pca_feat = pca.fit_transform(df_pca[nbh_metric_features])
pca_feat_names = [f"PC{i}" for i in range(pca.n_components_)]
pca_df = pd.DataFrame(pca_feat, index=df_pca.index, columns=pca_feat_names)  # remember index=df_pca.index

## Export

In [None]:
# Show the working directory; the output folder below is created relative to it.
os.getcwd()

In [None]:
# Output folder for the computed artefacts; created on first use.
computed_data_path = 'computed_data/'

if not os.path.exists(computed_data_path): 
    os.makedirs(computed_data_path)

In [None]:
# Persist the intermediate frames as pickles, in this order:
# the joined feature groups, the outlier-filtered component frame, the imputed features and
# the rows removed as outliers.
with open(os.path.join(computed_data_path, 'neighborhood_feat_raw.pickle'), 'wb') as f:
    pickle.dump(raw_feat_cluster, f)
    
with open(os.path.join(computed_data_path, 'neighborhood_PC_cluster.pickle'), 'wb') as f:
    pickle.dump(data_cluster_2, f)
    
with open(os.path.join(computed_data_path, 'neighborhood_feat_after_impute.pickle'), 'wb') as f:
    pickle.dump(data_imputed, f)

with open(os.path.join(computed_data_path, 'neighborhood_outliers.pickle'), 'wb') as f:
    pickle.dump(outliers, f)

## Profile reports

In [None]:
# Minimal profiling report of `df_neighbours`, i.e. the neighbourhood columns as first
# assembled (before CONTROLN was joined on and before MSA/ADI/DMA were dropped from `ndf`).
# The HTML file is written to the current working directory.
profile = ProfileReport(
    df_neighbours,
    title='Neighbourhood 2',
    minimal=True
)
profile.to_file('explore_nbh_2.html')


In [None]:
#df.to_csv(os.path.join("..", "data", "donors_preprocessed.csv"), index=False)