# In [3]:
from collections import Counter
import sys

import pandas as pd
import numpy as np
from pandas               import DataFrame
from pandas               import read_csv
from pandas               import to_numeric
from numpy                import array
from numpy                import random
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics      import auc
from sklearn.metrics      import roc_auc_score
from sklearn.cluster      import KMeans
from sklearn.cluster      import AgglomerativeClustering
from matplotlib           import pyplot
from mpl_toolkits.mplot3d import Axes3D 
from datetime             import datetime
import import_ipynb
from scipy.stats          import pearsonr
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.metrics      import auc
from sklearn.metrics      import roc_auc_score
from matplotlib           import pyplot
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt

from sklearn.tree           import DecisionTreeClassifier
from sklearn.ensemble       import RandomForestClassifier
from sklearn.ensemble       import GradientBoostingClassifier
from sklearn.svm            import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors      import KNeighborsClassifier
from sklearn.metrics        import accuracy_score
from sklearn.metrics        import roc_auc_score
from sklearn.model_selection import RepeatedStratifiedKFold, ParameterGrid, GridSearchCV
from sklearn.preprocessing import StandardScaler  
from datetime import timedelta
from ieseg import partition
from ieseg import roc
from ieseg import cumulativeResponse
from ieseg import cumulativeGains
from ieseg import lift

# In [4]:
def ReadData(Dataset, separation=';',lowMemory=True):
    """Load a delimited text file into a DataFrame and print its summary.

    Parameters
    ----------
    Dataset
        Path or file-like object handed to ``pandas.read_csv``.
    separation : str, default ';'
        Field delimiter, passed as ``sep``.
    lowMemory : bool, default True
        Passed as ``low_memory`` to ``pandas.read_csv``.

    Returns
    -------
    DataFrame
        The parsed data.
    """
    df = pd.read_csv(Dataset, sep=separation, low_memory=lowMemory)
    # DataFrame.info() writes the summary itself and returns None; wrapping it
    # in print() would add a stray "None" line to the output.
    df.info()
    return df

def my_mode(sample):
    """Return every value of ``sample`` that occurs with the highest frequency.

    Ties are all returned, in order of first appearance. An empty ``sample``
    yields an empty list.
    """
    counts = Counter(sample)
    if not counts:
        return []
    # Look up the top frequency once instead of once per distinct value.
    top_count = counts.most_common(1)[0][1]
    return [value for value, count in counts.items() if count == top_count]

# Function that interprets percentile ranks as RFM values
# by splitting them into 5 equal-width groups (quintiles).

def get_rfm_value(percentile_value):
    """Map a percentile rank to a score from 1 to 5.

    Each upper bound (0.2, 0.4, 0.6, 0.8) is inclusive; anything that does not
    fall at or below one of them scores 5.
    """
    upper_bounds = (0.2, 0.4, 0.6, 0.8)
    for score, bound in enumerate(upper_bounds, start=1):
        if percentile_value <= bound:
            return score
    return 5


# Apply the Weighted Aggregation Approach to return overall RFM scores
    # We assign a 20% weight to the Recency value (multiplier 4 because 5*4 = 20)
    # We assign a 30% weight to the Frequency value (multiplier 6 because 5*6 = 30)
    # We assign a 50% weight to the Monetary value (multiplier 10 because 5*10 = 50)
# so that a donor with the top value (5) on all three dimensions scores 100.

def get_weigthed_agg(df):
    """Return R*4 + F*6 + M*10 for the 'R', 'F' and 'M' entries of ``df``."""
    multipliers = (('R', 4), ('F', 6), ('M', 10))
    recency, frequency, monetary = (
        df[column] * factor for column, factor in multipliers
    )
    return recency + frequency + monetary

def Creating_Variables(campaign_number,gifts,donors,campaign_df= DataFrame,campaign_date=str):
    """Build the per-donor modelling table for one campaign.

    The binary target for ``campaign_number`` is combined with donation-history
    features and donor attributes, all left-joined on ``donorID``.

    Parameters
    ----------
    campaign_number
        Value matched against ``gifts['campaignID']`` to pick the gifts that
        define the target.
    gifts : DataFrame
        Gift records; the columns ``donorID``, ``campaignID``, ``amount`` and
        ``date`` are read. Side effect: ``gifts['date']`` is converted to
        datetime in place.
        NOTE(review): ``amount`` is summed and averaged directly, so it is
        assumed to be numeric already - confirm against the caller.
    donors : DataFrame
        Donor records; ``donorID`` and ``dateOfBirth`` are read. Side effect:
        ``dateOfBirth`` is converted to datetime and a ``yearOfBirth`` column
        is added in place.
    campaign_df : DataFrame
        Donors selected for the campaign; must contain ``donorID``. The default
        value is only a placeholder - a real frame has to be passed.
    campaign_date : str
        Campaign date, parsed with ``pandas.to_datetime``. The default value is
        only a placeholder - a real date has to be passed.

    Returns
    -------
    DataFrame
        One row per ``donorID`` of ``campaign_df`` (the last row is kept when a
        donor appears several times) with the columns of ``campaign_df`` plus
        ``donated``, ``total_amount``, ``date_first_donation``,
        ``date_last_donation``, ``LOR``, ``days_since_last_donation``,
        ``nbr_gifts``, ``weighted_rfm_scores``, ``nbr_campaign_donation``,
        ``avg_amount_per_campaign``, ``avg_donation`` and the donor columns
        (including ``age`` and ``age_dummy_30``). Features are NaN for donors
        without matching history or donor record.
    """
    campaign_date = pd.to_datetime(campaign_date)
    # 1825 days = the 5-year look-back window used for the R/F/M features.
    window_start = campaign_date - timedelta(days=1825)

    # --- Target: did the donor give more than 20 to this campaign? ---------
    campaign_gifts = gifts.loc[gifts['campaignID'] == campaign_number, ['donorID', 'amount']]
    TARGET_donors = pd.merge(campaign_df, campaign_gifts, how='left', on='donorID')
    # Replace 123,0 by 123.0 for the amount (format)
    TARGET_donors['amount'] = TARGET_donors['amount'].astype(str).str.replace(',', '.').astype(float)
    # Donors without a gift have NaN here, which compares False and yields 0.
    # NOTE(review): a donor with several gifts to the campaign produces several
    # rows; only the last one survives the final drop_duplicates.
    TARGET_donors['amount'] = np.where(TARGET_donors['amount'] > 20, 1, 0)
    TARGET_donors = TARGET_donors.rename(columns={"amount": "donated"})
    print('Target variable created succesfully, you are awesome!')

    # --- History: only gifts made before the campaign date -----------------
    gifts['date'] = pd.to_datetime(gifts['date'])
    gifts_ = gifts[gifts['date'] < campaign_date]
    # Gifts of the last 5 years before the campaign.
    gifts_mon = gifts_[gifts_['date'] > window_start]

    # MONETARY: total amount given per donor inside the window.
    total_amount = gifts_mon.groupby('donorID')['amount'].sum().rename('total_amount').reset_index()
    print('dates filtered succesfully, you are awesome')
    print('start_date of analysis: ', window_start)
    print('final date of analysis: ', campaign_date)
    print("Amount of donors that have donated in total more than 50,000EU (in 5 years): ",total_amount[total_amount['total_amount']>50000].shape[0])

    # RECENCY: first/last gift date per donor and the span between them (LOR).
    last_donation = gifts_mon.groupby('donorID')['date'].agg(['min', 'max']).reset_index()
    # Take whole days straight from the timedelta instead of slicing its
    # string representation.
    last_donation['LOR'] = (last_donation['max'] - last_donation['min']).dt.days
    last_donation = last_donation.rename(columns={'min':'date_first_donation', 'max':'date_last_donation'})
    # A donor with a single gift (or same-day gifts) gets a span of 1, not 0.
    last_donation.loc[last_donation['LOR'] == 0, 'LOR'] = 1
    print('Number of donors: ',last_donation.shape[0])
    last_donation['days_since_last_donation'] = (campaign_date - last_donation['date_last_donation']).dt.days

    # FREQUENCY: number of gifts per donor inside the window.
    frequency = gifts_mon.groupby(['donorID'])['amount'].count().rename('nbr_gifts').reset_index()

    # RFM: rank latest date, gift count and total amount as percentiles, turn
    # each into a 1-5 value and combine them into one weighted score.
    gifts_rfm = gifts_mon.groupby('donorID').agg({'date':'max', 'donorID':'count', 'amount':'sum'})
    gifts_rfm = gifts_rfm.rename(columns={'date':'latest_date',
                                          'donorID':'gifts_count',
                                          'amount':'total_amount'})
    percentiles = gifts_rfm.rank(pct=True, method='dense')
    rfm_values = percentiles.applymap(get_rfm_value)
    rfm_values.columns = ['R', 'F', 'M']
    # The helper only indexes and multiplies, so it works on whole columns;
    # no row-by-row apply is needed.
    rfm_values['weighted_rfm_scores'] = get_weigthed_agg(rfm_values)
    # Keep only the donorID and the weighted score for the later merge.
    rfm_values = rfm_values.reset_index()[['donorID', 'weighted_rfm_scores']].copy(deep=True)

    # Average donation per campaign gift. Computed on pre-campaign gifts only:
    # using every gift would leak the target campaign's own donation (and any
    # later ones) into the features.
    avg_donation_per_campaign = gifts_.groupby(['donorID']).agg({'amount':'sum','campaignID':'count'}).reset_index()
    avg_donation_per_campaign['avg_amount_per_campaign'] = np.where(avg_donation_per_campaign['campaignID']>0,
                                                                    round(avg_donation_per_campaign['amount']/avg_donation_per_campaign['campaignID'],2),
                                                                    avg_donation_per_campaign['amount'])
    avg_donation_per_campaign = avg_donation_per_campaign.rename(columns={'campaignID':'nbr_campaign_donation'})
    avg_donation_per_campaign = avg_donation_per_campaign[['donorID', 'nbr_campaign_donation','avg_amount_per_campaign']]

    # Average amount per gift, again restricted to pre-campaign gifts.
    avg_donation_per_donor = gifts_.groupby('donorID')['amount'].mean().rename('avg_donation').reset_index()
    avg_donation_per_donor['avg_donation'] = round(avg_donation_per_donor['avg_donation'],2)

    # --- Donor attributes ---------------------------------------------------
    donors['dateOfBirth'] = pd.to_datetime(donors['dateOfBirth'])
    donors['yearOfBirth'] = donors['dateOfBirth'].dt.year
    donors_campaign = donors.copy(deep=True)

    donors_campaign['age'] = campaign_date.year - donors_campaign['yearOfBirth']
    print("year of the campaign: ",campaign_date.year)
    # life expectancy = https://www.healthybelgium.be/en/health-status/life-expectancy-and-quality-of-life/life-expectancy
    # Donors aged 82 or more are removed here, so they end up with NaN donor
    # attributes after the left merge below. The explicit copy keeps the
    # following assignment off a filtered slice.
    donors_campaign = donors_campaign[donors_campaign['age']<82].copy()

    # Dummy: 0 for donors under 30, 1 otherwise.
    donors_campaign['age_dummy_30'] = np.where(donors_campaign['age']< 30, 0, 1)

    # --- Assemble: left-join every feature table onto the target -----------
    merged_final = TARGET_donors
    feature_tables = (total_amount, last_donation, frequency, rfm_values,
                      avg_donation_per_campaign, avg_donation_per_donor,
                      donors_campaign)
    for feature_table in feature_tables:
        merged_final = pd.merge(merged_final, feature_table, how = 'left', on = 'donorID')
    merged_final = merged_final.drop_duplicates(subset='donorID',keep='last')
    print('variable creation DONE')
    return merged_final

def Fixing_Nan(df,donors):
    """Keep known donors, flag missing donation history and fill NaN with 0.

    Parameters
    ----------
    df : DataFrame
        Table with at least the columns ``donorID``, ``LOR``,
        ``date_first_donation``, ``date_last_donation``, ``dateOfBirth`` and
        ``yearOfBirth``. It is not modified.
    donors : DataFrame
        Only ``donors['donorID']`` is read; rows of ``df`` whose ``donorID``
        is not listed there are discarded.

    Returns
    -------
    DataFrame
        The filtered rows with a new ``not_donated_in_the_past`` column
        (1 where ``LOR`` was NaN, else 0), without the three date columns and
        ``yearOfBirth``, and with every remaining NaN replaced by 0.
    """
    df = df.dropna(how='all')

    # taking only people in the campaign that are present in our donors database
    # Explicit copy: the filtered frame gets a new column below, and assigning
    # into a slice of ``df`` would raise SettingWithCopyWarning.
    fixed_df = df[df['donorID'].isin(donors['donorID'])].copy()

    print("before filter: ",df.shape[0])
    print("after filter: ",fixed_df.shape[0])

    # Flag has to be computed before fillna(0) wipes out the NaN markers.
    fixed_df['not_donated_in_the_past'] = np.where(fixed_df['LOR'].isna(), 1, 0)
    fixed_df = fixed_df.drop(
        ['date_first_donation', 'date_last_donation', 'dateOfBirth', 'yearOfBirth'],
        axis=1,
    )
    return fixed_df.fillna(0)

def Feature_Engineering(df, test_df, p_value_threshold=0.001):
    """Select features correlated with ``donated`` and standardize them.

    Every column of ``df`` except ``donated`` and ``donorID`` is tested with
    ``scipy.stats.pearsonr`` against ``df['donated']``; columns whose p-value
    is below ``p_value_threshold`` are kept. A ``StandardScaler`` is fitted on
    those columns of ``df`` and applied to both ``df`` and ``test_df``.

    Parameters
    ----------
    df : DataFrame
        Training table; must contain ``donorID`` and ``donated``.
    test_df : DataFrame
        Test table with the same columns as ``df``.
    p_value_threshold : float, default 0.001
        Upper bound (exclusive) on the p-value for a column to be selected.

    Returns
    -------
    tuple
        ``(transformed_df, transformed_df_test, listPC)``: the two scaled
        frames, each with ``donorID`` and ``donated`` appended and a fresh
        0..n-1 index, and the list of selected column names.

    Raises
    ------
    ValueError
        If no column passes the threshold.
    """
    # donorID is an identifier, not a feature. It is also re-attached after
    # scaling, so selecting it would make the join below fail on an
    # overlapping column name.
    variables = df.drop(['donated', 'donorID'], axis=1)

    # One pearsonr call per column; the result feeds both the report and the
    # selection. ``items()`` replaces ``iteritems()``, removed in pandas 2.0.
    correlations = [(columnName, pearsonr(columnData, df['donated']))
                    for columnName, columnData in variables.items()]

    for columnName, result in correlations:
        print('Column Name : ', columnName, '///Correlation and P value: ', result)

    listPC = []
    for columnName, result in correlations:
        if result[1] < p_value_threshold:
            print("Use this variable: ",columnName)
            listPC.append(columnName)

    print("Standardizing the following variables: ",listPC)
    if not listPC:
        raise ValueError("no variable passed the p-value threshold; nothing to standardize")

    # Fit on the training data only, then reuse the same scaling for the test set.
    fitted = StandardScaler().fit(df[listPC])

    def _scale(frame):
        # Scale the selected columns and put donorID/donated back alongside;
        # reset_index(drop=True) lines the identifiers up with the scaler
        # output whatever the incoming index looks like.
        scaled = pd.DataFrame(fitted.transform(frame[listPC]), columns=listPC)
        return scaled.join(frame[['donorID', 'donated']].reset_index(drop=True))

    transformed_df = _scale(df)
    transformed_df_test = _scale(test_df)
    print("standarization FINISHED SUCCESFULLY")

    return transformed_df,transformed_df_test,listPC

def stepwiseRegresion (model,trainingSet: DataFrame, testSet: DataFrame, selectedFeatures: [str], target: [str]) -> DataFrame:
    """Greedy forward feature selection ranked by test-set AUC.

    At each step every feature not yet chosen is added in turn to the current
    feature list, ``model`` is refitted on ``trainingSet`` and scored with
    ``roc_auc_score`` on both sets; the candidate list with the highest test
    AUC becomes the current list for the next step.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``; column 1 of
        the probabilities is used as the score. It is refitted many times.
    trainingSet, testSet : DataFrame
        Data containing the feature and target columns. Neither is modified.
    selectedFeatures : list of str
        Candidate feature names.
    target : str or one-element list of str
        Name of the target column.

    Returns
    -------
    DataFrame
        One row per step with the columns ``Step``, ``Features`` (the list of
        features in use at that step), ``AUC Train`` and ``AUC Test``. Empty
        when ``selectedFeatures`` is empty.
    """
    # ``frame[target]`` is a single-column DataFrame when target is a list;
    # flatten once so fitting and scoring both receive a 1-D vector.
    yTraining = array(trainingSet[target]).ravel()
    yTest = array(testSet[target]).ravel()

    def computeAUC(forFeatures):
        """Fit on ``forFeatures`` and return (features, train AUC, test AUC)."""
        model.fit(trainingSet[forFeatures], yTraining)
        # Keep the probabilities as plain arrays: writing them into the input
        # frames would leave a stray column on the caller's data and would
        # misalign on any index that is not 0..n-1.
        probaTraining = array(model.predict_proba(trainingSet[forFeatures]))[:, 1]
        probaTest = array(model.predict_proba(testSet[forFeatures]))[:, 1]
        aucTraining = roc_auc_score(yTraining, probaTraining)
        aucTest = roc_auc_score(yTest, probaTest)
        return (forFeatures, aucTraining, aucTest)

    featuresOrder = []
    forwardSelection = []

    for step in range(1, len(selectedFeatures) + 1):
        print(f"step {step}")
        aucs = [computeAUC(forFeatures=featuresOrder + [feature])
                for feature in selectedFeatures
                if feature not in featuresOrder]
        if not aucs:
            # Only possible when selectedFeatures contains duplicates.
            break

        # Highest test AUC wins; on a tie the earliest candidate is kept.
        bestFeatures, bestTraining, bestTest = max(aucs, key=lambda candidate: candidate[2])
        featuresOrder = bestFeatures
        forwardSelection.append((step, bestFeatures, bestTraining, bestTest))

    # Passing the names to the constructor also works for an empty result.
    return DataFrame(forwardSelection, columns=["Step","Features","AUC Train","AUC Test"])