# Focus on FUR and FCS Transactions  

by Fred Etter - November, 2019

In [None]:
# Import modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import linear_model
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import sklearn
from sklearn.feature_selection import SelectFromModel
from datetime import datetime
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [None]:
# Load the four CSV extracts: the sends data plus one transaction file per month.
# low_memory=False has pandas parse each file in one pass rather than in chunks.
df, df_march, df_april, df_may = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ('sends.csv', 'march.csv', 'april.csv', 'may.csv')
)

In [None]:
# create new dataframes for fcs and fur transactions - JUST FOR MARCH
# (boolean-mask selection on the ACTIVITY_TYPE column)
df_fcs = df_march[df_march['ACTIVITY_TYPE'].eq('FCS')]
df_fur = df_march[df_march['ACTIVITY_TYPE'].eq('FUR')]

In [None]:
# create a dataframe that captures all transactions for all 3 months
# (one three-way concat, so no intermediate two-month copy stays alive in the kernel;
#  sort=False keeps the columns in order of first appearance)
df_t_all = pd.concat([df_march, df_april, df_may], sort=False)

In [None]:
# all transactions has almost 5 million rows
df_t_all.shape

In [None]:
# build new dataframes for all months: one with the FCS rows, one with the FUR rows
# (boolean-mask selection on the ACTIVITY_TYPE column)
df_t_all_fcs = df_t_all[df_t_all['ACTIVITY_TYPE'].eq('FCS')]
df_t_all_fur = df_t_all[df_t_all['ACTIVITY_TYPE'].eq('FUR')]

In [None]:
# look at sample data for all fcs transactions

pd.set_option('display.max_columns', None)
print(df_t_all_fcs.shape)
df_t_all_fcs.sample(5)

In [None]:
# getting unique supporter_ids for all fcs transactions
df_dd = df_t_all_fcs.drop_duplicates(subset='SUPPORTER_ID')

In [None]:
# This is the number of unique supporters who made a FCS transaction - 22433
df_dd.shape

In [None]:
df_dd.head()

In [None]:
df_dd.shape

In [None]:
df_dd.reset_index(inplace=True)

In [None]:
df_dd.head()

In [None]:
df_dd.drop(columns=['index', 'COUNTRY', 'ACTIVITY_TYPE', 'ACTIVITY_DATETIME', 'ACTIVITY_ID', 'ACTIVITY_STATUS', 'ACTIVITY_DATA_1', 
                   'ACTIVITY_DATA_2', 'ACTIVITY_DATA_3', 'ACTIVITY_DATA_4', 'ACTIVITY_DATA_6', 'ACTIVITY_DATA_7', 'ACTIVITY_DATA_8',
                   'ACTIVITY_DATA_9', 'ACTIVITY_DATA_10', 'ACTIVITY_DATA_26', 'ACTIVITY_DATA_27', 'ACTIVITY_DATA_28', 
                   'ACTIVITY_DATA_29', 'ACTIVITY_DATA_30'], inplace=True)

In [None]:
# work on an independent copy: the feature columns added to df1 below never touch df_dd
df1 = df_dd.copy()

In [None]:
df1.shape

In [None]:
df1.head()

In [None]:
df1.SUPPORTER_ID.nunique()

In [None]:
# rename the raw identifier and currency columns to short, readable names
df1 = df1.rename(
    {'SUPPORTER_ID': 'supp_id', 'ACTIVITY_DATA_5': 'currency'},
    axis='columns',
)

In [None]:
df1.head()

#### FUR or not ?  (1 or 0):

In [None]:
# 'fur' flag: 1 when the supporter's id also appears among the FUR (monthly)
# transactions of any month, otherwise 0
df1['fur'] = df1['supp_id'].isin(df_t_all_fur['SUPPORTER_ID']).astype(int)

In [None]:
df1.shape

In [None]:
df1.head() 

In [None]:
# display number of rows, columns for supporters who made a FCS and FUR transaction
df1.loc[df1['fur'] == 1].shape

#### Number of FCS transactions by unique supporter:

In [None]:

df_temp = df_t_all_fcs.groupby('SUPPORTER_ID').count()

In [None]:
df1 = df1.merge(df_temp[['ACTIVITY_ID']], left_on=df1.supp_id, right_on=df_temp.index)

In [None]:
df1.shape

In [None]:
df1.head()

In [None]:
df1.drop(columns=['key_0'], inplace=True)

In [None]:
df1.head()

In [None]:
# renaming some columns
df1 = df1.rename(columns={'ACTIVITY_ID': 'total_fcs'})

In [None]:
df1.head()

#### Maximum FCS transaction:

In [None]:
# largest ACTIVITY_DATA_26 value per supporter across the FCS transactions,
# as a one-column frame ('max_fcs') indexed by SUPPORTER_ID
df_temp = (
    df_t_all_fcs
    .groupby('SUPPORTER_ID', sort=False)['ACTIVITY_DATA_26']
    .max()
    .to_frame('max_fcs')
)

In [None]:
df_temp.head()

In [None]:
df_temp.shape

In [None]:
# join on supp_id against the SUPPORTER_ID index of df_temp; right_index=True states
# that directly and matches the form used by the left joins further down the notebook
df1 = df1.merge(df_temp, left_on='supp_id', right_index=True)

In [None]:
df1.head()

#### Total transactions:

In [None]:
df_temp = df_t_all.groupby('SUPPORTER_ID').count()

In [None]:
df1 = df1.merge(df_temp[['ACTIVITY_ID']], left_on=df1.supp_id, right_on=df_temp.index)

In [None]:
df1.shape

In [None]:
df1.head()

In [None]:
# drop unnecessary column
df1.drop(columns=['key_0'], inplace=True)

# renaming some columns
df1 = df1.rename(columns={'ACTIVITY_ID': 'total_trans'})

In [None]:
df1.head()

#### Sum of total FCS donations by supporter:

In [None]:
# sum of ACTIVITY_DATA_26 per supporter across the FCS transactions,
# as a one-column frame ('sum_fcs') indexed by SUPPORTER_ID
df_temp = (
    df_t_all_fcs
    .groupby('SUPPORTER_ID', sort=False)['ACTIVITY_DATA_26']
    .sum()
    .to_frame('sum_fcs')
)

In [None]:
# join on supp_id against the SUPPORTER_ID index of df_temp; right_index=True states
# that directly and matches the form used by the left joins further down the notebook
df1 = df1.merge(df_temp, left_on='supp_id', right_index=True)

In [None]:
df1.shape

In [None]:
df1.head()

#### Number of Ps:

In [None]:
# per-supporter count of transactions whose ACTIVITY_STATUS is 'P'.
# Filter and aggregation are chained in one cell so that a re-run always starts from
# df_t_all (re-grouping the already aggregated frame would silently turn every count
# into 1), and only the ACTIVITY_ID column used by the merge below is counted.
df_p = (
    df_t_all
    .loc[df_t_all['ACTIVITY_STATUS'] == 'P']
    .groupby('SUPPORTER_ID', sort=False)[['ACTIVITY_ID']]
    .count()
)

In [None]:
df_p.head()

In [None]:
df1.shape

In [None]:
# left join on supp_id against df_p's index: supporters without any 'P' row are kept
# and get NaN in the new ACTIVITY_ID column
df1 = df1.merge(df_p['ACTIVITY_ID'], how='left', left_on='supp_id', right_index=True)

In [None]:
df1.shape

In [None]:
df1.head()

In [None]:
# the merged count column becomes 'num_P'
df1 = df1.rename({'ACTIVITY_ID': 'num_P'}, axis='columns')

In [None]:
df1.head()

#### Number of 'formsub' transactions:

In [None]:
# per-supporter count of 'formsub' actions (rows where ACTIVITY_DATA_3 == 'formsub').
# Filter and aggregation are chained in one cell so that a re-run always starts from
# df_t_all (re-grouping the already aggregated frame would silently turn every count
# into 1), and only the ACTIVITY_ID column used by the merge below is counted.
df_fs = (
    df_t_all
    .loc[df_t_all['ACTIVITY_DATA_3'] == 'formsub']
    .groupby('SUPPORTER_ID', sort=False)[['ACTIVITY_ID']]
    .count()
)

In [None]:
# left join on supp_id against df_fs's index: supporters without any 'formsub' row
# are kept and get NaN in the new ACTIVITY_ID column
df1 = df1.merge(df_fs['ACTIVITY_ID'], how='left', left_on='supp_id', right_index=True)

In [None]:
df1.shape

In [None]:
df1.head()

In [None]:
# the merged count column becomes 'num_fs'
df1 = df1.rename({'ACTIVITY_ID': 'num_fs'}, axis='columns')

In [None]:
df1.head()

#### Number of clicks:

In [None]:
# per-supporter count of click transactions (rows where ACTIVITY_DATA_2 == 'click').
# Filter and aggregation are chained in one cell so that a re-run always starts from
# df_t_all (re-grouping the already aggregated frame would silently turn every count
# into 1), and only the ACTIVITY_ID column used by the merge below is counted.
df_c = (
    df_t_all
    .loc[df_t_all['ACTIVITY_DATA_2'] == 'click']
    .groupby('SUPPORTER_ID', sort=False)[['ACTIVITY_ID']]
    .count()
)

In [None]:
# left join on supp_id against df_c's index: supporters without any 'click' row
# are kept and get NaN in the new ACTIVITY_ID column
df1 = df1.merge(df_c['ACTIVITY_ID'], how='left', left_on='supp_id', right_index=True)

In [None]:
df1.shape

In [None]:
df1.head()

In [None]:
# the merged count column becomes 'num_c'
df1 = df1.rename({'ACTIVITY_ID': 'num_c'}, axis='columns')

In [None]:
df1.head()

In [None]:
df1.shape

#### A summary of some of the findings:

There are **4935216** total transactions.

There are **1002034** supporters involved in all transactions.

There are **26971** FCS transactions total.

There are **22433** supporters who made a FCS transaction.

There are **188** FUR transactions total.
    - 187 are ACTIVITY_ID 24029 (24029 has no other ACTIVITY_TYPE)
    - 1 is ACTIVITY_ID 15348 (15348 also only has the 1 FUR ACTIVITY_TYPE)

There are **174** supporters who made a FUR transaction.

There are **21** FUR supporters who also made a FCS contribution.  

All FUR transactions are in **GBP** currency.

The following graph shows the number of FUR transactions, number of supporters who made those transactions, and the number of supporters who made both a FCS and FUR transaction.

In [None]:
# bar chart of the three FUR headline counts quoted in the summary above.
# The frame gets its own name so the `df` loaded from sends.csv is not overwritten.
actions = [188, 174, 21]
index = ['FUR_transactions', 'FUR_Unique_supporters', 'FUR_and_FCS_supporters']
fur_counts = pd.DataFrame({'actions': actions}, index=index)
ax = fur_counts.plot.bar(rot=0, figsize=(12, 6))
ax.set_axisbelow(True)  # draw the grid behind the bars
ax.minorticks_on()
ax.grid(which='major', linestyle='-', linewidth=0.5, color='red')
ax.set_xlabel("Transactions")
ax.set_ylabel('Occurrences')
ax.set_title("FUR transactions vs FUR unique supporters vs FUR and FCS supporters");

#### Some additional analysis....

In [None]:
# display the number of unique activity IDs across all transaction data:
df_t_all.ACTIVITY_ID.nunique()

As shown above, the total number of activity IDs for the transaction data is **1144**.  This contrasts significantly with the number of unique activity IDs for the sends broadcast data in March of **43**.

In [None]:
# This is the lone FUR supporter who does not have the 24029 ACTIVITY_ID (this person only has 1 transaction)
df_t_all.loc[df_t_all['ACTIVITY_ID'] == 15348].shape

The cell above checks how many transactions exist for activity ID 15348, the only activity ID other than 24029 that has a FUR transaction.

In [None]:
# show the supporters who made a single (FCS) donation and also a recurring (FUR) donation (fur = 1)
df1.loc[df1['fur'] == 1]

Something to note in the above dataframe:  the total transactions for all of these supporters is a high number.

In [None]:
df1.total_trans.describe()

In [None]:
df1.loc[df1['fur'] == 1].total_trans.describe()

As shown above, the mean number of transactions for all FCS supporters is **14.5**, whereas the mean number of transactions for FCS supporters who also made a recurring donation (FUR) is **21.1**.

# Conclusion - Part 1

I wanted to use the dataframe below to perform machine learning / predictive analytics to try to find correlations between supporters who made FCS donations and those who made FUR donations.

However, there are only **21** supporters who made a FCS and FUR donation (from Cell number 15 above).  This is 21 supporters out of a total of **22433** (from cell 17) who made a FCS transaction.  This seems like too little data to run a ML algorithm.

Regardless, the framework / code is here to add to the dataframe below by creating more columns as needed.  See the dataframe below: 

In [None]:
df1.head(15)

The above dataframe has all unique supporters who made a FCS transaction.  The columns are as follows:

  - supp_id = SUPPORTER_ID who made a FCS transaction
  - total_fcs = number of FCS transactions total
  - fur = if supporter made at least 1 FUR transaction, a 1 is placed, otherwise 0
  - max_fcs = maximum FCS contribution amount in USD
  - sum_fcs = total FCS contribution amount in USD
  - total_trans = total number of transactions made by that supporter
  - currency = currency of donation
  - num_P = number of 'P' transactions by the supporter
  - num_fs = number of 'formsub' transactions by the supporter
  - num_c = number of 'click' transactions by the supporter

# Machine Learning - predict a FUR transaction given all FCS supporter data  

One major caveat:  the FUR 1 or 0 does not consider if the FUR transaction occurred before or after the FCS transaction.

#### Logistic Regression:

In [None]:
# the left joins above leave NaN for supporters with no matching rows;
# every missing value in df1 is replaced by 0 before modelling
df1 = df1.fillna(0)

In [None]:
df1.head()

In [None]:
df1.shape

In [None]:
# find the number of fur's to drop, so that the number of fur's and fcs's are equal
n = df1.shape[0] 

In [None]:
# number of rows where fur equals 1
m = df1.loc[df1['fur'] == 1].shape[0]

In [None]:
# this is the number of rows that needs to be subtracted from df1 so that FURs = FCSs
number = n - m - m

In [None]:
print(number)

In [None]:
# drop the correct amount of fur's so the number of FURs = FCSs.
df1 = df1.drop(df1.query('fur == 0').sample(number).index)

In [None]:
df1.head()

In [None]:
df1.shape

In [None]:
# leave the currency and supporter-id columns out of the model inputs for now
df1 = df1.drop(columns=['currency', 'supp_id'])

In [None]:
# Create training and test sets: the first 80% of the rows train, the rest test.
offset = int(df1.shape[0] * 0.8)

# .copy() gives each split its own data, so removing the target column from the
# splits later does not operate on (and warn about) a slice of df1
df_train = df1.iloc[:offset].copy()
df_test = df1.iloc[offset:].copy()

In [None]:
df_train.shape

In [None]:
df_test.shape

In [None]:
# 1.  Logistic Regression

start_time = datetime.now()

# Instantiate our model. random_state fixes the data shuffling done by the 'sag'
# solver so the fitted coefficients are reproducible.
regr = linear_model.LogisticRegression(solver='sag', random_state=42)

# target vector for the training data
y_train = df_train['fur'].values

# drop the 'target' column to obtain the feature inputs
# (re-assignment instead of inplace=True, which warns when df_train is a slice of df1;
#  later cells rely on df_train.columns holding the feature names only)
df_train = df_train.drop(columns=['fur'])

# scale every row (sample) of the training data to unit L2 norm
x_train = sklearn.preprocessing.normalize(df_train)

# now for test...
y_test = df_test['fur'].values

# drop the 'target' column to obtain the feature inputs
df_test = df_test.drop(columns=['fur'])

# scale every row (sample) of the test data to unit L2 norm
x_test = sklearn.preprocessing.normalize(df_test)

# fit model to training data
regr.fit(x_train, y_train)

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))

In [None]:
# hard 1 or 0 predictions from the model (used by the confusion matrix and accuracy cells)
y_test_pred = regr.predict(x_test)
# ROC AUC ranks samples by a continuous score, so feed it the predicted probability
# of the positive class (fur == 1) rather than the thresholded labels
sklearn.metrics.roc_auc_score(y_test, regr.predict_proba(x_test)[:, 1])

In [None]:
# create the Confusion Matrix
sklearn.metrics.confusion_matrix(y_test, y_test_pred, labels=None, sample_weight=None)

In [None]:
# determine the accuracy classification score
sklearn.metrics.accuracy_score(y_test, y_test_pred)

#### Extra Trees Classifier

In [None]:
# 2.  Extra Trees Classifier

start_time = datetime.now()

# Instantiate our model (ExtraTreesClassifier is imported in the first cell).
# random_state makes the randomized trees reproducible.
etc = ExtraTreesClassifier(n_estimators=1000, random_state=42)

# fit model to data
etc.fit(x_train, y_train)

# NOTE: the confusion matrix for this model is computed in the cells below, after
# etc.predict(); computing it here would still use the previous model's predictions.

end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))

In [None]:
# hard 1 or 0 predictions (used by the confusion matrix and accuracy cells)
y_test_pred = etc.predict(x_test)
# ROC AUC from the predicted probability of the positive class, not the thresholded labels
sklearn.metrics.roc_auc_score(y_test, etc.predict_proba(x_test)[:, 1])

In [None]:
sklearn.metrics.confusion_matrix(y_test, y_test_pred, labels=None, sample_weight=None)

In [None]:
sklearn.metrics.accuracy_score(y_test, y_test_pred)

#### Random Forest Classifier

In [None]:
# 3.  Random Forest Classifier

start_time = datetime.now()

# Instantiate our model (RandomForestClassifier is imported in the first cell).
# random_state makes the forest reproducible.
rfc = RandomForestClassifier(n_estimators=1000, random_state=42)

# fit model to data
rfc.fit(x_train, y_train)

#------------------------------------------------------------------------------
# feature selection: keep at most 7 features whose importance is >= 0.001.
# SelectFromModel.fit trains its own clone of rfc on the same data.
sfm = SelectFromModel(rfc, threshold=0.001, max_features=7)
sfm.fit(x_train, y_train)
selected_feat = df_train.columns[sfm.get_support()]
print(selected_feat)
#------------------------------------------------------------------------------

# importance of each feature in the fitted forest, most important first
feature_importances = pd.DataFrame(
    rfc.feature_importances_,
    index=df_train.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)


end_time = datetime.now()
print('\nDuration: {}'.format(end_time - start_time))

In [None]:
# using 'feature_importances_' from random forest
# these are the most important features and their relative importance for making the prediction
print((feature_importances).head(10))

In [None]:
# hard 1 or 0 predictions (used by the confusion matrix and accuracy cells)
y_test_pred = rfc.predict(x_test)
# ROC AUC from the predicted probability of the positive class, not the thresholded labels
sklearn.metrics.roc_auc_score(y_test, rfc.predict_proba(x_test)[:, 1])

In [None]:
sklearn.metrics.confusion_matrix(y_test, y_test_pred, labels=None, sample_weight=None)

In [None]:
sklearn.metrics.accuracy_score(y_test, y_test_pred)

# Conclusion - Part 2

So, I did end up doing some predictive modeling as shown above despite the sparse amount of supporters who made a single AND recurring contribution.  
The results can vary significantly because each time the models are executed they use only a small, randomly chosen subset of the total data.  However, the accuracy measures have been in the range of **55 - 92%**.  This is just a starting point.  Two different scores were analyzed:  the area under the ROC curve and the accuracy classification score based on the confusion matrix.  

This accuracy represents the ability of each model to predict whether a supporter will make a recurring (FUR) donation based on the 7 features (columns in the final df1 dataframe) that were collected from the data.