# Data Cleaning & Feature Engineering
## Converting customer records to a time series dataset

Customer record data stores static customer attributes such as age, open date, and closed date. This notebook shows how these attributes can be converted into dynamic time series features for machine learning and deep learning.

In [2]:
import numpy as np
import pandas as pd
from pandas import read_csv
import datetime as dt
from pandas.core.common import flatten

In [None]:
# Calculate customers' tenure in months
# A function calculating the time gap between 2 dates in terms of months.
def months(d1, d2):
    """Return how many calendar months d1 lies after d2.

    Only the ``year`` and ``month`` attributes of the two arguments are
    used; the day of month is ignored.  The result is negative when d1
    falls in an earlier month than d2.
    """
    year_gap = d1.year - d2.year
    month_gap = d1.month - d2.month
    return 12 * year_gap + month_gap

# Tenure in months for every account: accounts whose status is 'Open' are
# measured up to the end of the transaction data, all others up to the date
# stored in column 3 (presumably the close date -- confirm against the schema).
d2 = dt.datetime(2019, 7, 31)  # last day of the transaction data
diff = []
for account_state, opened_on, closed_on in zip(S1.iloc[:, 1], S1.iloc[:, 2], S1.iloc[:, 3]):
    # The close-date cell is only parsed for accounts that are not open.
    tenure_end = d2 if account_state == 'Open' else pd.to_datetime(closed_on)
    diff.append(months(tenure_end, pd.to_datetime(opened_on)))
S1['TenureMonth'] = diff


In [None]:
# Build monthly time slots from each account's open date to its close date (closed accounts)
# or to July 2019 (open accounts). Each customer-month is recorded in one row, so the number
# of rows for each customer equals that customer's tenure in months.

def last_day_of_month(date):
    """Return *date* moved to the last day of its own month.

    Works on any object offering ``month`` and ``replace`` in the style of
    ``datetime.date`` / ``datetime.datetime``; any time-of-day component
    is carried over unchanged.
    """
    if date.month != 12:
        # Jump to the first day of the following month, then step back one day.
        first_of_next_month = date.replace(month=date.month + 1, day=1)
        return first_of_next_month - dt.timedelta(days=1)
    # December has no "next month" within the same year; it always has 31 days.
    return date.replace(day=31)

# One "mm/yy" label per month-start between the end of the opening month and
# either 2019-07-01 (status 'Open') or the date in column 3 (otherwise).
timeslot = []
end = pd.to_datetime('2019-07-01')
for account_state, opened_on, closed_on in zip(S2.iloc[:, 1], S2.iloc[:, 2], S2.iloc[:, 3]):
    start = last_day_of_month(pd.to_datetime(opened_on))  # open date
    # The close-date cell is only parsed for accounts that are not open.
    stop = end if account_state == 'Open' else pd.to_datetime(closed_on)
    b = pd.date_range(start, stop, freq='MS').strftime("%m/%y").tolist()
    # Labels are plain strings, so extending keeps the list flat.
    timeslot.extend(b)


In [None]:
# Filling in each customer's time slots.
#Repeating Memberid, NoofSavingsProducts, CountofLoans (these features do not change over time)
def fill_series(df, col_idx, tenure_month_col):
    """Repeat one column's value per row and return the result as a flat list.

    For every row of *df*, the value at positional column *col_idx* is
    repeated as many times as the count stored at positional column
    *tenure_month_col*.  The per-row repetitions are concatenated in row
    order.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per customer.
    col_idx : int
        Positional index of the column whose value is repeated.
    tenure_month_col : int
        Positional index of the column holding the repeat count.

    Returns
    -------
    list
        All repeated values, flattened into a single list.
    """
    repeated_per_row = [
        np.repeat(df.iloc[row, col_idx], df.iloc[row, tenure_month_col])
        for row in range(len(df))
    ]
    return list(flatten(repeated_per_row))

# Columns 0, 6 and 7 are each repeated according to the count in column 8.
id_list, savings_list, loan_list = (
    fill_series(S2, static_col, 8) for static_col in (0, 6, 7)
)

In [None]:
# Creating customer status (0-open, 1-prechurn, 2-churn), reflecting change over time.
# Accounts with status 'Open' are labelled 0 for every month.  Every other
# account ends with the labels 1, 2 in its final two months and is 0 before.
status = []
for i in range(len(S2)):
    n_steps = S2.iloc[i, 8]
    if S2.iloc[i, 1] == 'Open':
        s = np.repeat('0', n_steps)
    else:
        # Tenures shorter than two months cannot hold both closing labels:
        # keep only the last n_steps of [1, 2] so the number of status rows
        # always equals n_steps and np.repeat never gets a negative count
        # (which would raise ValueError).
        closing_labels = np.array([1, 2])[max(2 - n_steps, 0):]
        leading_open = np.repeat('0', max(n_steps - 2, 0))
        s = np.concatenate((leading_open, closing_labels))
    status.append(s)
status = list(flatten(status))

In [None]:
# Filling in age: the value in column 4 is used for the first 12 months and
# is increased by 1 for every further completed block of 12 months.
# The previous hand-written branches stopped at +2, so any tenure longer than
# 36 months kept the same age for all remaining rows; integer division of the
# month index by 12 gives the same values up to 36 months and keeps counting
# afterwards.
age = []
for i in range(len(S2)):
    n_steps = S2.iloc[i, 8]
    starting_age = S2.iloc[i, 4]
    # Month indices 0..n_steps-1 -> completed years 0, 0, ..., 1, 1, ..., 2, ...
    s = starting_age + np.arange(n_steps) // 12
    age.append(s)
age = list(flatten(age))


In [None]:
# Running tenure counter: each customer contributes 1, 2, ..., n where n is
# the count stored in column 8.
tenure = []
for n_months in S2.iloc[:, 8]:
    tenure.extend(range(1, n_months + 1))

In [None]:
# Combining all features to form a time series dataset.
# Each output column name is paired with the flat list that fills it; the
# lists are stacked side by side as the columns of one 2-D array.
feature_columns = [
    ('Memberid', id_list),
    ('Date', timeslot),
    ('Status', status),
    ('Age', age),
    ('TenureMonth', tenure),
    ('NoOfSavingsP', savings_list),
    ('NoOfLoans', loan_list),
]
S3 = pd.DataFrame(
    np.column_stack([values for _, values in feature_columns]),
    columns=[name for name, _ in feature_columns],
)