## Setup

### Install Apache Beam

This notebook is intended to run on [Google Colab](https://colab.research.google.com/?utm_source=scs-index).

In [1]:
!{'pip install --quiet apache-beam'}
!{'mkdir -p data'}

     |████████████████████████████████| 10.1 MB 5.3 MB/s 
     |████████████████████████████████| 151 kB 26.5 MB/s 
     |████████████████████████████████| 255 kB 62.5 MB/s 
     |████████████████████████████████| 2.3 MB 57.1 MB/s 
     |████████████████████████████████| 45 kB 2.9 MB/s 
     |████████████████████████████████| 63 kB 1.3 MB/s 
     |████████████████████████████████| 508 kB 46.0 MB/s 
     |████████████████████████████████| 1.1 MB 55.2 MB/s 
  Building wheel for dill (setup.py) ... done
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.8.0 requires tf-estimator-nightly==2.8.0.dev2021122109, which is not installed.
multiprocess 0.70.12.2 requires dill>=0.3.4, but you have dill 0.3.1.1 which is incompatible.
gym 0.17.3 requires cloudpickle<1.7.0,>=1.2.0, but you have cloudpickle 2.0.0 which is incompatible.

### Load Data from Google Drive

In [2]:
# Mount Google Drive at /content/drive. The later cells read the bank data
# through relative paths under drive/MyDrive, so this presumably relies on the
# notebook's working directory being /content -- TODO confirm.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Sanity check: list the bank data folder on the mounted Drive to confirm the
# input files read by the pipeline (cards.txt, loan.txt) are present.
!ls "drive/MyDrive/Colab Notebooks/data/bank"

Bank_requirements.txt  cards.txt  defaulters  loan.txt


## Pipeline

In [51]:
import apache_beam as beam
from datetime import datetime

# Single shared pipeline object: every branch defined below (cards, medical
# loans, personal loans) starts from `p`, and the whole graph is executed by
# the `p.run()` call at the end of this cell.
p = beam.Pipeline()

def calculate_points(element):
  """Score one credit-card record for defaulting behaviour.

  Args:
    element: One comma-separated line with exactly ten fields, in the order
      customer id, first name, last name, relationship id, card type,
      max limit, amount spent, cash withdrawn, payment cleared, payment date.

  Returns:
    A ``(key, points)`` tuple where ``key`` is ``'<id>, <first> <last>'`` and
    ``points`` is the number of rules (0-3) the record violates:
      * less than 70% of the spend was cleared;
      * the limit was reached and the spend was not fully cleared;
      * the limit was reached and less than 70% of the spend was cleared.

  Raises:
    ValueError: If the line does not have ten fields or an amount is not an
      integer.
  """
  (customer_id, first_name, last_name, _relationship_id, _card_type,
   limit_text, spent_text, _cash_withdrawn, cleared_text,
   _payment_date) = element.split(',')

  amount_spent = int(spent_text)
  amount_cleared = int(cleared_text)
  card_limit = int(limit_text)

  paid_under_70_percent = amount_cleared < (amount_spent * 0.7)
  limit_reached = amount_spent >= card_limit

  violations = (
      paid_under_70_percent,
      limit_reached and amount_cleared < amount_spent,
      limit_reached and paid_under_70_percent,
  )
  points = sum(1 for violated in violations if violated)

  return f"{customer_id}, {first_name} {last_name}", points

def format_result(sum_pair):
  """Render a ``(key, points)`` pair as ``'<key>, <points> default_points'``."""
  customer, total_points = sum_pair
  return f"{customer!s}, {total_points!s} default_points"

def calculate_late_payment(elements):
  """Flag whether a loan instalment was paid after its due date.

  Args:
    elements: Sequence of string fields for one loan row. Index 6 holds the
      due date and index 8 the payment date, both formatted ``dd-mm-YYYY``
      (surrounding whitespace is ignored).

  Returns:
    A new list holding the original fields followed by ``'1'`` when the
    payment date is later than the due date, otherwise ``'0'``.

  Raises:
    ValueError: If either date does not match ``dd-mm-YYYY``.
    IndexError: If the row has fewer than nine fields.
  """
  date_format = '%d-%m-%Y'
  due_date = datetime.strptime(elements[6].strip(), date_format)
  payment_date = datetime.strptime(elements[8].strip(), date_format)

  late_flag = '0' if payment_date <= due_date else '1'

  # Build a new list instead of appending in place: Beam user functions must
  # not modify the element they receive.
  return list(elements) + [late_flag]

def format_output(sum_pair):
  """Render a ``(key, count)`` pair as ``'<key>, <count> missed'``."""
  customer, missed_count = sum_pair
  return f"{customer!s}, {missed_count!s} missed"

def calculate_month(input_list):
  """Append the month number of the payment date to a loan row.

  Args:
    input_list: Sequence of string fields for one loan row. Index 8 holds the
      payment date formatted ``dd-mm-YYYY`` (surrounding whitespace is
      ignored).

  Returns:
    A new list holding the original fields followed by the payment month as a
    string (``'1'`` .. ``'12'``).

  Raises:
    ValueError: If the payment date does not match ``dd-mm-YYYY``.
    IndexError: If the row has fewer than nine fields.
  """
  payment_date = datetime.strptime(input_list[8].strip(), '%d-%m-%Y')

  # Return a fresh list rather than appending in place: Beam user functions
  # must not modify the element they receive.
  return list(input_list) + [str(payment_date.month)]

def calculate_personal_loan_defaulter(input):
  """Decide whether a customer defaulted on a personal loan within one year.

  Args:
    input: A ``(name, months)`` pair where ``months`` is any iterable of the
      month numbers (1-12) in which an instalment was paid. The parameter name
      shadows the built-in ``input`` and is kept only for compatibility.

  Returns:
    ``(name, n)`` where ``n`` is
      * the number of missed payments (12 minus the number of payments) when
        that exceeds 4; otherwise
      * the longest run of consecutive unpaid months when that exceeds 2;
        otherwise
      * ``0`` (not a defaulter).
  """
  months_in_year = 12
  max_allowed_missed_months = 4
  max_allowed_consecutive_missing = 2

  name, months_list = input

  # sorted() accepts any iterable and leaves the grouped values untouched;
  # GroupByKey only promises an iterable, not a mutable list.
  sorted_months = sorted(months_list)

  # NOTE(review): every payment is counted, so two payments in the same month
  # count twice -- confirm the data has at most one payment per month.
  missed_payments = months_in_year - len(sorted_months)

  if missed_payments > max_allowed_missed_months:
    return name, missed_payments

  # Sentinels 0 and 13 turn the gap before the first payment and the gap after
  # the last payment into ordinary "between neighbours" gaps.
  padded_months = [0] + sorted_months + [months_in_year + 1]
  longest_gap = max(
      later - earlier - 1
      for earlier, later in zip(padded_months, padded_months[1:])
  )
  consecutive_missed_months = max(0, longest_gap)

  if consecutive_missed_months > max_allowed_consecutive_missing:
    return name, consecutive_missed_months

  return name, 0

def return_tuple(element):
  """Split a comma-separated line into ``(first_field, [remaining_fields])``.

  Fields are not stripped, so text following ``', '`` keeps its leading space.
  """
  key, *remaining_fields = element.split(',')
  return key, remaining_fields

# Credit-card branch: score every card record, total the points per customer,
# keep customers with at least one point, then re-key the result by customer id.
card_defaulter = (
    p
    | "Read credit card data"
    >> beam.io.ReadFromText(
        "drive/MyDrive/Colab Notebooks/data/bank/cards.txt",
        skip_header_lines=1,
    )
    # text line -> ('<id>, <first> <last>', points)
    | "Calculate defaulter points" >> beam.Map(calculate_points)
    | "Combine points for defaulters" >> beam.CombinePerKey(sum)
    | "filter card defaulters" >> beam.Filter(lambda scored: scored[1] > 0)
    # -> '<id>, <first> <last>, <points> default_points'
    | "Format output" >> beam.Map(format_result)
    # Disabled sink for the formatted lines:
    #| "Write credit card data" >> beam.io.WriteToText("outputs/card_skippers")
    # -> ('<id>', [' <first> <last>', ' <points> default_points'])
    | "Tuple Card" >> beam.Map(return_tuple)
)

# Medical-loan branch: flag each medical-loan instalment as late ('1') or on
# time ('0'), total the late flags per customer and keep customers with three
# or more late payments.
medical_loan_defaulter = (
    p
    | "Read medical loan data"
    >> beam.io.ReadFromText(
        "drive/MyDrive/Colab Notebooks/data/bank/loan.txt",
        skip_header_lines=1,
    )
    | "Split Row" >> beam.Map(lambda line: line.split(','))
    | "Filter medical loan"
    >> beam.Filter(lambda fields: fields[5].strip() == 'Medical Loan')
    # appends the late flag as field 9
    | "Calculate late payment" >> beam.Map(calculate_late_payment)
    # -> ('<id>, <first> <last>', 0 or 1)
    | "Make key value pairs"
    >> beam.Map(
        lambda fields: (
            f"{fields[0]}, {fields[1]} {fields[2]}",
            int(fields[9]),
        )
    )
    | "Group medical loan based on month" >> beam.CombinePerKey(sum)
    | "Check for medical loan defaulter"
    >> beam.Filter(lambda late_total: late_total[1] >= 3)
    # -> '<id>, <first> <last>, <count> missed'
    | "Format medical loan output" >> beam.Map(format_output)
)

# Personal-loan branch: collect the months in which each customer paid a
# personal-loan instalment and keep customers whose payment pattern makes them
# defaulters. The sample values above each step show the element shape it emits.
personal_loan_defaulter = (
    p
    # CT88330,Humberto,Banks,Serviceman,LN_1559,Medical Loan,26-01-2018, 2000, 30-01-2018
    | "Read personal loan data"
    >> beam.io.ReadFromText(
        "drive/MyDrive/Colab Notebooks/data/bank/loan.txt",
        skip_header_lines=1,
    )
    # ['CT88330', 'Humberto', 'Banks', 'Serviceman', 'LN_1559', 'Medical Loan', '26-01-2018', ' 2000', ' 30-01-2018']
    | "Split" >> beam.Map(lambda line: line.split(','))
    # ['CT68554', 'Ronald', 'Chiki', 'Serviceman', 'LN_8460', 'Personal Loan', '25-01-2018', ' 50000', ' 25-01-2018']
    | "Filter personal loan"
    >> beam.Filter(lambda fields: fields[5].strip() == 'Personal Loan')
    # ['CT68554', 'Ronald', 'Chiki', 'Serviceman', 'LN_8460', 'Personal Loan', '25-01-2018', ' 50000', ' 25-01-2018', '1']
    | "split and Append New Month column" >> beam.Map(calculate_month)
    # ('CT68554, Ronald Chiki', 1)
    | "Make key value pairs loan"
    >> beam.Map(
        lambda fields: (
            f"{fields[0]}, {fields[1]} {fields[2]}",
            int(fields[9]),
        )
    )
    # ('CT68554, Ronald Chiki', [1, 5, 6, 7, 8, 9, 10, 11, 12])
    | "Group for personal loan defaulter" >> beam.GroupByKey()
    # ('CT68554, Ronald Chiki', 3)
    | "Check for personal loan defaulter"
    >> beam.Map(calculate_personal_loan_defaulter)
    # ('CT68554, Ronald Chiki', 3) -- non-defaulters (value 0) are dropped
    | "Filter only personal loan defaulters"
    >> beam.Filter(lambda verdict: verdict[1] > 0)
    # CT68554, Ronald Chiki, 3 missed
    | "Format personal loan ouput" >> beam.Map(format_output)
)

# Merge the personal-loan and medical-loan defaulter lines into one collection,
# then split each '<id>, <first> <last>, <n> missed' line so the customer id
# becomes the key: ('<id>', [' <first> <last>', ' <n> missed']).
final_loan_defaulter = (
    (personal_loan_defaulter, medical_loan_defaulter)
    | "Combine all defaulters" >> beam.Flatten()
    # Disabled sink for the merged, still-formatted lines:
    #| "Write all defaultes to text file" >> beam.io.WriteToText('outputs/loan_defaulters')
    | "Tuple Loan" >> beam.Map(return_tuple)
)

# Join the card and loan results on customer id and write one line per id to
# text files prefixed "outputs/both". Each written element looks like
# ('<id>', {'card_defaulter': [...], 'loan_defaulter': [...]}).
# NOTE(review): the recorded output contains ids whose 'loan_defaulter' list is
# empty, so ids present in only one input are written too -- add a filter if
# only customers appearing in both lists are wanted.
both_defaulter = (
    {'card_defaulter': card_defaulter, 'loan_defaulter': final_loan_defaulter}
    | beam.CoGroupByKey()
    | "Write p3 results" >> beam.io.WriteToText("outputs/both")
)

p.run()

#!{'head -n 5 outputs/card_skippers-00000-of-00001'}
#!{'head -n 5 outputs/loan_defaulters-00000-of-00001'}
!{'head -n 5 outputs/both-00000-of-00001'}



('CT28383', {'card_defaulter': [[' Miyako Burns', ' 3 default_points']], 'loan_defaulter': []})
('CT74474', {'card_defaulter': [[' Nanaho Brennan', ' 3 default_points']], 'loan_defaulter': [[' Nanaho Brennan', ' 5 missed']]})
('CT66322', {'card_defaulter': [[' Chris Bruce', ' 1 default_points']], 'loan_defaulter': [[' Chris Bruce', ' 8 missed']]})
('CT65528', {'card_defaulter': [[' Bonnie Barlow', ' 2 default_points']], 'loan_defaulter': []})
('CT84463', {'card_defaulter': [[' Isaac Bowman', ' 4 default_points']], 'loan_defaulter': [[' Isaac Bowman', ' 3 missed']]})


## Loan Defaulters