In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# load in the tsv
data = pd.read_csv('../df_cleaned.tsv', sep='\t', header=0)

In [3]:
# select the necessary columns
job_amount = data[['major_categories', 'loan_amnt']]

In [4]:
# drop major_categories = 0, where the model was not able to identify the category 
job_amount.drop(job_amount[job_amount['major_categories'] == 0].index, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [5]:
# subtract 1 from every value in major_categories to match the sankey representation
job_amount['major_categories'] = job_amount['major_categories'] - 1 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  job_amount['major_categories'] = job_amount['major_categories'] - 1


In [6]:
# document the labels with their index (for clarity only)
labels_index = {
    'Managers': 0,
    'Professionals': 1,
    'Technicial and Associate Professionals': 2,
    'Clerical Support Workers': 3,
    'Service/Sale Workers and Customer Service': 4,
    'Craft and Related Trade Workers': 5,
    'Plant/Machine Operators and Assemblers': 6,
    'Elementary Occupations': 7,
    'Armed Forces and Corrections/Police/Security Occupations': 8,   
    'Loan Amount Above Median': 9,
    'Loan Amount Below Median': 10
}

labels = list(labels_index.keys())

In [7]:
# create the lists to be used in the sankey diagram

# source: add two values of each labor category, one will connect to loan_amount > median and the other for loan_amount <= median
source = [x for x in range(9)] * 2
source.sort()

# let 9 represent loan_amount > median, 10 represent loan_amount <= median
# since there are two flows from each labor category, and their indexes are assigned sequentially
target = [9, 10] * 9

# for each labor category, find the number of columns with for > and <= and append them to create the width of each flow
value = []
MEDIAN = 13500

for category in range(9):
    # select all records with this labor category
    loans_in_category = job_amount[job_amount['major_categories'] == category]
    
    # from all the loans of this category, find the number of loans above the median
    loans_above_median = loans_in_category[loans_in_category['loan_amnt'] > MEDIAN]
    
    # calculate and append the values
    total = len(loans_in_category)
    above = len(loans_above_median)
    below = total - above
    
    value.append(above)
    value.append(below) 

In [11]:
fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = labels,
      #color = "blue"
    ),
    link = dict(
      source = source,
      target = target,
      value = value
  ))])

fig.update_layout(title_text="Loan Amount Distribution by Labor Type", font_size=10)
fig.show()