In [None]:
import itertools

import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.utils import resample



In [None]:
dataframe = pd.read_csv('big_data_added_columns.csv')

In [None]:
def convert_tuple_to_string(tup):
    """Join the items of ``tup`` into one underscore-separated string.

    Parameters
    ----------
    tup : iterable of str
        Ordered product names, e.g. ``('invest', 'relay')``.

    Returns
    -------
    str
        The items joined with ``'_'`` (``'invest_relay'``). An empty
        iterable yields ``''``.

    Raises
    ------
    TypeError
        If any item is not a string.
    """
    # str.join does the separator bookkeeping and avoids a local variable
    # that would shadow the built-in `str`.
    return '_'.join(tup)
    


In [None]:
products = ['relay', 'money', 'invest', 'loan', 'credit']

# Every ordered selection of distinct products, longest first: all 5-product
# orderings, then the 4-product ones, and so on down to single products.
product_permutations = [
    permutation
    for length in range(len(products), 0, -1)
    for permutation in itertools.permutations(products, length)
]

# One row per permutation with the columns lvl1..lvl5 and count.
# lvlN holds the first N products of the permutation joined with '_'
# (e.g. lvl2 = 'invest_relay'); levels deeper than the permutation's own
# length are left missing (NaN).
level_columns = [f'lvl{depth}' for depth in range(1, len(products) + 1)]
permutation_rows = [
    {
        f'lvl{depth}': convert_tuple_to_string(permutation[:depth])
        for depth in range(1, len(permutation) + 1)
    }
    for permutation in product_permutations
]
# Build the frame in a single call rather than enlarging it one cell at a
# time with .loc, which re-allocates on every new row.
product_permutations_df = pd.DataFrame(permutation_rows, columns=level_columns)

# Every permutation starts with a count of zero; later cells fill it in.
product_permutations_df['count'] = 0

product_permutations_df

In [None]:
# For every row, pick the label of the deepest level that is filled in:
# lvl5 when present, otherwise lvl4, then lvl3, lvl2 and finally lvl1.
deepest_label = product_permutations_df['lvl5']
for fallback_level in ('lvl4', 'lvl3', 'lvl2', 'lvl1'):
    deepest_label = deepest_label.fillna(product_permutations_df[fallback_level])
product_permutations_df['last_populated_column'] = deepest_label

# Lookup table: full permutation label -> count.
counts_by_label = product_permutations_df.set_index('last_populated_column')['count']
last_populated_columns_dict = counts_by_label.to_dict()

In [None]:
# Tally, for every user, the order in which they picked up the products.
# A product counts as used when its uses_<product> flag is True; used products
# are ordered by days_with_<product>, most days first (the product held the
# longest is the one the user started with).
# For example, a user with data like this:
#   days_with_relay: 100
#   days_with_money: NaN
#   days_with_invest: 300
#   days_with_loan: NaN
#   days_with_credit: NaN
# gets the permutation 'invest_relay': invest first, then relay.

# Own local list so the cell does not overwrite the module-level `products`.
# The order matters: it is the tie-break order when two products have the
# same number of days (the sort below is stable).
tracked_products = ['relay', 'money', 'invest', 'loan', 'credit']

# Start every tally from zero so that re-running this cell does not add the
# same users on top of a previous run.
last_populated_columns_dict = dict.fromkeys(last_populated_columns_dict, 0)

# Pull each column out once and walk the rows as plain tuples; this avoids a
# per-cell .loc lookup (and any assumption about the frame's index labels),
# which makes the pass fast enough not to need a progress bar.
uses_by_user = zip(*(dataframe[f'uses_{name}'].tolist() for name in tracked_products))
days_by_user = zip(*(dataframe[f'days_with_{name}'].tolist() for name in tracked_products))

for user_flags, user_days in zip(uses_by_user, days_by_user):
    # `== True` is deliberate: a missing flag (NaN) is truthy but must not
    # count as "uses the product".
    owned = [position for position, flag in enumerate(user_flags) if flag == True]  # noqa: E712
    if not owned:
        # Users without any product belong to no permutation.
        continue

    # NOTE(review): a used product whose days_with_* value is NaN makes this
    # ordering ill-defined - confirm the data never contains that combination.
    owned.sort(key=lambda position: user_days[position], reverse=True)

    product_permutation = convert_tuple_to_string(
        tuple(tracked_products[position] for position in owned)
    )
    last_populated_columns_dict[product_permutation] += 1


In [None]:
# Copy each tallied count back onto its permutation row, looked up through the
# row's deepest populated label.
row_labels = product_permutations_df['last_populated_column']
product_permutations_df['count'] = [
    last_populated_columns_dict[label] for label in row_labels
]

# The helper label column is no longer needed once the counts are in place.
product_permutations_df = product_permutations_df.drop(columns=['last_populated_column'])


In [None]:
# Add a constant root level so every permutation hangs off a single 'all'
# node, and move it in front of the product levels.
# lvl5 must be listed here: reindex silently drops any column that is not
# named, which would remove the fifth product level from the table.
product_permutations_df = (
    product_permutations_df
    .assign(lvl0='all')
    .reindex(columns=['lvl0', 'lvl1', 'lvl2', 'lvl3', 'lvl4', 'lvl5', 'count'])
)

In [None]:
# Persist the permutation table (levels and counts) as CSV in the working directory.
permutations_csv_path = 'product_permutations.csv'
product_permutations_df.to_csv(permutations_csv_path)


In [None]:
def genSankey(df, cat_cols=(), value_cols='', title='Sankey Diagram'):
    """Build a figure dict describing a Sankey diagram.

    Each column in ``cat_cols`` is one level of the diagram. Links connect
    every pair of adjacent levels and are weighted by the summed
    ``value_cols`` of the rows that share that (source, target) pair.
    Missing values in a level column produce neither a node nor a link.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one column per level plus the weight column.
    cat_cols : sequence of str
        Level columns, ordered left to right. At least two are required.
    value_cols : str
        Name of the single column holding the link weights.
    title : str
        Figure title.

    Returns
    -------
    dict
        ``{'data': [sankey_trace], 'layout': layout}`` where both entries are
        plain dicts. Node labels appear in order of first appearance, level
        by level; link source/target are positions in that label list.

    Raises
    ------
    ValueError
        If fewer than two level columns are given (no link can be formed).
    """
    cat_cols = list(cat_cols)
    if len(cat_cols) < 2:
        raise ValueError('genSankey needs at least two cat_cols to form source-target links')

    # One colour per level; with more than five levels the palette repeats.
    colorPalette = ['#4B8BBE', '#306998', '#FFE873', '#FFD43B', '#646464']

    # Node label -> colour, filled level by level. Keeping label and colour in
    # one insertion-ordered dict guarantees the two lists stay the same length
    # and aligned even when a label occurs in more than one level (it keeps
    # the colour of the first level it appears in). Missing values are skipped
    # so they never become a node.
    nodeColors = {}
    for levelIdx, catCol in enumerate(cat_cols):
        levelColor = colorPalette[levelIdx % len(colorPalette)]
        for label in df[catCol].dropna().unique():
            nodeColors.setdefault(label, levelColor)
    labelList = list(nodeColors)
    colorList = list(nodeColors.values())
    labelIndex = {label: position for position, label in enumerate(labelList)}

    # Stack the (source, target, count) triples of every adjacent level pair,
    # then sum the counts per pair. groupby drops pairs with a missing source
    # or target, i.e. rows that do not reach the deeper level.
    pairFrames = []
    for sourceCol, targetCol in zip(cat_cols[:-1], cat_cols[1:]):
        pairDf = df[[sourceCol, targetCol, value_cols]].copy()
        pairDf.columns = ['source', 'target', 'count']
        pairFrames.append(pairDf)
    sourceTargetDf = (
        pd.concat(pairFrames)
        .groupby(['source', 'target'])
        .agg({'count': 'sum'})
        .reset_index()
    )

    # Translate labels into node positions with a dict lookup instead of a
    # linear list search per row.
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].map(labelIndex)
    sourceTargetDf['targetID'] = sourceTargetDf['target'].map(labelIndex)

    # creating the sankey diagram
    data = dict(
        type='sankey',
        node=dict(
            pad=15,
            thickness=20,
            line=dict(
                color="black",
                width=0.5
            ),
            label=labelList,
            color=colorList
        ),
        link=dict(
            source=sourceTargetDf['sourceID'],
            target=sourceTargetDf['targetID'],
            value=sourceTargetDf['count']
        )
    )

    layout = dict(
        title=title,
        font=dict(
            size=10
        )
    )

    fig = dict(data=[data], layout=layout)
    return fig

In [None]:
# Draw the root plus the first three product levels as a Sankey diagram.
# NOTE(review): `plotly` is not imported in the cells shown here - confirm
# that `import plotly.offline` has run before this cell.
sankey_levels = ['lvl0', 'lvl1', 'lvl2', 'lvl3']
fig = genSankey(
    product_permutations_df,
    cat_cols=sankey_levels,
    value_cols='count',
    title='xbuy visualized',
)
plotly.offline.plot(fig, validate=False)