In [49]:
import pandas as pd
import matplotlib.ticker as ticker
from scipy.signal import find_peaks
from statsmodels.tsa.stattools import acf
from datetime import timedelta
import numpy as np

In [12]:
inflows = pd.read_parquet('../../data/ucsd-inflows.pqt')

In [28]:
inflows.category = inflows.category.apply(lambda x: 'PAYCHECK' if x == 'PAYCHECK_PLACEHOLDER' else x)

In [29]:
inflows

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,PAYCHECK,2477.02,2022-03-18,PAYCHECK
1,0,acc_0,EXTERNAL_TRANSFER,100.00,2022-10-25,EXTERNAL_TRANSFER
2,0,acc_0,MISCELLANEOUS,6.29,2022-08-26,MISCELLANEOUS
3,0,acc_0,EXTERNAL_TRANSFER,277.00,2022-06-03,EXTERNAL_TRANSFER
4,0,acc_0,EXTERNAL_TRANSFER,100.00,2022-07-29,EXTERNAL_TRANSFER
...,...,...,...,...,...,...
513110,5941,acc_9524,EXTERNAL_TRANSFER,8.66,2023-01-21,EXTERNAL_TRANSFER
513111,5941,acc_9524,EXTERNAL_TRANSFER,267.13,2023-01-23,EXTERNAL_TRANSFER
513112,5941,acc_9524,EXTERNAL_TRANSFER,2.00,2023-01-24,EXTERNAL_TRANSFER
513113,5941,acc_9524,EXTERNAL_TRANSFER,207.16,2023-01-24,EXTERNAL_TRANSFER


In [30]:
#Transactions per Consumer
transactions_per_consumer = inflows.groupby('prism_consumer_id').size().reset_index(name='transactions_count')
transactions_per_consumer

Unnamed: 0,prism_consumer_id,transactions_count
0,0,91
1,2,113
2,4,152
3,7,129
4,9,375
...,...,...
2969,5930,210
2970,5935,83
2971,5939,91
2972,5940,440


In [31]:
# Sum of Inflows per Consumer
inflows_per_consumer = inflows[inflows['amount'] > 0].groupby('prism_consumer_id')['amount'].sum().reset_index(name='total_inflows')
inflows_per_consumer

Unnamed: 0,prism_consumer_id,total_inflows
0,0,110312.43
1,2,349639.03
2,4,462557.90
3,7,250214.09
4,9,215342.01
...,...,...
2969,5930,127177.11
2970,5935,13640.13
2971,5939,33882.65
2972,5940,67452.46


In [32]:
# Number of Unique Accounts per Consumer
unique_accounts_per_consumer = inflows.groupby('prism_consumer_id')['prism_account_id'].nunique().reset_index(name='unique_accounts')
unique_accounts_per_consumer

Unnamed: 0,prism_consumer_id,unique_accounts
0,0,2
1,2,1
2,4,1
3,7,2
4,9,7
...,...,...
2969,5930,1
2970,5935,1
2971,5939,1
2972,5940,1


In [33]:
# Merging all consumer-level statistics
consumer_stats = transactions_per_consumer.merge(inflows_per_consumer, on='prism_consumer_id')
consumer_stats = consumer_stats.merge(unique_accounts_per_consumer, on='prism_consumer_id')
consumer_stats

Unnamed: 0,prism_consumer_id,transactions_count,total_inflows,unique_accounts
0,0,91,110312.43,2
1,2,113,349639.03,1
2,4,152,462557.90,1
3,7,129,250214.09,2
4,9,375,215342.01,7
...,...,...,...,...
2969,5930,210,127177.11,1
2970,5935,83,13640.13,1
2971,5939,91,33882.65,1
2972,5940,440,67452.46,1


In [34]:
# Major Sources of Income
# Group by category and calculate total inflows and transaction counts
major_income_sources = (
    inflows[inflows['amount'] > 0]
    .groupby('category')
    .agg(total_inflows=('amount', 'sum'), transaction_count=('amount', 'size'))
    .reset_index()
    .sort_values(by='total_inflows', ascending=False)
)

major_income_sources

Unnamed: 0,category,total_inflows,transaction_count
7,PAYCHECK,89724682.02,59225
1,EXTERNAL_TRANSFER,81586232.14,156533
0,DEPOSIT,66602343.6,61345
5,MISCELLANEOUS,52594312.59,55648
9,SELF_TRANSFER,48301522.92,110437
6,OTHER_BENEFITS,12175837.46,7708
3,INVESTMENT_INCOME,6887278.35,17325
11,TAX,5619601.78,3405
8,REFUND,5406440.27,23220
4,LOAN,4231710.73,2513


In [20]:
plt.figure(figsize=(10, 6))
sns.barplot(
    data=major_income_sources,
    x='total_inflows', 
    y='category',       
    hue ='category'      
)

plt.title('Major Sources of Income by Category', fontsize=16)
plt.xlabel('Total Inflows ($)', fontsize=12)
plt.ylabel('Category', fontsize=12)

ax = plt.gca()
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()


NameError: name 'plt' is not defined

In [43]:
from collections import defaultdict

def analyze_recurring_inflows(data, consumer_id, category=None, tolerance=0.2, frequency_tolerances=None):
    """
    Identify recurring inflows for a specific consumer based on timing and amount.

    Parameters:
    - data (pd.DataFrame): A dataset with columns ['prism_consumer_id', 'amount', 'posted_date', 'category', 'memo'].
    - consumer_id (int): Consumer ID to filter.
    - category (str): Optional. Filter by specific category.
    - tolerance (float): Tolerance for amount variability (e.g., 0.2 = 20% variation).
    - frequency_tolerances (dict): Tolerance ranges for frequencies, e.g. {'weekly': (6, 8), 'monthly': (27, 34)}.

    Returns:
    - dict: A dictionary of recurring inflows grouped by memo, containing frequency and average amount.
    """
    # Default frequency tolerances
    if frequency_tolerances is None:
        frequency_tolerances = {
            'weekly': (6, 8),
            'biweekly': (12, 16),
            'monthly': (27, 34)
        }

    # Filter dataset
    consumer_data = data[data['prism_consumer_id'] == consumer_id]
    if category:
        consumer_data = consumer_data[consumer_data['category'] == category]

    # Convert dates to datetime
    consumer_data['posted_date'] = pd.to_datetime(consumer_data['posted_date'])
    consumer_data = consumer_data.sort_values(by='posted_date')

    # Group by memo
    recurring_results = defaultdict(list)

    for memo, group in consumer_data.groupby('memo'):
        group = group.sort_values(by='posted_date').reset_index(drop=True)

        # Calculate time differences
        group['time_diff'] = group['posted_date'].diff().dt.days

        for freq, (lower, upper) in frequency_tolerances.items():
            # Identify rows matching the frequency range
            freq_group = group[(group['time_diff'] >= lower) & (group['time_diff'] <= upper)]

            if not freq_group.empty:
                # Check for amount consistency
                avg_amount = freq_group['amount'].mean()
                consistent_amounts = freq_group['amount'].apply(
                    lambda x: abs(x - avg_amount) <= tolerance * avg_amount
                ).all()

                if consistent_amounts:
                    recurring_results[memo].append({
                        'frequency': freq,
                        'avg_amount': round(avg_amount, 2),
                        'occurrences': len(freq_group),
                        'transactions': freq_group
                    })

    return recurring_results


consumer_id = 100  # Replace with a sample consumer ID from your data
filtered_recurring = analyze_recurring_inflows(
    inflows, consumer_id, category='PAYCHECK', tolerance=0.15
)

for memo, results in filtered_recurring.items():
    print(f"Recurring Inflows for Memo: {memo}")
    for result in results:
        print(f"  Frequency: {result['frequency']}, Avg Amount: {result['avg_amount']}, Occurrences: {result['occurrences']}")
        display(result['transactions']) 


Recurring Inflows for Memo: PAYCHECK
  Frequency: monthly, Avg Amount: 166.03, Occurrences: 1


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,time_diff
14,100,acc_284,PAYCHECK,166.03,2021-01-14,PAYCHECK,28.0


In [59]:
def find_regular_inflows(data, consumer_id, category="PAYCHECK", amount_tol=0.2, time_tol=5, lookback_days=365):
    """
    Identifies regular inflows based on both amount and timing consistency.
    
    Parameters:
    - data (pd.DataFrame): The input dataset.
    - consumer_id (int): The ID of the consumer to analyze.
    - category (str): The inflow category (e.g., PAYCHECK, DEPOSIT).
    - amount_tol (float): Tolerance for amount regularity (e.g., 0.2 for 20%).
    - time_tol (int): Tolerance for time regularity in days (e.g., 5 days).
    - lookback_days (int): How far back to look for recurring patterns.
    
    Returns:
    - pd.DataFrame: DataFrame with recurring inflows based on both amount and time regularity.
    """
    # Step 1: Filter the dataset for the consumer and category
    consumer_data = data[(data['prism_consumer_id'] == consumer_id) & (data['category'] == category)].copy()
    consumer_data['posted_date'] = pd.to_datetime(consumer_data['posted_date'])
    consumer_data = consumer_data.sort_values(by='posted_date')
    
    # Step 2: Filter by the lookback period
    cutoff_date = consumer_data['posted_date'].max() - pd.Timedelta(days=lookback_days)
    consumer_data = consumer_data[consumer_data['posted_date'] >= cutoff_date]
    
    if consumer_data.empty:
        return pd.DataFrame(), "No data found for the given consumer and category."
    
    # Step 3: Find Amount Clusters
    amount_buckets = {}
    checks = set()
    
    for _, row in consumer_data.iterrows():
        curr_amount = row['amount']
        grouped = False
        
        # Check if the current amount fits into any existing amount group
        for check in list(checks):
            avg_amount = amount_buckets[check]['amount'].mean()
            if abs(curr_amount - avg_amount) / avg_amount <= amount_tol:
                amount_buckets[check] = pd.concat([amount_buckets[check], pd.DataFrame([row])], ignore_index=True)
                grouped = True
                break
        
        if not grouped:
            amount_buckets[curr_amount] = pd.DataFrame([row])
            checks.add(curr_amount)
    
    # Step 4: Find Timing Regularity
    recurring_inflows = []
    
    for amount_group, group in amount_buckets.items():
        # Calculate the time differences between transactions
        group['time_diff'] = group['posted_date'].diff().dt.days
        group = group.dropna(subset=['time_diff'])
        
        # Identify recurring time patterns based on a defined tolerance
        for i in range(1, len(group)):
            time_diff = group['time_diff'].iloc[i]
            if abs(time_diff - time_tol) <= time_tol:
                recurring_inflows.append({
                    'memo': group['memo'].iloc[0],
                    'avg_amount': round(amount_group, 2),
                    'time_diff': time_diff,
                    'count': len(group)
                })
                break
    
    # Step 5: Return the result
    if recurring_inflows:
        return pd.DataFrame(recurring_inflows), "Recurring inflows detected."
    else:
        return pd.DataFrame(), "No recurring inflows found."

# Example usage
regular_inflows, message = find_regular_inflows(inflows, consumer_id=2967, category="PAYCHECK", amount_tol=0.2, time_tol=14, lookback_days=365)
print(message)
print(regular_inflows)


Recurring inflows detected.
                   memo  avg_amount  time_diff  count
0  PAYCHECK_PLACEHOLDER     1324.84       14.0      9


In [53]:
filtered_data = inflows[
    (inflows['prism_consumer_id'] == consumer_id) &
    (inflows['category'] == category)
]
print(f"Filtered Data Shape: {filtered_data.shape}")
print(filtered_data.head())

Filtered Data Shape: (20, 6)
        prism_consumer_id prism_account_id                  memo   amount  \
256208               2967         acc_6550  PAYCHECK_PLACEHOLDER  1090.51   
256209               2967         acc_6550  PAYCHECK_PLACEHOLDER  1153.25   
256210               2967         acc_6550  PAYCHECK_PLACEHOLDER  1517.45   
256211               2967         acc_6550  PAYCHECK_PLACEHOLDER  1400.02   
256217               2967         acc_6550  PAYCHECK_PLACEHOLDER   137.51   

       posted_date  category  
256208  2022-07-08  PAYCHECK  
256209  2022-08-19  PAYCHECK  
256210  2022-10-14  PAYCHECK  
256211  2022-06-10  PAYCHECK  
256217  2022-06-27  PAYCHECK  


In [60]:
def find_regular_inflows(data, consumer_id, amount_tol=0.2, time_tol=7, min_occurrences=3):
    """
    Identifies recurring inflows for a specific consumer by checking the amount and time regularity.

    Parameters:
    - data (pd.DataFrame): The dataset containing consumer inflows.
    - consumer_id (int): The ID of the consumer to analyze.
    - amount_tol (float): Tolerance for amount regularity (e.g., 0.2 for 20%).
    - time_tol (int): Tolerance for time regularity in days.
    - min_occurrences (int): Minimum number of occurrences to qualify as regular inflow.

    Returns:
    - pd.DataFrame: DataFrame with recurring inflows.
    - str: Message indicating the result.
    """
    # Step 1: Filter dataset for the specific consumer
    consumer_data = data[data['prism_consumer_id'] == consumer_id].copy()
    consumer_data['posted_date'] = pd.to_datetime(consumer_data['posted_date'])
    consumer_data = consumer_data.sort_values('posted_date')
    
    if consumer_data.empty:
        return pd.DataFrame(), "No data found for the given consumer."
    
    # Step 2: Group by memo (category) and find recurring inflows
    recurring_inflows = []
    
    for memo, group in consumer_data.groupby('memo'):
        # Calculate time differences between consecutive transactions
        group['time_diff'] = group['posted_date'].diff().dt.days
        
        # Step 3: Check for time and amount regularity
        valid_times = group[(group['time_diff'] <= time_tol) | group['time_diff'].isna()]
        
        if len(valid_times) >= min_occurrences:
            avg_amount = valid_times['amount'].mean()
            
            # Step 4: Check if amounts fall within the tolerance range
            within_amount_tol = valid_times['amount'].apply(
                lambda x: abs(x - avg_amount) / avg_amount <= amount_tol
            ).all()
            
            if within_amount_tol:
                recurring_inflows.append({
                    'memo': memo,
                    'avg_amount': round(avg_amount, 2),
                    'occurrences': len(valid_times)
                })
                
    # Step 5: Return the results
    if recurring_inflows:
        return pd.DataFrame(recurring_inflows), "Recurring inflows detected."
    else:
        return pd.DataFrame(), "No recurring inflows found."

# Example usage
regular_inflows, message = find_regular_inflows(inflows, consumer_id=2967, amount_tol=0.2, time_tol=14, min_occurrences=3)
print(message)
print(regular_inflows)


No recurring inflows found.
Empty DataFrame
Columns: []
Index: []


In [62]:
set(inflows['category'])

{'DEPOSIT',
 'EXTERNAL_TRANSFER',
 'INSURANCE',
 'INVESTMENT_INCOME',
 'LOAN',
 'MISCELLANEOUS',
 'OTHER_BENEFITS',
 'PAYCHECK',
 'REFUND',
 'SELF_TRANSFER',
 'SMALL_DOLLAR_ADVANCE',
 'TAX',
 'UNEMPLOYMENT_BENEFITS'}

In [127]:
def get_regular_income_for_consumer(data, consumer_id, threshold_percentage=0.05):
    """
    This function returns the regular income details for a specific consumer, showing individual transactions.
    
    Args:
        data (DataFrame): The dataset containing transaction data.
        consumer_id (str): The consumer ID to search for.
        threshold_percentage (float): The percentage threshold for grouping similar amounts (default is 5%).
    
    Returns:
        DataFrame: A DataFrame containing the regular income information for the specified consumer, sorted by category and amount.
    """
    # Define income categories of interest
    income_categories = ['PAYCHECK', 'EXTERNAL_TRANSFER', 'INVESTMENT_INCOME', 'DEPOSIT',
                         'OTHER_BENEFITS', 'UNEMPLOYMENT_BENEFITS']

    # Filter rows that correspond to regular income categories
    income_data = data[data['category'].isin(income_categories)]

    # Convert 'posted_date' to datetime to enable date-based analysis
    income_data['posted_date'] = pd.to_datetime(income_data['posted_date'])

    # Filter data for the specific consumer
    consumer_data = income_data[income_data['prism_consumer_id'] == consumer_id]

    # If consumer data is empty, return a message
    if consumer_data.empty:
        return f"No data found for consumer ID: {consumer_id}"

    # Normalize amounts by applying a threshold (±5% of the amount)
    def round_to_threshold(amount, threshold_percentage):
        """
        Rounds the amount to the nearest multiple defined by the threshold percentage.
        """
        return round(amount / (1 + threshold_percentage)) * (1 + threshold_percentage)

    consumer_data['amount_grouped'] = consumer_data['amount'].apply(
        lambda x: round_to_threshold(x, threshold_percentage)
    )

    # Group by 'category' and 'amount_grouped', counting how often each amount occurs for the consumer
    regular_incomes_by_amount = consumer_data.groupby(['prism_consumer_id', 'category', 'amount_grouped'])\
                                              .size().reset_index(name='count')

    # Filter for incomes that appear more than once (indicating regular income)
    regular_incomes = regular_incomes_by_amount[regular_incomes_by_amount['count'] > 1]

    # Merge the regular income back into the consumer data to get full transaction details
    regular_income_details = pd.merge(consumer_data, regular_incomes[['prism_consumer_id', 'category', 'amount_grouped']],
                                      on=['prism_consumer_id', 'category', 'amount_grouped'], how='inner')

    # Return the relevant columns (prism_consumer_id, amount, posted_date, category)
    regular_income_details = regular_income_details[['prism_consumer_id', 'amount', 'posted_date', 'category']]

    # Sort the data by category and amount for better organization
    regular_income_details = regular_income_details.sort_values(by=['category', 'amount'])

    return regular_income_details

consumer_id = 0
regular_income_for_consumer = get_regular_income_for_consumer(inflows, consumer_id, threshold_percentage=0.05)

regular_income_for_consumer


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income_data['posted_date'] = pd.to_datetime(income_data['posted_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  consumer_data['amount_grouped'] = consumer_data['amount'].apply(


Unnamed: 0,prism_consumer_id,amount,posted_date,category
5,0,75.0,2022-03-23,EXTERNAL_TRANSFER
14,0,75.0,2022-02-24,EXTERNAL_TRANSFER
21,0,75.0,2022-08-29,EXTERNAL_TRANSFER
23,0,75.0,2022-05-24,EXTERNAL_TRANSFER
25,0,75.0,2022-06-23,EXTERNAL_TRANSFER
38,0,75.0,2022-09-29,EXTERNAL_TRANSFER
39,0,75.0,2022-07-19,EXTERNAL_TRANSFER
48,0,75.0,2022-10-27,EXTERNAL_TRANSFER
54,0,75.0,2022-04-18,EXTERNAL_TRANSFER
13,0,80.0,2022-12-29,EXTERNAL_TRANSFER
