In [None]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

# Read and preprocess data

In [None]:
# Published Google Sheet, exported as CSV through the gviz endpoint.
data_link = (
    'https://docs.google.com/spreadsheets/d/'
    '1VoB4dIH2Y2x2O-eH0ivNmBUYCcT-1NR6T5h8eWkE33Y'
    '/gviz/tq?tqx=out:csv&gid=1639699984'
)
df = pd.read_csv(data_link, index_col=0)

# Convert both period boundaries to datetimes so they can be compared and
# used with the `.dt` accessor further down.
for date_col in ('Start Date', 'End Date'):
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# Total amounts per organization and per operation for every start date.
# `numeric_only=True` is passed explicitly: since pandas 2.0 a bare `.sum()`
# no longer drops non-numeric columns silently, so the datetime `End Date`
# column would raise and string columns would be concatenated.
df_grouped_by_org = df.groupby(
    ['Organization Name', 'Organization ID', 'Category', 'Start Date'],
    as_index=False,
).sum(numeric_only=True)
df_grouped_by_op = df.groupby(
    ['Operations Code', 'Operations Description', 'Start Date'],
    as_index=False,
).sum(numeric_only=True)

In [None]:
# Map organization id -> display name. Both parts are parsed out of the
# 'Organization Name' text, which is split on the ' ( ' separator: the text
# before the first separator is the name, and the first whitespace-delimited
# token after the last separator is the id.
unique_org_name = df['Organization Name'].unique().tolist()
name_parts = [full_name.split(' ( ') for full_name in unique_org_name]
id2name = {parts[-1].split()[0]: parts[0] for parts in name_parts}

# Map operation code -> operation description (the last pair wins if a code
# appears with more than one description).
unique_ops = df[['Operations Code', 'Operations Description']].drop_duplicates()
code2operation = dict(
    zip(unique_ops['Operations Code'], unique_ops['Operations Description'])
)


# Anomaly/Outlier detection

## Calc summary stats

Here, we are calculating the mean, 1Q, 3Q, IQR, and standard deviation for each organization and for each operation.
With these, we can classify outliers either by assuming the data is normally distributed or by using the IQR rule.

Using the first method, a point is an outlier if it is more than two standard deviations away from the mean.
Using the second method, a point is an outlier if it lies more than 1.5 IQR above the 3rd quartile or more than 1.5 IQR below the 1st quartile.

In [None]:
def calc_stats(series):
    """Summarize a sequence of amounts for outlier detection.

    Parameters
    ----------
    series : array-like of numbers
        The values to summarize.

    Returns
    -------
    dict
        ``mean`` and ``std`` (as computed by ``np.mean`` / ``np.std``),
        ``norm_low`` / ``norm_high`` (mean -/+ two standard deviations),
        ``1Q`` / ``3Q`` (25th and 75th percentiles), ``iqr`` (3Q - 1Q), and
        ``iqr_low`` / ``iqr_high`` (1Q - 1.5*IQR and 3Q + 1.5*IQR).
    """
    center = np.mean(series)
    spread = np.std(series)
    first_q, third_q = np.quantile(series, [0.25, 0.75])
    q_range = third_q - first_q

    return {
        'mean': center,
        'std': spread,
        # Two-sigma band used by the "normal distribution" rule.
        'norm_low': center - 2*spread,
        'norm_high': center + 2*spread,
        '1Q': first_q,
        '3Q': third_q,
        'iqr': q_range,
        # Tukey fences used by the IQR rule.
        'iqr_low': first_q - 1.5*q_range,
        'iqr_high': third_q + 1.5*q_range,
    }

In [None]:
def _nonzero_and_zerofilled_stats(payments):
    """Compute summary stats for one group's payments, with and without gaps.

    Parameters
    ----------
    payments : pandas.DataFrame
        Rows of a single organization or operation; must contain the
        'Start Date' and 'Operations Amount (BGN)' columns.

    Returns
    -------
    dict
        ``{}`` when there are fewer than two rows (no meaningful spread).
        Otherwise ``{'nonzero': ..., 'zerofilled': ...}`` where both values
        come from ``calc_stats``: 'nonzero' over the rows as given, and
        'zerofilled' over a daily calendar spanning the first to the last
        'Start Date', with days that have no row counted as 0.
    """
    if payments.shape[0] <= 1:
        return {}

    amount_col = 'Operations Amount (BGN)'
    stats = {'nonzero': calc_stats(payments[amount_col])}

    every_day = pd.DataFrame({
        'Start Date': pd.date_range(payments['Start Date'].min(), payments['Start Date'].max())
    })
    # Assign the filled result instead of calling fillna(..., inplace=True) on
    # a column selection: that chained form operates on a temporary under
    # pandas Copy-on-Write and would leave the NaNs in place.
    zerofilled_amounts = every_day.merge(payments, on='Start Date', how='left')[amount_col].fillna(0)
    stats['zerofilled'] = calc_stats(zerofilled_amounts)
    return stats


# Per-organization stats, keyed by organization id.
id2stats = {
    org_id: _nonzero_and_zerofilled_stats(
        df_grouped_by_org[df_grouped_by_org['Organization ID'] == org_id]
    )
    for org_id in id2name
}

# Per-operation stats, keyed by operations code.
code2stats = {
    cd: _nonzero_and_zerofilled_stats(
        df_grouped_by_op[df_grouped_by_op['Operations Code'] == cd]
    )
    for cd in code2operation
}


In [None]:
def classify_payments_norm(row, stats_dict):
    """Flag a payment that falls outside the two-sigma band.

    Parameters
    ----------
    row : mapping
        Must provide 'Operations Amount (BGN)'.
    stats_dict : mapping
        Must provide the 'norm_low' and 'norm_high' thresholds.

    Returns
    -------
    int
        1 if the amount is strictly below 'norm_low' or strictly above
        'norm_high', otherwise 0.
    """
    amount = row['Operations Amount (BGN)']
    lower, upper = stats_dict['norm_low'], stats_dict['norm_high']

    outside_band = (amount < lower) or (amount > upper)
    # Multiplying by 1 turns the boolean flag into a 0/1 integer.
    return 1*outside_band
    
    
def classify_payments_iqr(row, stats_dict):
    """Flag a payment that falls outside the IQR fences.

    Parameters
    ----------
    row : mapping
        Must provide 'Operations Amount (BGN)'.
    stats_dict : mapping
        Must provide the 'iqr_low' and 'iqr_high' thresholds.

    Returns
    -------
    int
        1 if the amount is strictly below 'iqr_low' or strictly above
        'iqr_high', otherwise 0.
    """
    amount = row['Operations Amount (BGN)']
    lower, upper = stats_dict['iqr_low'], stats_dict['iqr_high']

    outside_fences = (amount < lower) or (amount > upper)
    # Multiplying by 1 turns the boolean flag into a 0/1 integer.
    return 1*outside_fences


def if_org_id(row, key, fn):
    """Classify a row against the stats of its organization.

    Looks up ``id2stats`` by the row's 'Organization ID'. Returns ``np.nan``
    when that organization has no stats (an empty dict); otherwise returns
    ``fn(row, stats[key])``, where ``key`` selects the stats variant
    (the callers in this notebook pass 'nonzero' or 'zerofilled').
    """
    org_stats = id2stats[row['Organization ID']]
    return fn(row, org_stats[key]) if org_stats != {} else np.nan
    

def if_code_id(row, key, fn):
    """Classify a row against the stats of its operation code.

    Looks up ``code2stats`` by the row's 'Operations Code'. Returns ``np.nan``
    when that code has no stats (an empty dict); otherwise returns
    ``fn(row, stats[key])``, where ``key`` selects the stats variant
    (the callers in this notebook pass 'nonzero' or 'zerofilled').
    """
    op_stats = code2stats[row['Operations Code']]
    return fn(row, op_stats[key]) if op_stats != {} else np.nan

## Classify data points by Organization ID

In [None]:
# (output column, stats variant, classifier) for every rule/variant pair.
org_anomaly_specs = [
    ('anomaly_norm_nonzero', 'nonzero', classify_payments_norm),
    ('anomaly_norm_zerofilled', 'zerofilled', classify_payments_norm),
    ('anomaly_iqr_nonzero', 'nonzero', classify_payments_iqr),
    ('anomaly_iqr_zerofilled', 'zerofilled', classify_payments_iqr),
]
for flag_col, stats_key, classifier in org_anomaly_specs:
    # Row-wise: if_org_id(row, stats_key, classifier)
    df_grouped_by_org[flag_col] = df_grouped_by_org.apply(
        if_org_id, axis=1, args=(stats_key, classifier)
    )

# Percentage of all rows flagged by each rule (rows without stats hold NaN
# and are skipped by the sum but still counted in the denominator).
org_anomaly_cols = [c for c in df_grouped_by_org.columns if "anomaly" in c]
np.around(100*df_grouped_by_org[org_anomaly_cols].sum()/df_grouped_by_org.shape[0], 2)

### Plot

In [None]:
# Which anomaly flag column to highlight in the plots.
anomaly_choice_norm = 'anomaly_norm_nonzero'

# One figure per organization: daily payments within 2021, the two-sigma
# band (when stats exist), and a vertical marker on every flagged date.
for org_id, org_name in id2name.items():
    org_payments = df_grouped_by_org[
        (df_grouped_by_org['Organization ID']==org_id) &
        (df_grouped_by_org['Start Date'] >= pd.Timestamp('2021-01-01')) &
        (df_grouped_by_org['Start Date'] < pd.Timestamp('2022-01-01'))
    ].copy()
    if org_payments.empty:
        continue

    # Taken before the calendar merge so only real rows can be flagged.
    anomalies_norm = org_payments[org_payments[anomaly_choice_norm]>0].copy()

    # Expand to a daily calendar; days without payments count as 0.
    date_fill_df = pd.DataFrame({
        'Start Date': pd.date_range(org_payments['Start Date'].min(), org_payments['Start Date'].max())
    })
    org_payments = date_fill_df.merge(org_payments, on='Start Date', how='left')
    # Assign instead of fillna(..., inplace=True) on a column selection, which
    # does not modify the frame under pandas Copy-on-Write.
    org_payments['Operations Amount (BGN)'] = org_payments['Operations Amount (BGN)'].fillna(0)
    min_pay = org_payments['Operations Amount (BGN)'].min()
    max_pay = org_payments['Operations Amount (BGN)'].max()

    # Every artist carries its own label so the legend cannot be shifted by
    # the order in which matplotlib enumerates lines and collections.
    plt.figure(figsize=(15, 9))
    plt.plot(
        org_payments['Start Date'], org_payments['Operations Amount (BGN)'],
        marker="o", linestyle="dashed", label="Плащания"
    )
    plt.axhline(y=0, c='grey', alpha=0.5, label="0")

    if id2stats[org_id]!={}:
        plt.axhline(y=id2stats[org_id]['nonzero']['norm_high'], c='tab:blue', linestyle="--", alpha=0.5,
                    label='Горна граница')
        plt.axhline(y=id2stats[org_id]['nonzero']['norm_low'], c='tab:blue', linestyle="--", alpha=0.5,
                    label='Долна граница')

    # Only draw (and label) the markers when there is something to mark.
    if anomalies_norm.shape[0] > 0:
        plt.vlines(anomalies_norm['Start Date'], min_pay, max_pay, colors='purple', alpha=0.5,
                   label="Аномалии")

    plt.legend()
    plt.title(f"{org_name}")
    plt.show()

## Classify data points by Operations Code

In [None]:
# (output column, stats variant, classifier) for every rule/variant pair.
op_anomaly_specs = [
    ('anomaly_norm_nonzero', 'nonzero', classify_payments_norm),
    ('anomaly_norm_zerofilled', 'zerofilled', classify_payments_norm),
    ('anomaly_iqr_nonzero', 'nonzero', classify_payments_iqr),
    ('anomaly_iqr_zerofilled', 'zerofilled', classify_payments_iqr),
]
for flag_col, stats_key, classifier in op_anomaly_specs:
    # Row-wise: if_code_id(row, stats_key, classifier)
    df_grouped_by_op[flag_col] = df_grouped_by_op.apply(
        if_code_id, axis=1, args=(stats_key, classifier)
    )

# Percentage of all rows flagged by each rule (rows without stats hold NaN
# and are skipped by the sum but still counted in the denominator).
op_anomaly_cols = [c for c in df_grouped_by_op.columns if "anomaly" in c]
np.around(100*df_grouped_by_op[op_anomaly_cols].sum()/df_grouped_by_op.shape[0], 2)

### Plot

In [None]:
# Which anomaly flag column to highlight in the plots.
anomaly_choice_norm = 'anomaly_norm_nonzero'

# One figure per operation code: daily payments from 2021-11-01 up to (not
# including) 2023-01-01, the two-sigma band (when stats exist), and a
# vertical marker on every flagged date.
for code, op in code2operation.items():
    op_payments = df_grouped_by_op[
        (df_grouped_by_op['Operations Code']==code) &
        (df_grouped_by_op['Start Date'] >= pd.Timestamp('2021-11-01')) &
        (df_grouped_by_op['Start Date'] < pd.Timestamp('2023-01-01'))
    ].copy()
    if op_payments.empty:
        continue

    # Taken before the calendar merge so only real rows can be flagged.
    anomalies_norm = op_payments[op_payments[anomaly_choice_norm]>0].copy()

    # Expand to a daily calendar; days without payments count as 0.
    date_fill_df = pd.DataFrame({
        'Start Date': pd.date_range(op_payments['Start Date'].min(), op_payments['Start Date'].max())
    })
    op_payments = date_fill_df.merge(op_payments, on='Start Date', how='left')
    # Assign instead of fillna(..., inplace=True) on a column selection, which
    # does not modify the frame under pandas Copy-on-Write.
    op_payments['Operations Amount (BGN)'] = op_payments['Operations Amount (BGN)'].fillna(0)
    min_pay = op_payments['Operations Amount (BGN)'].min()
    max_pay = op_payments['Operations Amount (BGN)'].max()

    # Every artist carries its own label so the legend cannot be shifted by
    # the order in which matplotlib enumerates lines and collections.
    plt.figure(figsize=(15, 9))
    plt.plot(
        op_payments['Start Date'], op_payments['Operations Amount (BGN)'],
        marker="o", linestyle="dashed", label="Плащания"
    )
    plt.axhline(y=0, c='grey', alpha=0.5, label="0")

    if code2stats[code]!={}:
        plt.axhline(y=code2stats[code]['nonzero']['norm_high'], c='tab:blue', linestyle="--", alpha=0.5,
                    label='Горна граница')
        plt.axhline(y=code2stats[code]['nonzero']['norm_low'], c='tab:blue', linestyle="--", alpha=0.5,
                    label='Долна граница')

    # Only draw (and label) the markers when there is something to mark.
    if anomalies_norm.shape[0] > 0:
        plt.vlines(anomalies_norm['Start Date'], min_pay, max_pay, colors='purple', alpha=0.5,
                   label='Аномалии')

    plt.legend()
    plt.title(f"{op} {code}")
    plt.show()

----------------------------------------------------------------------------------------------------------------------------
# Investigate EoY big spendings

In [None]:
# Work on a copy so the calendar columns do not leak into df_grouped_by_org.
df_by_org = df_grouped_by_org.copy()
df_by_org['month'] = df_by_org['Start Date'].dt.month
df_by_org['year'] = df_by_org['Start Date'].dt.year
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` returns the same ISO week number (cast back to a plain
# integer dtype, since isocalendar yields a nullable UInt32 column).
# NOTE(review): 'year' is the calendar year while 'week' is the ISO week, so
# dates around New Year can pair e.g. week 52/53 with January - confirm this
# is acceptable for the weekly aggregates.
df_by_org['week'] = df_by_org['Start Date'].dt.isocalendar().week.astype('int64')
# df_by_org.head()

In [None]:
# Total spending per organization for every (year, week) and (year, month).
org_period_keys = ['Organization Name', 'Organization ID', 'Category', 'year']
spending_rename = {'Operations Amount (BGN)': 'spending_BGN'}

weekly_totals = df_by_org.groupby(org_period_keys + ['week'])['Operations Amount (BGN)'].sum()
df_by_org_weekly = weekly_totals.reset_index().rename(columns=spending_rename)

monthly_totals = df_by_org.groupby(org_period_keys + ['month'])['Operations Amount (BGN)'].sum()
df_by_org_monthly = monthly_totals.reset_index().rename(columns=spending_rename)
# df_by_org_monthly

In [None]:
def _avg_spending(spending, by, name):
    """Mean of 'spending_BGN' per group, returned as a flat frame.

    Parameters
    ----------
    spending : pandas.DataFrame
        Frame with a 'spending_BGN' column and every column listed in ``by``.
    by : list of str
        Grouping columns.
    name : str
        Name given to the resulting average column.

    Returns
    -------
    pandas.DataFrame
        One row per group with the ``by`` columns plus ``name``.
    """
    return (
        spending
        .groupby(by)['spending_BGN']
        .mean()
        .reset_index()
        .rename(columns={'spending_BGN': name})
    )


org_keys = ['Organization Name', 'Organization ID', 'Category']

# Weekly avgs
weekly_avg_yearly = _avg_spending(df_by_org_weekly, org_keys + ['year'], 'weekly_avg_yearly')
# The original rename targeted 'Operations Amount (BGN)', a column that no
# longer exists at this point, so the result kept the name 'spending_BGN'.
weekly_avg_overall = _avg_spending(df_by_org_weekly, org_keys, 'weekly_avg_overall')
weekly_avg_over_years = _avg_spending(df_by_org_weekly, org_keys + ['week'], 'weekly_avg_over_years')

# Monthly avgs
monthly_avg_yearly = _avg_spending(df_by_org_monthly, org_keys + ['year'], 'monthly_avg_yearly')
monthly_avg_overall = _avg_spending(df_by_org_monthly, org_keys, 'monthly_avg_overall')
# Mean (not sum) across years for each calendar month, matching the column
# name and the weekly counterpart above.
monthly_avg_over_years = _avg_spending(df_by_org_monthly, org_keys + ['month'], 'monthly_avg_over_years')


In [None]:
# Attach each monthly baseline to the monthly totals and record how far every
# month's spending is from it (rounded to 2 decimals).
org_merge_keys = ['Organization Name', 'Organization ID', 'Category']
monthly_baselines = [
    # (baseline frame, merge keys, baseline column name)
    (monthly_avg_yearly, org_merge_keys + ['year'], 'monthly_avg_yearly'),
    (monthly_avg_over_years, org_merge_keys + ['month'], 'monthly_avg_over_years'),
    (monthly_avg_overall, org_merge_keys, 'monthly_avg_overall'),
]

# Drop anything a previous run of this cell added, so re-running it does not
# produce '_x'/'_y' suffixed duplicates and then fail on the column lookup.
derived_cols = [
    col
    for _, _, avg_col in monthly_baselines
    for col in (avg_col, f'deviation_from_{avg_col}')
]
monthly_with_deviations = df_by_org_monthly.drop(columns=derived_cols, errors='ignore')

for baseline, merge_keys, avg_col in monthly_baselines:
    monthly_with_deviations = monthly_with_deviations.merge(baseline, on=merge_keys)
    monthly_with_deviations[f'deviation_from_{avg_col}'] = np.around(
        monthly_with_deviations['spending_BGN'] - monthly_with_deviations[avg_col], 2
    )

df_by_org_monthly = monthly_with_deviations



In [None]:
# df_by_org_monthly[
#     (df_by_org_monthly.year==2021) & (df_by_org_monthly.month==12)
# ].sort_values(
#     'deviation_from_monthly_avg_yearly',
#     ascending=False
# ).reset_index(drop=True)

In [None]:
# df_by_org_monthly[
#     (df_by_org_monthly.year==2021) & (df_by_org_monthly.month==12)
# ].sort_values(
#     'deviation_from_monthly_avg_yearly',
#     ascending=False
# ).reset_index(drop=True).to_csv("monthly_deviations_from_mean_by_organization.csv", index=False)