In [None]:
import numpy as np
import pandas as pd

from scipy.stats import mode

from tqdm import tqdm

# Widen pandas' display limits so large frames render without truncation.
# The legacy 'display.height' option is deliberately not set: it was only a
# deprecated alias for 'display.max_rows' (set right below) and newer pandas
# releases reject it with an OptionError, which would abort a fresh run.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Loading

In [None]:
# Folder that holds the input workbook; the export cells further down also
# write their result workbooks here by concatenating a file name to it, so the
# trailing path separator is required.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
BASE_URI = "D:\\Dropbox\\Aktuális munkák\\OH\\Kreditelorehaladas\\ADAT\\"
# Read the source workbook into the BASE frame that every later cell works on.
BASE = pd.read_excel(BASE_URI + '858_1024_B_2011_12_v2.xlsx')

In [None]:
# Preview the first rows of the loaded table.
BASE.head()

# Preparing

In [None]:
def getcol(feature, yearcode):
    """Return the name of a semester-specific source column.

    The name has the form ``HKI_<feature>_<yearcode>``.
    """
    return f'HKI_{feature}_{yearcode}'


def yearcode_from_date(date):
    """Convert a date into a four-character semester code.

    The code is ``<last two digits of the year>2<semester>``. Dates before
    August get semester ``'2'`` of the previous calendar year; all other
    dates get semester ``'1'`` of their own year. A null date yields the
    sentinel ``'9999'``.
    """
    if pd.isnull(date):
        return '9999'

    month, year = int(date.month), int(date.year)
    if month < 8:
        # January-July count towards the year that started the autumn before.
        year, semester = year - 1, '2'
    else:
        semester = '1'

    return f'{str(year)[-2:]}2{semester}'


def reg_yearcode_from_date(date, start, end):
    """Convert a date into a semester code using a configurable month window.

    Months within ``start``..``end`` (inclusive) get semester ``'2'`` of the
    previous calendar year; every other month gets semester ``'1'`` of the
    date's own year. The code is ``<last two digits of the year>2<semester>``.
    A null date yields the sentinel ``'9999'``.
    """
    if pd.isnull(date):
        return '9999'

    month = int(date.month)
    in_window = start <= month <= end
    year = int(date.year) - (1 if in_window else 0)
    semester = '2' if in_window else '1'

    return f'{str(year)[-2:]}2{semester}'


def get_vegzes_kat(row, yearcode):
    """Classify one row's study state in the semester given by ``yearcode``.

    The row's four dates are turned into semester codes and compared, in
    order, with the semester being examined; the first matching rule wins:

    * semester before the start code             -> ``'nemkezdte'``
    * semester after the HKPZ_KIVIZSGA_DAT code  -> ``'zarovizsga'``
    * semester after the HKPZ_ABSZ_DAT code      -> ``'abszolutorium'``
    * semester after the HKPZ_VEGDAT code        -> ``'egyeb'``

    Otherwise the semester's ``HKI_STATUSZ_<yearcode>`` value decides:
    ``'A'`` -> ``'aktiv'``, ``'P'`` -> ``'passziv'``, anything else ->
    ``'admin_hiba'``. Missing dates map to 9999, so their rule never fires.
    """
    began = int(yearcode_from_date(row.HKPZ_KEZDDAT))
    finished = int(reg_yearcode_from_date(row.HKPZ_VEGDAT, 5, 10))
    absolved = int(reg_yearcode_from_date(row.HKPZ_ABSZ_DAT, 5, 10))
    examined = int(reg_yearcode_from_date(row.HKPZ_KIVIZSGA_DAT, 5, 10))
    current = int(yearcode)

    # Ordered (condition, label) rules; the order encodes the precedence.
    date_rules = (
        (current < began, 'nemkezdte'),
        (current > examined, 'zarovizsga'),
        (current > absolved, 'abszolutorium'),
        (current > finished, 'egyeb'),
    )
    for matched, label in date_rules:
        if matched:
            return label

    by_status = {'A': 'aktiv', 'P': 'passziv'}
    return by_status.get(row[getcol('STATUSZ', yearcode)], 'admin_hiba')


def get_credit_cat(credit):
    """Bucket a credit amount into a category label.

    Returns ``'0'`` for missing or zero values, ``'<0'`` for negative ones,
    ``'1-35'`` for positive values below 36, ``'36-60'`` for values below 61
    and ``'>60'`` otherwise.
    """
    # Test for missing values first: comparing None (or pd.NA) with a number
    # would raise before the null check could classify the value as '0'.
    if pd.isnull(credit):
        return '0'
    if credit < 0:
        return '<0'
    if credit == 0:
        return '0'
    if credit < 36:
        return '1-35'
    if credit < 61:
        return '36-60'

    return '>60'


def count_decreases(row, tipus):
    """Count how often a row's cumulative credit total drops between semesters.

    Walks consecutive pairs of the module-level ``yearcodes`` and compares the
    ``HKI_OSSZ_<tipus>_KREDIT_<yearcode>`` values. Counting stops at the first
    semester whose value is missing.
    """
    feature = f'OSSZ_{tipus}_KREDIT'
    drops = 0
    for earlier, later in zip(yearcodes, yearcodes[1:]):
        current = row[getcol(feature, later)]
        previous = row[getcol(feature, earlier)]
        if pd.isnull(current):
            # No data from this semester on; later semesters are not inspected.
            break
        if current < previous:
            drops += 1
    return drops


def sum_credits(row, col):
    """Rate a row's total credits in the semester of the date stored in ``col``.

    The date is converted to a semester code; codes outside 1121..1622
    (including the 9999 sentinel for missing dates) give ``'null'``.
    Otherwise the sum of the cumulative MEGSZ and ELISM credits of that
    semester is divided by ``MKPZ_OSSZIDO * 30``:

    * ratio above 1.1          -> ``'magas'``
    * ratio within 1.0..1.1    -> ``'normal'``
    * anything else            -> ``'alacsony'``

    NOTE(review): a NaN ratio (missing credits or duration) also ends up as
    ``'alacsony'`` - confirm that this is intended.
    """
    yearcode = reg_yearcode_from_date(row[col], 5, 10)
    if not 1121 <= int(yearcode) <= 1622:
        return 'null'

    earned = row[getcol('OSSZ_MEGSZ_KREDIT', yearcode)]
    recognised = row[getcol('OSSZ_ELISM_KREDIT', yearcode)]
    ratio = (earned + recognised) / (row.MKPZ_OSSZIDO * 30)

    if ratio > 1.1:
        return 'magas'
    if ratio >= 1.:
        return 'normal'
    return 'alacsony'

In [None]:
# Semester codes '1121' ... '1622': two-digit year, a literal '2', then the
# semester number (1 or 2), in chronological order.
yearcodes = [
    f'{year}2{semester}'
    for year in range(11, 17)
    for semester in (1, 2)
]

In [None]:
# Derive one STATUSZ_<yearcode> category column per semester, row by row.
for yearcode in tqdm(yearcodes):
    BASE[f'STATUSZ_{yearcode}'] = BASE.apply(
        lambda row: get_vegzes_kat(row, yearcode),
        axis=1,
    )

In [None]:
# First semester: the cumulative total is itself the credit gained in 1121.
for tipus in ('MEGSZ', 'ELISM'):
    BASE[f'{tipus}_KREDIT_1121'] = BASE[getcol(f'OSSZ_{tipus}_KREDIT', '1121')]
    BASE[f'{tipus}_KREDIT_KAT_1121'] = BASE[f'{tipus}_KREDIT_1121'].apply(get_credit_cat)

# Later semesters: credit gained = cumulative total minus the previous
# semester's cumulative total (a missing previous total counts as 0).
for prev, act in zip(yearcodes, yearcodes[1:]):
    for tipus in ('MEGSZ', 'ELISM'):
        BASE[f'{tipus}_KREDIT_{act}'] = (
            BASE[getcol(f'OSSZ_{tipus}_KREDIT', act)]
            - BASE[getcol(f'OSSZ_{tipus}_KREDIT', prev)].fillna(0)
        )
        BASE[f'{tipus}_KREDIT_KAT_{act}'] = BASE[f'{tipus}_KREDIT_{act}'].apply(get_credit_cat)

In [None]:
# Per row, count the semesters in which the cumulative credit total dropped,
# separately for each credit type.
for tipus in ('MEGSZ', 'ELISM'):
    BASE[f'{tipus}_DECREASE'] = BASE.apply(count_decreases, axis=1, args=(tipus,))

In [None]:
# Rate each row's total credits at the semester of the given date column.
for target_col, date_col in (('KREDIT_ABSZ', 'HKPZ_ABSZ_DAT'),
                             ('KREDIT_VIZSG', 'HKPZ_KIVIZSGA_DAT')):
    BASE[target_col] = BASE.apply(sum_credits, axis=1, args=(date_col,))

---
# Saving

In [None]:
# Write two sheets per semester and credit type: descriptive statistics of the
# per-semester credits by status, and row counts by status and credit category.
with pd.ExcelWriter(BASE_URI + 'kredit_statisztikak.xlsx') as writer:
    for yearcode in tqdm(yearcodes):
        status_col = f'STATUSZ_{yearcode}'
        for tipus in ['MEGSZ', 'ELISM']:
            stats = BASE.groupby(status_col)[f'{tipus}_KREDIT_{yearcode}'].agg(
                ['count', 'min', 'mean', 'median', 'max']
            )
            # N is the number of non-null status values in the whole frame,
            # repeated on every row.
            # NOTE(review): the fill rate is therefore relative to the overall
            # N, not to the size of each status group - confirm this is intended.
            stats['N'] = BASE[status_col].count()
            stats['feltöltöttség'] = (stats['count'] / stats['N'] * 100).apply('{:.2f}%'.format)
            ordered = stats[['N', 'count', 'feltöltöttség', 'min', 'mean', 'median', 'max']]
            ordered.to_excel(writer, sheet_name=f'{tipus}_KREDITSTAT_{yearcode}')

            category_counts = (
                BASE
                .groupby([status_col, f'{tipus}_KREDIT_KAT_{yearcode}'])['HLG_SK']
                .count()
                .to_frame()
                .rename(columns={'HLG_SK': 'elofordulas'})
            )
            category_counts.to_excel(writer, sheet_name=f'{tipus}_KREDIT_KATSTAT_{yearcode}')


In [None]:
def aggregate(yearcode, grouping):
    """Build the per-semester statistics tables for both credit types.

    ``grouping`` is a list of extra column names to group by ahead of the
    semester's status column (it may be empty). The result maps sheet-style
    keys to single-element lists, so callers can append further tables:

    * ``<tipus>_STAT_<yearcode>``: count/min/mean/median/max of the credits
    * ``<tipus>_KATSTAT_<yearcode>``: row counts per credit category
    """
    status_col = f'STATUSZ_{yearcode}'
    tables = {}
    for tipus in ('MEGSZ', 'ELISM'):
        stats = (
            BASE
            .groupby(grouping + [status_col])[f'{tipus}_KREDIT_{yearcode}']
            .agg(['count', 'min', 'mean', 'median', 'max'])
        )
        category_counts = (
            BASE
            .groupby(grouping + [status_col, f'{tipus}_KREDIT_KAT_{yearcode}'])['HLG_SK']
            .count()
            .to_frame()
            .rename(columns={'HLG_SK': 'occurences'})
        )
        tables[f'{tipus}_STAT_{yearcode}'] = [stats]
        tables[f'{tipus}_KATSTAT_{yearcode}'] = [category_counts]

    return tables

In [None]:
# For every semester, stack the overall tables and one table per breakdown
# column on the same sheet, separated by a few blank rows.
with pd.ExcelWriter(BASE_URI + 'kredit_statisztikak_meg_bontva.xlsx') as writer:
    for yearcode in tqdm(yearcodes):
        dfs = aggregate(yearcode, [])
        for template in tqdm(['SZER_KOD', 'HKPZ_MUNKAREND_SZO_HNEV', 'HKPZ_KEL_TIPUS_NEV', 'HKI_FINANSZIROZAS_SZO_HNEV_{}']):
            # Only the last name has a placeholder; format() leaves the others as they are.
            breakdown = aggregate(yearcode, [template.format(yearcode)])
            for key, tables in breakdown.items():
                dfs[key].extend(tables)

        for sheet, tables in dfs.items():
            row = 0
            for table in tables:
                table.to_excel(writer, sheet_name=sheet, startrow=row, startcol=0)
                row += len(table.index) + 4


---

In [None]:
def sum_decrease(group):
    """Summarise the credit-decrease counters of a group of rows.

    Returns a frame indexed by ``type`` ('megszerzett', 'elismert') with the
    summed decrease counts (``occurences``), the number of non-null values
    (``N``) and their quotient formatted as a percentage string (``ratio``).
    """
    counters = ['MEGSZ_DECREASE', 'ELISM_DECREASE']
    df = pd.DataFrame(
        {
            'occurences': [group[name].sum() for name in counters],
            'N': [group[name].count() for name in counters],
        },
        index=pd.Index(['megszerzett', 'elismert'], name='type'),
    )
    df['ratio'] = (df['occurences'] / df['N']).map(lambda share: f'{share * 100:.2f}%')
    return df[['occurences', 'N', 'ratio']]

In [None]:
# Stack the overall decrease summary and one summary per breakdown column on
# a single sheet, leaving a few blank rows between the tables.
row = 0
with pd.ExcelWriter(BASE_URI + 'kredit_csokkenes.xlsx') as writer:
    # Grouping by a constant key puts every row into one group (the overall table).
    summary = BASE.groupby(lambda _: True).apply(sum_decrease)
    summary.to_excel(writer, sheet_name='csokkenes', startrow=row, startcol=0)
    row += len(summary.index) + 4

    for grouping in tqdm(['SZER_KOD', 'HKPZ_MUNKAREND_SZO_HNEV', 'HKPZ_KEL_TIPUS_NEV']):
        summary = BASE.groupby(grouping).apply(sum_decrease)
        summary.to_excel(writer, sheet_name='csokkenes', startrow=row, startcol=0)
        row += len(summary.index) + 4

---

In [None]:
def count_credit_ratio(group, tipus):
    """Count the rows of ``group`` per ``KREDIT_<tipus>`` rating.

    Returns a frame indexed by the rating with the number of non-null
    ``HLG_SK`` values (``occurences``), the total over all ratings (``N``)
    and the share of each rating formatted as a percentage string (``ratio``).
    """
    per_rating = group.groupby(f'KREDIT_{tipus}')['HLG_SK'].count()
    df = pd.DataFrame({'occurences': per_rating})
    df['N'] = df['occurences'].sum()
    df['ratio'] = (df['occurences'] / df['N']).map(lambda share: f'{share * 100:.2f}%')
    return df[['occurences', 'N', 'ratio']]

In [None]:
# One sheet per rating column (KREDIT_ABSZ, KREDIT_VIZSG): the overall rating
# distribution first, then one distribution per breakdown column, stacked with
# a few blank rows in between.
with pd.ExcelWriter(BASE_URI + 'kreditszam.xlsx') as writer:
    for tipus in ['ABSZ', 'VIZSG']:
        row = 0
        # Grouping by a constant key puts every row into one group (the overall table).
        df = BASE.groupby(lambda _ : True).apply(count_credit_ratio, tipus=tipus)
        df.to_excel(writer, sheet_name=tipus, startrow=row , startcol=0)
        row = row + len(df.index) + 4
        
        for grouping in tqdm(['SZER_KOD', 'HKPZ_MUNKAREND_SZO_HNEV', 'HKPZ_KEL_TIPUS_NEV']):
            # Use the sheet's own rating type here; a hard-coded 'ABSZ' made the
            # grouped tables on the 'VIZSG' sheet repeat the ABSZ statistics.
            df = BASE.groupby(grouping).apply(count_credit_ratio, tipus=tipus)
            df.to_excel(writer, sheet_name=tipus, startrow=row , startcol=0)
            row = row + len(df.index) + 4


In [347]:
# One sheet per credit type and semester: describe() of the cumulative credit
# column, grouped by the semester's HKI_AKTIV_EVFOLYAM column.
with pd.ExcelWriter(BASE_URI + 'kredit_evfolyam.xlsx') as writer:
    for tipus in ['MEGSZ', 'ELISM']:
        for yc in yearcodes:
            group_col = getcol('AKTIV_EVFOLYAM', yc)
            credit_col = getcol(f'OSSZ_{tipus}_KREDIT', yc)
            summary = BASE.groupby(group_col)[credit_col].describe()
            summary.to_excel(writer, sheet_name=f'{tipus}_{yc}')