In [1]:
import altair as alt
import numpy as np
import pandas as pd
import glob
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Notebook-wide display configuration.
# Select Altair's 'notebook' renderer for the charts built further down.
alt.renderers.enable('notebook')
# Apply matplotlib's 'ggplot' style sheet to pyplot figures.
plt.style.use('ggplot')

In [3]:
# Load every case-status CSV and stack them into a single frame indexed by CNR number.
# glob returns paths in arbitrary order, so sort them to keep the row order of `df`
# reproducible between runs and machines.
status_files = sorted(glob.glob('../pune_samples/case_status/*.csv'))
if not status_files:
    # Fail with the offending location rather than concat's "No objects to concatenate".
    raise FileNotFoundError('No case status CSV files found in ../pune_samples/case_status/')
dataframes = [pd.read_csv(f, index_col='CNR Number', parse_dates=True) for f in status_files]
df = pd.concat(dataframes, sort=False)

In [None]:
# Load every case-history CSV and stack them into a single frame indexed by CNR.
# glob returns paths in arbitrary order, so sort them to keep the row order of
# `listing_df` reproducible between runs and machines.
history_files = sorted(glob.glob('../pune_samples/case_history/*.csv'))
if not history_files:
    # Fail with the offending location rather than concat's "No objects to concatenate".
    raise FileNotFoundError('No case history CSV files found in ../pune_samples/case_history/')
dataframes = [pd.read_csv(f, parse_dates=True, index_col='CNR') for f in history_files]
listing_df = pd.concat(dataframes, sort=False)

In [None]:
# Column formatting for the case-status frame.
# Label-like columns are stored as pandas categoricals.
to_category = ['Case Type', 'Stage of Case', 'Under Act(s)', 'Judge',
               'Court Number', 'Under Section(s)', 'Nature of Disposal']
for column in to_category:
    df[column] = df[column].astype('category')

# Date columns are converted to datetimes; errors='coerce' turns unparseable values into NaT.
to_date = ['Registration Date', 'First Hearing Date', 'Last Hearing Date',
           'Transfer Date', 'Next Hearing Date', 'Decision Date', 'sub_court_DecisionDate']
for column in to_date:
    df[column] = pd.to_datetime(df[column], errors='coerce')

In [None]:
# Column formatting for the case-history (listing) frame.
listing_to_cat = ['Judge', 'Purpose of hearing']
listing_to_date = ['Business On Date', 'Hearing Date']

# Label-like columns are stored as pandas categoricals.
for column in listing_to_cat:
    listing_df[column] = listing_df[column].astype('category')

# errors='coerce' turns unparseable date strings into NaT.
for column in listing_to_date:
    listing_df[column] = pd.to_datetime(listing_df[column], errors='coerce')

In [None]:
# Count the history rows (proceedings) recorded for each case that appears in `df`.
cnrs = list(df.index.values)
# Select by membership rather than `listing_df.loc[cnrs]`: a label lookup raises KeyError
# for cases that have no history rows, and it repeats rows when a CNR occurs more than
# once in `df`, which would inflate the counts.
in_status = listing_df.index.isin(cnrs)
count_series = (
    listing_df[in_status]
    .groupby(level='CNR')
    .size()
    .to_frame('no_proceedings')
)

In [None]:
# Add the per-case proceedings count to df as the 'no_proceedings' column.
# A left join on the index keeps df's rows and index name and works when a CNR is
# repeated in df's index; dropping any existing column first makes the cell safe to
# re-run without ending up with two 'no_proceedings' columns.
df = df.drop(columns='no_proceedings', errors='ignore').join(count_series)

In [None]:
# Whole days elapsed between the first hearing and the decision, stored as a new column.
elapsed = df['Decision Date'].sub(df['First Hearing Date'])
decision_days = elapsed.dt.days
df['decision_days'] = decision_days

In [None]:
# Proceedings frequency table: one row per distinct proceedings count,
# with the number of cases that have that count.
proceeding_freq = (
    df['no_proceedings']
    .value_counts()
    .rename_axis('no_proceedings')
    .reset_index(name='no of cases')
)

In [None]:
# Scatter of proceedings count (x) against the number of cases with that count (y).
proceedings_axis = alt.X(
    'no_proceedings:Q',
    axis=alt.Axis(title='No of proceedings done')
)
cases_axis = alt.Y(
    'no of cases:Q',
    axis=alt.Axis(title='No of cases taking proceedings')
)
alt.Chart(proceeding_freq, width=600, height=300).mark_point().encode(
    x=proceedings_axis,
    y=cases_axis
).properties(title='Proceedings wise case statistics')

In [None]:
# Most common proceeding counts grouped by case type.
def mode(x):
    """Return the modal value of ``x`` as a single scalar.

    When several values tie for most frequent, the largest one is returned.
    NaN is returned when ``x`` has no non-null values, so that every group
    aggregates to a scalar instead of a (possibly empty) Series.
    """
    modes = x.mode()  # computed once; Series.mode() ignores NaN by default
    return modes.max() if len(modes) else np.nan


mode_proceedings_per_case_type = df.groupby('Case Type', sort=False)['no_proceedings'].agg(mode)
alt.Chart(
    mode_proceedings_per_case_type.reset_index(),
    width=500, height=300
).mark_bar().encode(
    # Bars are ordered from the highest to the lowest modal count.
    y=alt.Y(
        'Case Type:O',
        sort=alt.EncodingSortField(field='no_proceedings', op='sum', order='descending'),
        axis=alt.Axis(title='Case Type')
    ),
    x=alt.X(
        'no_proceedings:Q',
        axis=alt.Axis(title='No of proceedings')
    ),
    # Colours come from a 12-step seaborn 'Blues' palette, assigned in ascending count order.
    color=alt.Color(
        'Case Type:O',
        scale=alt.Scale(range=sns.color_palette('Blues', 12).as_hex()),
        sort=alt.EncodingSortField(field='no_proceedings', op='sum', order='ascending')
    )
).properties(
    title='Case Type wise most occuring proceeding counts.'
)

In [None]:
# Largest number of proceedings recorded for a single case, grouped by case type.
max_proceedings_per_case_type = df.groupby('Case Type', sort=False)['no_proceedings'].max()

# Both the bar order and the colour assignment follow the count, highest first.
by_count_desc = alt.EncodingSortField(field='no_proceedings', op='sum', order='descending')
case_type_axis = alt.Y(
    'Case Type:O',
    sort=by_count_desc,
    axis=alt.Axis(title='Case Type')
)
count_axis = alt.X(
    'no_proceedings:Q',
    axis=alt.Axis(title='No of proceedings')
)
# Colours come from a 12-step seaborn 'GnBu_d' palette.
case_type_colour = alt.Color(
    'Case Type:O',
    scale=alt.Scale(range=sns.color_palette('GnBu_d', 12).as_hex()),
    sort=by_count_desc
)
alt.Chart(
    max_proceedings_per_case_type.reset_index(),
    width=500, height=300
).mark_bar().encode(
    y=case_type_axis,
    x=count_axis,
    color=case_type_colour
).properties(title='Case Type wise maximum proceedings per case.')

In [None]:
# Scatter plot with a fitted regression line: days taken to decide vs. number of proceedings.
sns.set(rc={'figure.figsize': (13, 8)})
label_font = {'fontsize': 10, 'fontweight': 'bold'}
ax = sns.regplot(data=df, x='decision_days', y='no_proceedings')
ax.set_title('Corelation between no of proceedings and time taken in decision',
             fontsize=12, fontweight='bold')
ax.set_xlabel('Days taken in decision', **label_font)
ax.set_ylabel('No of Proceedings', **label_font)
plt.show()