In [None]:
from pathlib import Path
from typing import Dict, List, Union, Iterable, Tuple  
import json
import pandas as pd
def load_parquet_dict(data_folder: str) -> dict[str, pd.DataFrame]:
    """Load every ``*.parquet`` file in ``data_folder`` into a dict of DataFrames.

    The folder must contain ``_dtype_book.json``, a JSON object mapping each
    parquet file stem to the dtype spec passed to ``DataFrame.astype``.
    Files whose name starts with ``_`` are skipped.

    Args:
        data_folder: Directory holding the parquet files and ``_dtype_book.json``.

    Returns:
        Mapping of lower-cased file stem to the loaded, dtype-cast DataFrame.

    Raises:
        FileNotFoundError: If ``_dtype_book.json`` does not exist.
        ValueError: If a parquet file has no (or an empty) entry in the dtype book.
    """
    folder = Path(data_folder)
    book_path = folder / "_dtype_book.json"
    with open(book_path, "r", encoding="utf-8") as f:
        book = json.load(f)

    out = {}
    for fp in sorted(folder.glob("*.parquet")):
        if fp.name.startswith("_"):
            continue
        name = fp.stem
        spec = book.get(name)
        if not spec:
            # Check the spec before reading so a missing entry fails fast,
            # without first loading a potentially large file.
            raise ValueError(
                f"No dtype spec for {name!r} in {book_path} (got {spec!r}); "
                f"add an entry for {fp.name} or remove the file."
            )
        out[name.lower()] = pd.read_parquet(fp).astype(spec)
    return out

dfs = load_parquet_dict("data/cleaned_result")
# Show a compact overview (table name -> (rows, columns)) instead of the
# repr of every full DataFrame in the dict.
{name: frame.shape for name, frame in dfs.items()}

In [None]:
print(dfs['tasks'].columns.tolist())

In [None]:
dfs['tasks']['APPOINTMENTSTART'].value_counts()

In [None]:
# On-site working time per task: completion timestamp minus on-site timestamp.
# NOTE(review): later cells divide DURATION by 3600 and label the result as hours,
# so this presumably yields seconds — confirm the timestamp dtypes set via _dtype_book.json.
dfs['tasks']['DURATION'] = dfs['tasks']['COMPLETIONTIMESTAMP'] - dfs['tasks']['ONSITETIMESTAMP']

In [None]:
dfs['tasks']['TASKTYPE']

In [None]:
dfs['tasks']['DEPARTMENT']

In [None]:
import dask.dataframe as dd

In [None]:
ddf = dd.from_pandas(dfs['tasks'], npartitions=4)

In [None]:
ddf_gas = ddf[ddf['DEPARTMENT'] == 'SDGE-GAS'].persist()

In [None]:
ddf_gas['DEPARTMENT'].head()

In [None]:
ddf_gas.to_csv("./data/cleaned_result/GAS_DATA.csv", single_file=True, index=False)

In [None]:
import pandas as pd 
df_gas = pd.read_csv('./data/cleaned_result/GAS_DATA.csv')

In [None]:
df_gas.head()

In [None]:
print(df_gas.columns.tolist())

In [None]:
df_gas.dtypes

In [None]:
df_gas[['TIMECREATED', 'DURATION', 'DURATION_TASKTYPE', 'DUEDATE', 'SCHEDULEDSTART', 'SCHEDULEDFINISH', 'ONSITETIMESTAMP', 'COMPLETIONTIMESTAMP']]

In [None]:
# Rescale DURATION by 1/3600 (the plots below label it in hours; presumably the input is seconds — TODO confirm).
# NOTE: the column is overwritten with its own rescaled value, so re-running this cell divides by 3600 again.
df_gas['DURATION'] = df_gas['DURATION'] / 3600

In [None]:
# Same 1/3600 rescaling for DURATION_TASKTYPE so it is comparable with DURATION (presumably seconds -> hours; TODO confirm).
# NOTE: overwrites the column in place; re-running this cell divides by 3600 again.
df_gas['DURATION_TASKTYPE'] = df_gas['DURATION_TASKTYPE'] / 3600

In [None]:
df_gas.shape

In [None]:
df_gas['TASKTYPE']

In [None]:
# Ten task types with the highest mean DURATION, largest first.
data = (
    df_gas
    .groupby('TASKTYPE', as_index=False)['DURATION']
    .mean()
    .sort_values(by='DURATION', ascending=False)
    .iloc[:10]
)

# Duration by Work Type

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 5))
sns.barplot(data=data, x='TASKTYPE', y='DURATION')
plt.title("Top 10 Longest Duration for Tasks (GAS)")
plt.ylabel("Average Duration (hours)")
plt.xlabel("Work Type")
plt.xticks(fontsize=8, rotation=90)
plt.show()

In [None]:
df_gas['DURATION'].dtype

In [None]:
def mark_outliers(group):
    """Flag rows whose DURATION lies outside the 1.5 * IQR fences of ``group``.

    Args:
        group: DataFrame with a numeric ``DURATION`` column (here: the rows of
            one TASKTYPE, as passed by ``groupby.apply``).

    Returns:
        A new DataFrame with the same rows plus three boolean columns:
        ``FAST_COMPLETION`` (DURATION below Q1 - 1.5 * IQR),
        ``SLOW_COMPLETION`` (DURATION above Q3 + 1.5 * IQR) and
        ``is_outlier`` (either of the two). The input frame is left untouched.
    """
    duration = group['DURATION']
    q1 = duration.quantile(0.25)
    q3 = duration.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    fast = duration < lower
    slow = duration > upper
    # assign() returns a new frame; writing columns into the object handed in
    # by groupby.apply mutates pandas' internal group and is not supported.
    return group.assign(
        FAST_COMPLETION=fast,
        SLOW_COMPLETION=slow,
        is_outlier=fast | slow,
    )

# Flag fast/slow/outlier rows separately within each TASKTYPE by applying mark_outliers per group.
# NOTE(review): newer pandas releases reportedly deprecate passing the grouping column to
# groupby.apply — confirm TASKTYPE is still present in the result on the pandas version in use.
df_gas_group = df_gas.groupby('TASKTYPE', group_keys=False).apply(mark_outliers)


In [None]:
df_fast = df_gas_group[df_gas_group['FAST_COMPLETION']]
df_fast.head()

In [None]:
df_fast = df_fast.groupby('TASKTYPE')['DURATION'].mean().reset_index().sort_values(by='DURATION')

In [None]:
df_slow = df_gas_group[df_gas_group['SLOW_COMPLETION']]

In [None]:
df_slow = df_slow.groupby('TASKTYPE')['DURATION'].mean().reset_index().sort_values(by='DURATION')

In [None]:
df_tasks_duration = df_gas_group.groupby('TASKTYPE')['DURATION'].mean().reset_index()

In [None]:
df_tasks_duration = df_tasks_duration.merge(df_fast, how='left', on='TASKTYPE')

In [None]:
df_tasks_duration = df_tasks_duration.merge(df_slow, how='left', on='TASKTYPE')

In [None]:
# Replace the merge-generated DURATION_x / DURATION_y / DURATION labels with descriptive names.
df_tasks_duration = df_tasks_duration.rename(
    columns={
        'DURATION_x': 'ACTUAL_DURATION (HOURS)',
        'DURATION_y': 'FAST_COMPLETION (HOURS)',
        'DURATION': 'SLOW_COMPLETION (HOURS)',
    }
)

In [None]:
df_non_outliers = df_gas_group[~df_gas_group['is_outlier']]

In [None]:
df_gas_nonoutliers = df_non_outliers.groupby('TASKTYPE')['DURATION'].mean().reset_index()

In [None]:
df_gas_nonoutliers

In [None]:
df_gas_nonoutliers

In [None]:
df_tasks_duration

In [None]:
df = df_tasks_duration.merge(df_gas_nonoutliers, how='left', on='TASKTYPE')

In [None]:
df.head()

In [None]:
df.head()

In [None]:
# Label the mean duration computed on non-outlier rows.
df = df.rename(columns={'DURATION': 'DURATION_NO_OUTLIERS (HOURS)'})

In [None]:
df.head()

In [None]:
df_hours = df.sort_values(by='ACTUAL_DURATION (HOURS)', ascending=False).head(20)
df_hours

In [None]:
df_gas[df_gas['TASKTYPE'] == 'GAS IMPROVEMENT OF SERVICE SDG CPD']['DURATION_TASKTYPE']

In [None]:
df_gaming = df[~df['FAST_COMPLETION (HOURS)'].isna()].sort_values(by='FAST_COMPLETION (HOURS)').head(10)

In [None]:
df_gaming

# Potential Gaming

In [None]:
df_melt = df_gaming.sort_values(by='ACTUAL_DURATION (HOURS)', ascending=False).melt(id_vars='TASKTYPE', value_vars=['ACTUAL_DURATION (HOURS)', 'FAST_COMPLETION (HOURS)'],
                  var_name='Type', value_name='Duration (hrs)')

plt.figure(figsize=(12,6))
sns.barplot(x='Duration (hrs)', y='TASKTYPE', hue='Type', data=df_melt, orient='h')
plt.title("Tasks Completed Unusually Fast")
plt.xlabel("Duration (hours)")
plt.ylabel("Task Type")
plt.legend(title="Duration Type")
plt.tight_layout()
plt.show()

In [None]:
df_gas_group.head()

In [None]:
df_no_slow = df_gas_group[~df_gas_group['SLOW_COMPLETION']]

In [None]:
df_no_slow = df_no_slow.groupby('TASKTYPE')['DURATION'].mean().reset_index()

In [None]:
df_no_slow

In [None]:
df_no_slow = df_hours.merge(df_no_slow, how='left', on='TASKTYPE')

In [None]:
# Label the mean duration computed with slow-completion rows excluded.
df_no_slow = df_no_slow.rename(columns={'DURATION': 'DURATION_NO_SLOW (HOURS)'})

In [None]:
df_no_slow

In [None]:
df_melt = df_no_slow.sort_values(by='ACTUAL_DURATION (HOURS)', ascending=False).melt(id_vars='TASKTYPE', value_vars=['ACTUAL_DURATION (HOURS)', 'DURATION_NO_SLOW (HOURS)'],
                  var_name='Type', value_name='Duration (hrs)')

plt.figure(figsize=(12,6))
sns.barplot(x='Duration (hrs)', y='TASKTYPE', hue='Type', data=df_melt, orient='h')
plt.title("Tasks Without Late Logs")
plt.xlabel("Duration (hours)")
plt.ylabel("Task Type")
plt.legend(title="Duration Type")
plt.tight_layout()
plt.show()

# Impact on Actual Duration / Expected Duration

In [None]:
df_melt = df_hours.melt(id_vars='TASKTYPE', value_vars=['ACTUAL_DURATION (HOURS)', 'DURATION_NO_OUTLIERS (HOURS)'],
                  var_name='Type', value_name='Duration (hrs)')

plt.figure(figsize=(12,6))
sns.barplot(x='Duration (hrs)', y='TASKTYPE', hue='Type', data=df_melt, orient='h')
plt.title("Task Duration Comparison (With vs Without Outliers)")
plt.xlabel("Duration (hours)")
plt.ylabel("Task Type")
plt.legend(title="Duration Type")
plt.tight_layout()
plt.show()

In [None]:
# Share of tasks flagged as outliers within each task type (fraction on a 0-1 scale, not percent), highest first
df_gas_group.groupby('TASKTYPE')['is_outlier'].mean().sort_values(ascending=False)

In [None]:
# Profile outlier vs non-outlier tasks: duration statistics, mean crew/engineer
# requirements, mean emergency flag, and how many distinct regions / task types appear.
summary = df_gas_group.groupby('is_outlier').agg(
    dict(
        DURATION=['mean', 'median', 'count'],
        NUMBEROFREQUIREDENGINEERS='mean',
        REQUIREDCREWSIZE='mean',
        SEMPRAEMERGENCY='mean',
        REGION=pd.Series.nunique,
        TASKTYPE=pd.Series.nunique,
    )
)

print(summary.to_dict())

In [None]:
df_gas_group[df_gas_group['TASKTYPE'] == 'REML PRTN POLE RPLMNT CMP SDE CPD'].iloc[:10, :10]

In [None]:
df_planned = df_gas_group.copy()

In [None]:
df_planned['TIME_DIFFERENCE'] = df_planned['DURATION'] - df_planned['DURATION_TASKTYPE']

In [None]:
df_planned['TIME_DIFFERENCE']

In [None]:
df_planned = df_planned.groupby('TASKTYPE')['TIME_DIFFERENCE'].sum().reset_index().sort_values(by='TIME_DIFFERENCE', ascending=False).head(10)

# Tasks With Largest Difference Between Actual and Expected Duration

In [None]:
plt.figure(figsize=(8, 5))
sns.barplot(data=df_planned, x='TASKTYPE', y='TIME_DIFFERENCE')
plt.title("Top 10 Greatest Difference Between Actual and Expected Duration")
plt.ylabel("Total Difference (hours)")
plt.xlabel("Work Type")
plt.xticks(fontsize=8, rotation=90)
plt.show()

In [None]:
print(df_gas_group.columns.tolist())

In [None]:
df_gas_group.iloc[:5, 150:]

In [None]:
df_gas_group[df_gas_group['TASKTYPE'] == 'REML PRTN POLE RPLMNT CMP SDE CPD'][['PRIORITY_TASKTYPE', 'DISTRICT', 'TIMECREATED', 'MODIFIEDBY', 'TIMEMODIFIED','EARLYSTART', 'DUEDATE', 'LATESTART', 'SCHEDULEDSTART', 'SCHEDULEDFINISH', 'ONSITETIMESTAMP', 'COMPLETIONTIMESTAMP', 'is_outlier'] ]

In [None]:
df_gas_group['SCHEDULEDSTART'] = pd.to_datetime(df_gas_group['SCHEDULEDSTART'], errors='coerce')

In [None]:
df_gas_group['YEAR_MONTH'] = df_gas_group['SCHEDULEDSTART'].dt.to_period('M').astype(str)

In [None]:
tasks_per_month = (
    df_gas_group.groupby('YEAR_MONTH')
      .size()
      .reset_index(name='task_count')
      .sort_values('YEAR_MONTH')
)

In [None]:
tasks_per_month['YEAR_MONTH'] = pd.to_datetime(tasks_per_month['YEAR_MONTH'])


In [None]:
plt.figure(figsize=(12,6))
sns.lineplot(
    data=tasks_per_month,
    x='YEAR_MONTH',
    y='task_count',
    marker='o'
)
plt.title('Task Count Over Time (Month + Year) (GAS)')
plt.xlabel('Month')
plt.ylabel('Number of Tasks')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
df_gas_group['year'] = df_gas_group['SCHEDULEDSTART'].dt.year
df_gas_group['month'] = df_gas_group['SCHEDULEDSTART'].dt.month_name()

In [None]:
heatmap_data = (
    df_gas_group.groupby(['year', 'month'])
      .size()
      .reset_index(name='task_count')
)


In [None]:
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December'
]

In [None]:
heatmap_data['month'] = pd.Categorical(heatmap_data['month'], categories=month_order, ordered=True)

In [None]:
pivot_table = heatmap_data.pivot(index='month', columns='year', values='task_count')


In [None]:
# Plot
plt.figure(figsize=(10,7))
sns.heatmap(pivot_table, annot=True, fmt=".0f", cmap="YlGnBu")
plt.title("Monthly Task Count by Year (Gas)")
plt.xlabel("Year")
plt.ylabel("Month")
plt.tight_layout()
plt.show()

In [None]:
print(df_gas_group.columns.tolist())

In [None]:
df_gas_group.iloc[:5,130:140]

In [None]:
df_gas_group['USEDINMOBILE'].value_counts()

In [None]:
import scipy.stats as stats

# Contingency table of DISTRICT against the is_outlier flag over all tasks
contingency = pd.crosstab(df_gas_group['DISTRICT'], df_gas_group['is_outlier'])

# Perform Chi-square test
chi2, p, dof, expected = stats.chi2_contingency(contingency)

# p is printed with 4 decimals, so very small p-values display as 0.0000
print(f"Chi-square statistic: {chi2:.3f}, p-value: {p:.4f}")

In [None]:
results = {}

In [None]:
# Chi-square test of DISTRICT against is_outlier within each task type;
# `results` maps TASKTYPE -> p-value.
for g, sub in df_gas_group.groupby('TASKTYPE'):
    contingency = pd.crosstab(sub['DISTRICT'], sub['is_outlier'])
    if contingency.size == 0:
        # crosstab drops missing values, so a task type whose DISTRICT values are
        # all missing yields an empty table, on which chi2_contingency raises
        # ValueError. Skip it instead of aborting the whole loop.
        print(g, "skipped: empty contingency table")
        continue
    chi2, p, _, _ = stats.chi2_contingency(contingency)
    results[g] = p
    print(g, "p-value:", p)

In [None]:
test =  pd.DataFrame(list(results.items()), columns=['TASKTYPE', 'P'])

In [None]:
test.sort_values(by='P').head(50)

In [None]:
df_gas_group[df_gas_group['TASKTYPE'] == 'GAS MASTER SERVICE SDG'].shape[0]

In [None]:
df_gas_group[df_gas_group['TASKTYPE'] == 'SERVICE MAINTENANCE SDG'].shape[0]

In [None]:
df_gas_group[df_gas_group['TASKTYPE'] == 'MISC UG CORR MAINT PROG SDE CPD'].shape[0]

In [None]:
df_gas_group[df_gas_group['TASKTYPE'] == 'REPL REINFRC EXIST FACIL UG SDE CPD'].shape[0]

In [None]:
#'REML PRTN POLE RPLMNT CMP SDE CPD'

In [None]:
# One record per task type whose DISTRICT x is_outlier table has at least
# two rows and two columns; smaller (empty or degenerate) tables are skipped.
results = []

for g, sub in df_gas_group.groupby('TASKTYPE'):
    contingency = pd.crosstab(sub['DISTRICT'], sub['is_outlier'])

    if min(contingency.shape) >= 2:
        chi2, p, dof, expected = stats.chi2_contingency(contingency)
        results.append(dict(
            TASKTYPE=g,
            chi2=chi2,
            p_value=p,
            dof=dof,
            num_districts=contingency.shape[0],
            num_outlier_states=contingency.shape[1],
        ))


In [None]:
# EC CALIBRATION SDG                   0.395642
# LEAK SURVEY PUBLIC SAFETY SDG        0.375569
# OH R20A CONVERSION SDG               0.375000
# UG CUST NEW BUSINESS SDG             0.322034

In [None]:
df_gas_group[df_gas_group['TASKTYPE'] == 'EC CALIBRATION SDG'].shape[0]

In [None]:
df_gas_group[(df_gas_group['TASKTYPE'] == 'SERVICE MAINTENANCE SDG') & (df_gas_group['DISTRICT'] == 'BEACH CITIES-GAS')].shape[0] / df_gas_group[df_gas_group['TASKTYPE'] == 'SERVICE MAINTENANCE SDG'].shape[0]

In [None]:
df_gas_group[df_gas_group['TASKTYPE'] == 'LEAK SURVEY PUBLIC SAFETY SDG'].shape[0]

In [None]:
(df_gas_group[df_gas_group['TASKTYPE'] == 'LEAK SURVEY PUBLIC SAFETY SDG']
 .groupby('DISTRICT')['is_outlier']
 .mean()
 .sort_values(ascending=False))

In [None]:
df_gas_group.iloc[:5, -5:]

In [None]:
df_gas_group[df_gas_group['TASKTYPE'] == 'SERVICE MAINTENANCE SDG']['DURATION'].value_counts()

In [None]:
(df_gas_group[df_gas_group['TASKTYPE'] == 'SERVICE MAINTENANCE SDG']
 .groupby('DISTRICT')['SLOW_COMPLETION']
 .mean()
 .sort_values(ascending=False))

In [None]:
df_hours.head(20)

In [None]:
# Mean hours attributable to outliers: overall mean minus the mean without outliers.
df_hours = df_hours.assign(
    DIFFERENCE=df_hours['ACTUAL_DURATION (HOURS)'] - df_hours['DURATION_NO_OUTLIERS (HOURS)']
)

In [None]:
df_hours2 = df_hours.sort_values(by='DIFFERENCE', ascending=False).head(20)

In [None]:
final_test = df_hours2.merge(test, how='inner', on='TASKTYPE')

In [None]:
final_test

In [None]:
# Bar chart of DIFFERENCE per task type, drawn from final_test
plt.figure(figsize=(12,6))

sns.barplot(
    x='TASKTYPE',
    y='DIFFERENCE',  # ACTUAL_DURATION (HOURS) minus DURATION_NO_OUTLIERS (HOURS)
    data=final_test,
    ci=None  # removes error bars; NOTE(review): newer seaborn reportedly replaces `ci` with `errorbar=None` — confirm
)

# Rotate x-axis labels so they are readable
plt.xticks(rotation=90)

# Add title and axis labels
plt.title("Difference in Task Duration per Task Type")
plt.xlabel("Task Type")
plt.ylabel("Difference in Hours")

plt.tight_layout()
plt.show()

In [None]:
test[test['TASKTYPE'] == 'GAS DIST REPLACE MAIN OR SERVICES SDG CPD']['P']

In [None]:
(df_gas_group[df_gas_group['TASKTYPE'] == 'GAS IMPROVEMENT OF SERVICE SDG CPD']
 .groupby('DISTRICT')['is_outlier']
 .mean()
 .sort_values(ascending=False))

In [None]:
df_gas_group[(df_gas_group['TASKTYPE'] == 'GAS IMPROVEMENT OF SERVICE SDG CPD') & (df_gas_group['DISTRICT'] == 'METRO-GAS')]['SLOW_COMPLETION'].mean()

In [None]:
df_gas_group.iloc[:5, -10:]

In [None]:
df_gas_group[(df_gas_group['TASKTYPE'] == 'SERVICE MAINTENANCE SDG') & (df_gas_group['DISTRICT'] == 'METRO-GAS')]['is_outlier'].mean()

In [None]:
sns.barplot(
    x='DISTRICT',
    y='is_outlier',
    data=df_gas_group[df_gas_group['TASKTYPE'] == 'GAS IMPROVEMENT OF SERVICE SDG CPD'],
    estimator=lambda x: sum(x)/len(x)
)
plt.xticks(rotation=90)
plt.title("Outlier Rate by District for GAS IMPROVEMENT OF SERVICE SDG CPD")
plt.show()

In [None]:
(df_gas_group[df_gas_group['TASKTYPE'] == 'SERVICE MAINTENANCE SDG']
 .groupby('DISTRICT')['is_outlier']
 .mean()
 .sort_values(ascending=False))

In [None]:
sns.barplot(
    x='DISTRICT',
    y='is_outlier',
    data=df_gas_group[df_gas_group['TASKTYPE'] == 'SERVICE MAINTENANCE SDG'],
    estimator=lambda x: sum(x)/len(x)
)
plt.xticks(rotation=90)
plt.title("Outlier Rate by District for SERVICE MAINTENANCE SDG")
plt.show()

In [None]:
df_gas_group[(df_gas_group['DISTRICT'] == 'LEAKAGE MITIGATION NORTH') & (df_gas_group['TASKTYPE'] == 'LEAK SURVEY PUBLIC SAFETY SDG')]['is_outlier'].mean()

In [None]:
df_gas_group[df_gas_group['TASKTYPE'] == 'LEAK SURVEY PUBLIC SAFETY SDG'][['DURATION', 'is_outlier']].value_counts()

In [None]:
sns.barplot(
    x='DISTRICT',
    y='is_outlier',
    data=df_gas_group[df_gas_group['TASKTYPE'] == 'LEAK SURVEY PUBLIC SAFETY SDG'],
    estimator=lambda x: sum(x)/len(x)
)
plt.xticks(rotation=90)
plt.title("Outlier Rate by District for LEAK SURVEY PUBLIC SAFETY SDG")
plt.show()

In [None]:
df_gas_group[df_gas_group['TASKTYPE'] == 'REML PRTN POLE RPLMNT CMP SDE CPD'].shape[0]

In [None]:
(df_gas_group[df_gas_group['TASKTYPE'] == 'REML PRTN POLE RPLMNT CMP SDE CPD']
 .groupby('DISTRICT')['is_outlier']
 .mean()
 .sort_values(ascending=False))

In [None]:
# Outlier rate (mean of the boolean is_outlier flag) per district for one task type.
# Tick rotation, title and plt.show() match the other per-task-type outlier-rate plots.
sns.barplot(
    x='DISTRICT',
    y='is_outlier',
    data=df_gas_group[df_gas_group['TASKTYPE'] == 'REML PRTN POLE RPLMNT CMP SDE CPD'],
    estimator=lambda x: sum(x)/len(x)
)
plt.xticks(rotation=90)
plt.title("Outlier Rate by District for REML PRTN POLE RPLMNT CMP SDE CPD")
plt.show()

In [None]:
# Per-task-type chi-square records as a table, smallest p-value first
results_df = pd.DataFrame(results).sort_values('p_value')

# Task types whose p-value is below 0.05
significant = results_df.loc[results_df['p_value'] < 0.05]

In [None]:
# Contingency table of TASKTYPE against the is_outlier flag over all tasks
contingency = pd.crosstab(df_gas_group['TASKTYPE'], df_gas_group['is_outlier'])

# Perform Chi-square test
chi2, p, dof, expected = stats.chi2_contingency(contingency)

# p is printed with 4 decimals, so very small p-values display as 0.0000
print(f"Chi-square statistic: {chi2:.3f}, p-value: {p:.4f}")