In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [7]:
# Location of the parquet file that is read into `labeled_df`.
LABELED_PATH = 'data/labels.parquet'

In [None]:
# Load the labeled data from LABELED_PATH, then show its shape and first rows.
labeled_df = pd.read_parquet(LABELED_PATH)
print(labeled_df.shape)
labeled_df.head()

In [None]:
def plot_cumsum_demand(labeled_df, target_demand):
    """Plot the cumulative demand share by location rank and select the top locations.

    Locations are ranked by their total ``Demand`` (largest first). The curve
    shows, for each 1-based rank, the fraction of the overall demand covered
    by the locations up to and including that rank.

    Parameters
    ----------
    labeled_df : pandas.DataFrame
        Must contain the columns ``'Location'`` and ``'Demand'``.
    target_demand : float
        Cut-off on the cumulative demand share (the curve runs on a 0-1 scale).

    Returns
    -------
    numpy.ndarray
        The highest-demand locations whose cumulative demand share is at most
        ``target_demand``, ordered from highest to lowest total demand.
    """
    grouped_df = (
        labeled_df.groupby('Location')['Demand']
        .sum()
        .reset_index()
        .sort_values(by='Demand', ascending=False)
        .reset_index(drop=True)
    )
    cumulative_demand = (grouped_df['Demand'].cumsum() / grouped_df['Demand'].sum()).to_numpy()

    # The x-axis is the 1-based rank; its length follows the data instead of
    # assuming a fixed number of locations.
    plt.plot(range(1, len(cumulative_demand) + 1), cumulative_demand)
    plt.xlabel('Location numbers')
    plt.ylabel('Cumulative Demand')
    plt.title('Cumulative Demand by Location')

    # Number of leading ranks whose cumulative share is within the target.
    # argmin() finds the first rank above the target, but it also returns 0
    # when *no* rank is above it, so that case is handled explicitly.
    within_target = cumulative_demand <= target_demand
    if within_target.all():
        target_x = len(within_target)
    else:
        target_x = int(within_target.argmin())

    plt.axhline(y=target_demand, color='r', linestyle='--', label=f'Target Demand ({target_demand})')
    plt.axvline(x=target_x, color='g', linestyle='--', label=f'Target Location ({target_x})')

    plt.legend()
    plt.show()

    important_locations = grouped_df.iloc[:target_x, grouped_df.columns.get_loc('Location')]
    return important_locations.to_numpy()

In [None]:
# Plot the cumulative demand curve and keep the top-ranked locations selected with a 0.97 target.
important_locations = plot_cumsum_demand(labeled_df, 0.97)

In [None]:
def plot_cumsum_demand_LocationId(labeled_df, target_demand):
    """Plot the cumulative demand share with LocationIDs on the x-axis.

    Locations are ranked by their total ``Demand`` (largest first). The bottom
    x-axis is labelled with the LocationID at each rank, the top x-axis with
    the rank number itself. A horizontal line marks ``target_demand`` and a
    vertical line marks the number of leading ranks whose cumulative share is
    at most ``target_demand``.

    Parameters
    ----------
    labeled_df : pandas.DataFrame
        Must contain the columns ``'Location'`` and ``'Demand'``.
    target_demand : float
        Cut-off on the cumulative demand share (the curve runs on a 0-1 scale).

    Returns
    -------
    pandas.DataFrame
        Columns ``'Location'`` and ``'Demand'`` (total per location), sorted by
        ``'Demand'`` in descending order with a fresh 0-based index.
    """
    grouped_df = (
        labeled_df.groupby('Location')['Demand']
        .sum()
        .reset_index()
        .sort_values(by='Demand', ascending=False)
        .reset_index(drop=True)
    )
    cumulative_demand = (grouped_df['Demand'].cumsum() / grouped_df['Demand'].sum()).to_numpy()
    n_locations = len(cumulative_demand)
    # 1-based ranks; sized from the data so tick positions and labels always match.
    ranks = range(1, n_locations + 1)

    fig = plt.figure(figsize=(50, 10))
    ax1 = fig.add_subplot(111)
    ax1.plot(ranks, cumulative_demand)
    ax1.set_xticks(ranks, grouped_df['Location'], rotation=90)
    ax1.set_xlabel('LocationID', fontsize=20)
    # The y-label belongs on ax1: the twin created by twiny() hides its y-axis,
    # so a label set there is never drawn.
    ax1.set_ylabel('Cumulative Demand', fontsize=25)
    ax1.autoscale(enable=True, axis='x', tight=True)

    # Secondary (top) x-axis showing the rank numbers in steps of 50.
    x = np.arange(0, n_locations + 1, 50)
    ax2 = ax1.twiny()
    ax2.set_xlim(ax1.get_xlim())
    ax2.set_xticks(x, x, rotation=90, fontsize=20)
    ax2.set_xlabel('Location numbers', fontsize=20)
    ax2.set_title('Cumulative Demand by Location', fontsize=27)
    ax1.tick_params(axis='both', labelsize=13)

    # Number of leading ranks whose cumulative share is within the target.
    # argmin() returns 0 both when the first rank already exceeds the target
    # and when no rank exceeds it; the second case must map to "all ranks".
    within_target = cumulative_demand <= target_demand
    if within_target.all():
        target_x = n_locations
    else:
        target_x = int(within_target.argmin())

    # Last LocationID still inside the target; None when no rank qualifies
    # (a label lookup at target_x - 1 == -1 would raise KeyError).
    if target_x > 0:
        target_location = grouped_df['Location'].iloc[target_x - 1]
    else:
        target_location = None

    # Draw on ax2 explicitly (it is the current axes after twiny()).
    ax2.axhline(y=target_demand, color='r', linestyle='--', label=f'Target Demand ({target_demand})')
    ax2.axvline(x=target_x, color='g', linestyle='--', label=f'Target LocationID <-({target_location})')

    ax2.legend(loc='lower right', fontsize=20)
    plt.show()
    return grouped_df

In [None]:
# Same curve with LocationIDs on the x-axis; the function returns the demand-sorted table.
sort_demand_loc = plot_cumsum_demand_LocationId(labeled_df, 0.97)
# Show rows 45-54 of the demand-sorted table.
sort_demand_loc[45:55]

As we can see, there is a huge gap between LocationID 24 and LocationID 70; in any case, we choose the LocationIDs whose total demand is greater than that of LocationID 41.


In [None]:
# Show the locations returned by plot_cumsum_demand.
print(f'important_locations: {important_locations}')

In [None]:
def zero_demand_location(labeled_df, n_locations=265):
    """List locations with at least one zero-demand record or no records at all.

    Two groups of locations are reported:

    * locations present in ``labeled_df`` whose minimum ``Demand`` is 0, and
    * location ids in ``1..n_locations`` that never appear in ``labeled_df``
      (their total demand is reported as 0).

    The combined count and its percentage of ``n_locations`` are printed.

    Parameters
    ----------
    labeled_df : pandas.DataFrame
        Must contain the columns ``'Location'`` and ``'Demand'``.
    n_locations : int, default 265
        Size of the location id range ``1..n_locations`` used both to find
        absent locations and as the denominator of the percentage.

    Returns
    -------
    pandas.DataFrame
        Columns ``'zero_demand_location'`` and ``'total_demand'``, sorted by
        ``'total_demand'`` in descending order with a fresh 0-based index.
    """
    demand_by_location = labeled_df.groupby('Location')['Demand']

    # Locations present in the data with at least one zero-demand record.
    has_zero_demand = demand_by_location.min() == 0
    low_demand_location = has_zero_demand[has_zero_demand].index

    # Location ids that never occur in the data. A set gives O(1) membership
    # tests instead of scanning the whole column for every id.
    observed_locations = set(labeled_df['Location'].unique().tolist())
    zero_demand_loc = [i for i in range(1, n_locations + 1) if i not in observed_locations]

    # Count the absent locations from the data rather than assuming a fixed number.
    number_low_demand_location = len(low_demand_location) + len(zero_demand_loc)
    percentage_low_demand_location = number_low_demand_location * 100 / n_locations

    print(f'number of locations that have at least one zero demand: {number_low_demand_location}')
    print(f'percentage of locations that have at least one zero demand: {percentage_low_demand_location}')

    # One grouped sum replaces a filtered sum per location; absent locations
    # contribute a total of 0 each, keeping both columns the same length.
    total_demand = demand_by_location.sum()
    low_demand = pd.DataFrame({
        'zero_demand_location': list(low_demand_location) + zero_demand_loc,
        'total_demand': list(total_demand.loc[low_demand_location]) + [0] * len(zero_demand_loc),
    })
    low_demand = low_demand.sort_values(by='total_demand', ascending=False).reset_index(drop=True)

    return low_demand

In [None]:
# Build the table of locations with a zero-demand record or absent from the data, and preview it.
low_demand = zero_demand_location(labeled_df)
low_demand.head()