### Date Handling

Also covers performing a LEFT JOIN of the data against a weekly calendar.

In [1]:
# Consolidate all imports here
import pandas as pd
import numpy as np
import datetime as dt
from collections import Counter

In [2]:
# Generate weekly calendar
# One row per pandas 'W' period (Monday through Sunday), from the week containing
# 2021-12-01 up to the week containing today, newest week first.
#   Week Range       - the weekly Period itself
#   Date Pulled      - first day of the week (the join key used later)
#   Date Pulled Year - calendar year of 'Date Pulled'
#   Week End         - last day of the week, at midnight
#   Week Number      - 1 for the oldest week, increasing by one per week
calendar = (
    pd.DataFrame({
        'Week Range': pd.date_range(start='2021-12-01', end=dt.datetime.now()).to_period('W').unique()
    })
    .assign(**{
        # Read the boundaries straight off the Period values instead of
        # round-tripping through their 'start/end' string form.
        'Date Pulled': lambda weeks: weeks['Week Range'].dt.start_time,
        # end_time is the last nanosecond of the week; normalize() moves it to midnight.
        'Week End': lambda weeks: weeks['Week Range'].dt.end_time.dt.normalize(),
        # Periods are generated oldest-first, so numbering happens before the sort below.
        'Week Number': lambda weeks: np.arange(1, len(weeks) + 1, dtype=np.int64),
    })
    .assign(**{'Date Pulled Year': lambda weeks: weeks['Date Pulled'].dt.year})
    .loc[:, ['Week Range', 'Date Pulled', 'Date Pulled Year', 'Week End', 'Week Number']]
    .sort_values(by='Date Pulled', ascending=False)
    .reset_index(drop=True)
)
calendar

Unnamed: 0,Week Range,Date Pulled,Date Pulled Year,Week End,Week Number
0,2022-03-14/2022-03-20,2022-03-14,2022,2022-03-20,16
1,2022-03-07/2022-03-13,2022-03-07,2022,2022-03-13,15
2,2022-02-28/2022-03-06,2022-02-28,2022,2022-03-06,14
3,2022-02-21/2022-02-27,2022-02-21,2022,2022-02-27,13
4,2022-02-14/2022-02-20,2022-02-14,2022,2022-02-20,12
5,2022-02-07/2022-02-13,2022-02-07,2022,2022-02-13,11
6,2022-01-31/2022-02-06,2022-01-31,2022,2022-02-06,10
7,2022-01-24/2022-01-30,2022-01-24,2022,2022-01-30,9
8,2022-01-17/2022-01-23,2022-01-17,2022,2022-01-23,8
9,2022-01-10/2022-01-16,2022-01-10,2022,2022-01-16,7


In [3]:
# Create a dataframe of data to join on the calendar
# 'Date Pulled' is parsed to datetimes up front; 'Data' is a placeholder payload column.
df = pd.DataFrame({
    'Date Pulled': pd.to_datetime([
        '2021-11-29', '2021-12-07', '2021-12-13', '2021-12-20',
        '2021-12-27', '2022-01-03', '2022-01-10', '2022-01-17',
        '2022-01-25', '2022-01-31', '2022-02-07', '2022-02-15',
    ]),
    'Data': ['*'] * 12,
})
df

Unnamed: 0,Date Pulled,Data
0,2021-11-29,*
1,2021-12-07,*
2,2021-12-13,*
3,2021-12-20,*
4,2021-12-27,*
5,2022-01-03,*
6,2022-01-10,*
7,2022-01-17,*
8,2022-01-25,*
9,2022-01-31,*


In [4]:
# Left join df and calendar
# The join is an exact match on 'Date Pulled', so a row only picks up calendar
# values when its date equals a calendar 'Date Pulled'; other rows get NaT / NaN.
df_merge = (
    df.merge(calendar, how='left', on='Date Pulled')
    # The calendar's year is empty on unmatched rows, so rebuild it from the date itself.
    .assign(**{'Date Pulled Year': lambda merged: merged['Date Pulled'].dt.year})
    .loc[:, ['Date Pulled', 'Date Pulled Year', 'Week End', 'Week Number']]
    .assign(**{'Week End': lambda merged: pd.to_datetime(merged['Week End'])})
)

# Full result first, then only the rows that found no calendar match
display(df_merge)
display(df_merge.loc[df_merge['Week Number'].isnull()])

Unnamed: 0,Date Pulled,Date Pulled Year,Week End,Week Number
0,2021-11-29,2021,2021-12-05,1.0
1,2021-12-07,2021,NaT,
2,2021-12-13,2021,2021-12-19,3.0
3,2021-12-20,2021,2021-12-26,4.0
4,2021-12-27,2021,2022-01-02,5.0
5,2022-01-03,2022,2022-01-09,6.0
6,2022-01-10,2022,2022-01-16,7.0
7,2022-01-17,2022,2022-01-23,8.0
8,2022-01-25,2022,NaT,
9,2022-01-31,2022,2022-02-06,10.0


Unnamed: 0,Date Pulled,Date Pulled Year,Week End,Week Number
1,2021-12-07,2021,NaT,
8,2022-01-25,2022,NaT,
11,2022-02-15,2022,NaT,


How to take the rows that did not match the calendar and line them up with the week range they fall in:
    <br>1) For weeks between the boundaries, check for interruptions in the sequence of week numbers
    <br>2) For missing boundary weeks, check whether the row's index value is in the first or last position

In [39]:
# 1) Check for weeks missing between the boundaries; look for interruptions in the sequence
# Builds {year (as a string): [week numbers absent between that year's first and
# last matched week]}. Weeks missing before the first or after the last matched
# week of a year are not detected here.

missing_weeks_nonboundary = {}

for year in df_merge['Date Pulled Year'].unique():
    # Week numbers that did match the calendar for this year (NaN means no match)
    matched_weeks = sorted(
        df_merge.loc[df_merge['Date Pulled Year'] == year, 'Week Number']
        .dropna()
        .astype(np.int64)
        .unique()
        .tolist()
    )
    print(matched_weeks)

    if not matched_weeks:
        # No row of this year matched the calendar, so there is no sequence to scan
        missing_weeks_nonboundary[f'{year}'] = []
        continue

    present = set(matched_weeks)
    # The last matched week is present by definition, so the half-open range is enough
    missing_weeks_nonboundary[f'{year}'] = [
        week for week in range(matched_weeks[0], matched_weeks[-1]) if week not in present
    ]

print(missing_weeks_nonboundary.keys())
# Print every year that is actually present rather than hard-coded keys
for gaps in missing_weeks_nonboundary.values():
    print(gaps)

[1, 3, 4, 5]
[6, 7, 8, 10, 11]
dict_keys(['2021', '2022'])
[2]
[9]


In [41]:
# 2) For boundaries missing, check if the index value is the first or last position
# Check first and last row index

missing_weeks_boundary = {}

# Index labels of the first and last rows of df_merge
first_index = df_merge.index[0]
last_index = df_merge.index[-1]

# Get the index values of all rows that didn't match with the calendar
null_entry_index = df_merge[df_merge['Week Number'].isnull()].index.tolist()

# Sorted week numbers that did match the calendar, keyed by year (as a string)
for y in df_merge['Date Pulled Year'].unique():
    missing_weeks_boundary[f'{y}'] = sorted(
        df_merge[df_merge['Date Pulled Year']==y]['Week Number'].dropna().astype(np.int64).unique().tolist()
    )

# Unmatched rows that sit on the first or last row of df_merge
boundary_null_index = [idx for idx in null_entry_index if idx in (first_index, last_index)]

# TODO (REVIEW THIS): for each boundary index identified, maybe the missing week can be
# derived by +1 or -1 from that year's matched Week Number range in missing_weeks_boundary.


In [None]:
### EVERYTHING BELOW IS OLDER REFERENCE CODE - IT STILL NEEDS TO BE CHANGED

In [None]:
# Using the dictionary of missing week numbers, we then create another dictionary that shows the week ranges that are associated with those week numbers
# Input: missing_weeks_nonboundary, i.e. {year as a string: [missing week numbers]}.

calendar_missing_weeks = {}

for key, data in missing_weeks_nonboundary.items():
    # Calendar rows of that year whose week number is one of the missing ones
    calendar_missing_weeks[f'{key}'] = calendar[
        (calendar['Date Pulled'].dt.year.astype(str) == key) &
        (calendar['Week Number'].isin(data))
    ]

# # Check if dictionary looks good
# display(calendar_missing_weeks['2021'])
# display(calendar_missing_weeks['2022'])

# Consolidate the dictionaries into a single dataframe
# DataFrame.append no longer exists in pandas 2.x, so all pieces are concatenated in
# one call; the calendar's original index labels are kept.
if calendar_missing_weeks:
    df_missing_weeks_nonboundary = pd.concat(list(calendar_missing_weeks.values()))
else:
    # pd.concat rejects an empty list; fall back to an empty frame with the calendar's columns
    df_missing_weeks_nonboundary = calendar.iloc[0:0].copy()

df_missing_weeks_nonboundary

# NOTE: This only outputs 1 record for 2022 because the 3rd missing week is on a boundary index - will need to account for this with different logic

In [None]:
# We create another dictionary for the exact dates that are missing
# {year (as a string): DatetimeIndex of 'Date Pulled' values that found no calendar match}
missing_dates = {}
week_number_null = df_merge['Week Number'].isnull()

# Iterate over the years present in df_merge directly; only the year keys are needed here
for year in df_merge['Date Pulled Year'].unique():
    key = f'{year}'
    date_pulled_key = df_merge['Date Pulled Year'].astype(str) == key
    missing_dates[key] = pd.to_datetime(
        df_merge.loc[date_pulled_key & week_number_null, 'Date Pulled'].unique()
    )

# print(missing_dates.keys())
# print(missing_dates['2022'])

# Consolidate dictionaries into single dataframe
# One small frame per year, concatenated in a single call (DataFrame.append no
# longer exists in pandas 2.x); ignore_index gives a clean 0..n-1 index.
missing_date_frames = [
    pd.DataFrame({'Year': key, 'Missing Date': data})
    for key, data in missing_dates.items()
]
if missing_date_frames:
    df_missing_dates = pd.concat(missing_date_frames, ignore_index=True)
else:
    # pd.concat rejects an empty list; keep the expected columns for the cells below
    df_missing_dates = pd.DataFrame(columns=['Year', 'Missing Date'])

df_missing_dates['Year'] = df_missing_dates['Year'].astype(str)
df_missing_dates

In [None]:
# List keys for all dictionaries created so far
# The dictionary of missing week numbers is named missing_weeks_nonboundary in this notebook
print(missing_weeks_nonboundary.keys())
print(calendar_missing_weeks.keys())
print(missing_dates.keys())

In [None]:
# First see the output that results from passing the same key to the dictionaries
# The dictionary of missing week numbers is named missing_weeks_nonboundary in this notebook
# display(missing_weeks_nonboundary['2022'])
# display(calendar_missing_weeks['2022'])
display(missing_weeks_nonboundary['2021'])
display(missing_weeks_nonboundary['2022'])
display(missing_dates['2021'])
display(missing_dates['2022'])

In [None]:
# Need to try and get the date check condition in this dataframe
# Placeholder column to be filled with the original (unmatched) pull date. A scalar is
# broadcast to every row, so this does not depend on how many missing weeks were found.
df_missing_weeks_nonboundary['Original Date Pulled'] = ''
# df_missing_weeks_nonboundary.rename(columns={'Date Pulled':'Week Start'}, inplace=True)
# df_missing_weeks_nonboundary.rename(columns={'Original Date Pulled':'Date Pulled'}, inplace=True)
# Year as a string so it can be compared with the string 'Year' column of df_missing_dates
df_missing_weeks_nonboundary['Date Pulled Year'] = df_missing_weeks_nonboundary['Date Pulled Year'].astype(str)
df_missing_weeks_nonboundary

In [None]:
# Keep track of dataframes
# Show both working frames one after the other: missing dates, then missing weeks
display(df_missing_dates, df_missing_weeks_nonboundary)

In [None]:
# Use the dfs to try and do lookup

# Week boundaries of the first row of df_missing_weeks_nonboundary and the first
# missing date recorded for 2021 (assumes that first row is the 2021 week - TODO confirm)
start_cond = df_missing_weeks_nonboundary['Date Pulled'].iloc[0]
end_cond = df_missing_weeks_nonboundary['Week End'].iloc[0]
date_var = df_missing_dates.loc[df_missing_dates['Year']=='2021', 'Missing Date'].iloc[0]

# TEST 2021
if start_cond <= date_var <= end_cond:
    # Convert the placeholder column first ('' becomes NaT) so the timestamp is written
    # into a datetime column rather than into a column of strings.
    df_missing_weeks_nonboundary['Original Date Pulled'] = pd.to_datetime(df_missing_weeks_nonboundary['Original Date Pulled'])
    # Single positional write on the frame itself; a chained df[col].iloc[0] = ... can
    # end up modifying a temporary copy instead of the frame.
    original_col = df_missing_weeks_nonboundary.columns.get_loc('Original Date Pulled')
    df_missing_weeks_nonboundary.iloc[0, original_col] = date_var
    display(df_missing_weeks_nonboundary)
else:
    print('Not within range')

# NEED TO FIGURE OUT HOW TO INCORPORATE INTO LOOP

In [None]:
# Use the dfs to try and do lookup

# TEST WITH 2022 - HOW TO HANDLE 2 DATE VAR
# For every missing date, report whether it falls inside one of the missing
# (non-boundary) week ranges of the same year.

df_missing_weeks_nonboundary['Date Pulled Year'] = df_missing_weeks_nonboundary['Date Pulled Year'].astype(np.int64)

for i in df_missing_dates['Year'].unique():
    dates_in_year = df_missing_dates.loc[df_missing_dates['Year']==i, 'Missing Date']
    print(dates_in_year)

    # 'Year' holds strings while 'Date Pulled Year' was just cast to int64
    weeks_in_year = df_missing_weeks_nonboundary[df_missing_weeks_nonboundary['Date Pulled Year']==int(i)]

    # Test each date on its own: an `if` needs a single True/False, not a whole Series
    for missing_date in dates_in_year:
        in_range = (
            (weeks_in_year['Date Pulled'] <= missing_date) &
            (weeks_in_year['Week End'] >= missing_date)
        ).any()
        if in_range:
            print('Nice')
        else:
            print('Not nice')

# NEED TO FIGURE OUT HOW TO WRITE THE MATCHED DATE BACK INTO 'Original Date Pulled'