In [1]:
import pandas as pd

In [3]:
df_train = pd.read_csv('../../raw_data/egg_sales/train_egg_sales.csv', sep=';')
df_test = pd.read_csv('../../raw_data/egg_sales/test_egg_sales.csv')
df_train.head()

Unnamed: 0,Date,Egg Sales
0,1993-01-01,91
1,1993-01-02,124
2,1993-01-03,112
3,1993-01-04,115
4,1993-01-05,107


In [4]:
import pandas as pd

# Load train data (has values)
df_train = df_train.rename(columns={'Date': 'ts', 'Egg Sales': 'egg_sales'})
df_train['ts'] = pd.to_datetime(df_train['ts'])
df_train = df_train.sort_values('ts')

# Load test data (only dates, no values)
df_test = df_test.rename(columns={'Date': 'ts'})
df_test['ts'] = pd.to_datetime(df_test['ts'])
df_test = df_test.sort_values('ts')
df_test['egg_sales'] = pd.NA  # No values in test

# Merge train and test
df = pd.concat([df_train, df_test], ignore_index=True)
df = df.sort_values('ts').reset_index(drop=True)

# Check for issues
missing_values = df.isnull().sum()
time_deltas = df['ts'].diff()
most_common_freq = time_deltas.mode()[0]
duplicate_timestamps = df['ts'].duplicated().sum()
irregular_intervals = df[time_deltas != most_common_freq]

print("Missing values per column:\n", missing_values)
print("Most common frequency between timestamps:", most_common_freq)
print("Number of duplicate timestamps:", duplicate_timestamps)
print("Number of irregular intervals:", irregular_intervals.shape[0])

# Save cleaned data
df.to_csv('../../transformed_data/cleaned_egg_sales.csv', index=False)


Missing values per column:
 ts             0
egg_sales    358
dtype: int64
Most common frequency between timestamps: 1 days 00:00:00
Number of duplicate timestamps: 0
Number of irregular intervals: 1


In [5]:
df.head()

Unnamed: 0,ts,egg_sales
0,1993-01-01,91
1,1993-01-02,124
2,1993-01-03,112
3,1993-01-04,115
4,1993-01-05,107


We don't have test sales, that's why test values are missing

In [6]:
df.tail()

Unnamed: 0,ts,egg_sales
10945,2022-12-20,
10946,2022-12-21,
10947,2022-12-22,
10948,2022-12-23,
10949,2022-12-24,
