In [349]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from pathlib import Path
import yfinance as yf
from pandas_datareader import data as pdr

In [350]:
# Directories, relative to the notebook's working directory, for CSV data and saved plots
csv_path = Path("csv")
plots_path = Path("plots")

In [351]:
# Load the BrickEconomy set export from the csv directory
df = pd.read_csv(csv_path.joinpath('BrickEconomy-sets.csv'))

In [352]:
# Columns from the export that are discarded before further processing
unused_columns = [
    'Subtheme', 'Paid', 'Growth', 'Condition', 'Date',
    'Notes', 'Collection', 'Status', 'URL',
]
df = df.drop(columns=unused_columns)

# Save the trimmed table next to the original export
df.to_csv(csv_path / 'brickeconomy_new.csv')

In [353]:
# Lower-case every column name, then give the two date columns snake_case names
df = df.rename(columns=str.lower).rename(
    columns={
        'releaseddate': 'released_date',
        'retireddate': 'retired_date',
    }
)

df

Unnamed: 0,number,name,theme,year,pieces,minifigs,availability,retired,released_date,retired_date,retail,value
0,75144-1,Snowspeeder,Star Wars,2017,1703,2,Exclusive,True,05/05/2017,15/01/2019,199;99 €,330;40 €
1,10251-1,Brick Bank,Icons,2016,2380,6,RetailLimited,True,02/01/2016,14/11/2018,149;99 €,494;81 €
2,10252-1,Volkswagen Beetle,Icons,2016,1167,0,Exclusive,True,01/08/2016,03/12/2020,89;99 €,113;14 €
3,10253-1,Big Ben,Icons,2016,4163,0,Exclusive,True,02/07/2016,25/11/2018,219;99 €,327;12 €
4,10702-1,Creative Building Set,Classic,2016,583,0,Retail,True,02/01/2016,24/11/2017,24;99 €,30;14 €
...,...,...,...,...,...,...,...,...,...,...,...,...
17614,662403-1,Zombie with Burning Baby Zombie and TNT,Minecraft,2024,12,2,Promotional,True,01/01/2024,01/02/2024,0;00 €,4;73 €
17615,9790-1,ROBOLAB Team Challenge Set,Education,1999,725,0,Retail,True,01/01/1999,01/01/2001,175;99 €,193;78 €
17616,2000443-1,Workshop Kit Freewheeler,Education,2015,82,1,RetailLimited,True,01/01/2015,01/01/2017,8;79 €,11;00 €
17617,2000442-1,Workshop Kit Spinning Top,Education,2015,10,0,Retail,True,01/01/2015,01/01/2017,8;79 €,23;77 €


In [354]:
# Count the missing values in each column
df.isna().sum()

number            0
name              1
theme             0
year              0
pieces            0
minifigs          0
availability      0
retired           0
released_date    19
retired_date     15
retail            0
value             0
dtype: int64

In [355]:
# Function to clean and convert currency values
def clean_currency(value):
    """Convert a price such as ``'199;99 €'`` to a float.

    Strings have the ``€`` symbol removed, ``;`` replaced by ``.`` and
    surrounding whitespace stripped before conversion. Values that are
    already numeric are passed through as floats, so applying the function
    to an already-cleaned column is harmless.

    Parameters
    ----------
    value : str, int, float or None
        Raw cell value.

    Returns
    -------
    float or None
        The parsed amount, or ``None`` when the value is missing (``None``,
        NaN) or cannot be interpreted as a number.
    """
    if isinstance(value, str):
        # Remove the currency symbol and replace ';' with '.'
        value = value.replace('€', '').replace(';', '.').strip()
    try:
        amount = float(value)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects,
        # ValueError covers strings that are not numbers
        return None
    # NaN is the only float that is not equal to itself; treat it as missing
    if amount != amount:
        return None
    return amount

# Convert both price columns with clean_currency
for price_column in ('retail', 'value'):
    df[price_column] = df[price_column].apply(clean_currency)

# Discard rows in which either price could not be converted to a float
df = df.dropna(subset=['retail', 'value'])

In [356]:
# Display the current state of the DataFrame as the cell output
df

Unnamed: 0,number,name,theme,year,pieces,minifigs,availability,retired,released_date,retired_date,retail,value
0,75144-1,Snowspeeder,Star Wars,2017,1703,2,Exclusive,True,05/05/2017,15/01/2019,199.99,330.40
1,10251-1,Brick Bank,Icons,2016,2380,6,RetailLimited,True,02/01/2016,14/11/2018,149.99,494.81
2,10252-1,Volkswagen Beetle,Icons,2016,1167,0,Exclusive,True,01/08/2016,03/12/2020,89.99,113.14
3,10253-1,Big Ben,Icons,2016,4163,0,Exclusive,True,02/07/2016,25/11/2018,219.99,327.12
4,10702-1,Creative Building Set,Classic,2016,583,0,Retail,True,02/01/2016,24/11/2017,24.99,30.14
...,...,...,...,...,...,...,...,...,...,...,...,...
17614,662403-1,Zombie with Burning Baby Zombie and TNT,Minecraft,2024,12,2,Promotional,True,01/01/2024,01/02/2024,0.00,4.73
17615,9790-1,ROBOLAB Team Challenge Set,Education,1999,725,0,Retail,True,01/01/1999,01/01/2001,175.99,193.78
17616,2000443-1,Workshop Kit Freewheeler,Education,2015,82,1,RetailLimited,True,01/01/2015,01/01/2017,8.79,11.00
17617,2000442-1,Workshop Kit Spinning Top,Education,2015,10,0,Retail,True,01/01/2015,01/01/2017,8.79,23.77


In [357]:
# Function to convert date format and handle invalid dates
def convert_date_format(date_str):
    """Parse a ``DD/MM/YYYY`` date with ``pd.to_datetime``.

    Returns the parsed value, or ``pd.NaT`` when parsing raises ``ValueError``.
    """
    try:
        parsed = pd.to_datetime(date_str, format="%d/%m/%Y")
    except ValueError:
        parsed = pd.NaT
    return parsed

# Parse both date columns (DD/MM/YYYY); unparseable entries become NaT.
# assign() returns a new frame in which the columns are replaced outright, so
# they carry a real datetime64 dtype. Writing through df.loc[:, col] = ... sets
# the values into the existing object column instead, which leaves the dates as
# generic Python objects. Building a new frame also avoids SettingWithCopyWarning.
df = df.assign(
    released_date=pd.to_datetime(df['released_date'], format="%d/%m/%Y", errors="coerce"),
    retired_date=pd.to_datetime(df['retired_date'], format="%d/%m/%Y", errors="coerce"),
)

# Remove rows with invalid dates
df = df.dropna(subset=['released_date', 'retired_date'])

In [358]:
# Renumber the rows 0..n-1. With drop=False (the default) the previous index
# is kept as a new 'index' column.
# NOTE(review): if that column is not needed later, drop=True would avoid it - confirm.
df = df.reset_index(drop=False)

df

Unnamed: 0,index,number,name,theme,year,pieces,minifigs,availability,retired,released_date,retired_date,retail,value
0,0,75144-1,Snowspeeder,Star Wars,2017,1703,2,Exclusive,True,2017-05-05 00:00:00,2019-01-15 00:00:00,199.99,330.40
1,1,10251-1,Brick Bank,Icons,2016,2380,6,RetailLimited,True,2016-01-02 00:00:00,2018-11-14 00:00:00,149.99,494.81
2,2,10252-1,Volkswagen Beetle,Icons,2016,1167,0,Exclusive,True,2016-08-01 00:00:00,2020-12-03 00:00:00,89.99,113.14
3,3,10253-1,Big Ben,Icons,2016,4163,0,Exclusive,True,2016-07-02 00:00:00,2018-11-25 00:00:00,219.99,327.12
4,4,10702-1,Creative Building Set,Classic,2016,583,0,Retail,True,2016-01-02 00:00:00,2017-11-24 00:00:00,24.99,30.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17305,17614,662403-1,Zombie with Burning Baby Zombie and TNT,Minecraft,2024,12,2,Promotional,True,2024-01-01 00:00:00,2024-02-01 00:00:00,0.00,4.73
17306,17615,9790-1,ROBOLAB Team Challenge Set,Education,1999,725,0,Retail,True,1999-01-01 00:00:00,2001-01-01 00:00:00,175.99,193.78
17307,17616,2000443-1,Workshop Kit Freewheeler,Education,2015,82,1,RetailLimited,True,2015-01-01 00:00:00,2017-01-01 00:00:00,8.79,11.00
17308,17617,2000442-1,Workshop Kit Spinning Top,Education,2015,10,0,Retail,True,2015-01-01 00:00:00,2017-01-01 00:00:00,8.79,23.77
