In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Paths
csv_path = Path() / "csv"
plots_path = Path() / "plots"

In [3]:
df = pd.read_csv(csv_path / 'lego_to_be_cleaned.csv')

In [4]:
df.head(2)

Unnamed: 0,Number,Name,Theme,Subtheme,Year,Pieces,Minifigs,Availability,Retired,ReleasedDate,...,Retail,Paid,Value,Growth,Condition,Date,Notes,Collection,Status,URL
0,75144-1,Snowspeeder,Star Wars,Ultimate Collector Series,2017,1703,2,Exclusive,True,05/05/2017,...,199;99 €,199;99 €,330;40 €,65.21,New,,,Default,Owned,https://www.brickeconomy.com/set/75144-1/lego-...
1,10251-1,Brick Bank,Icons,Modular Buildings,2016,2380,6,RetailLimited,True,02/01/2016,...,149;99 €,149;99 €,494;81 €,229.89,New,,,Default,Owned,https://www.brickeconomy.com/set/10251-1/lego-...


#### Data format cleaning
Data in a file is stored not in a proper way - time to change it!

In [5]:
# Delete all not wanted columns
df = df.drop(['Subtheme', 'Paid', 'Growth', 'Condition', 'Date', 'Notes', 'Collection', 'Status', 'URL'], axis='columns')

In [6]:
# All column names should start with a lowerletter and have '_' where space is required
df.columns = [col.lower() for col in df.columns]
df.rename(columns={'releaseddate': 'released_date'}, inplace=True)
df.rename(columns={'retireddate': 'retired_date'}, inplace=True)

df.head()

Unnamed: 0,number,name,theme,year,pieces,minifigs,availability,retired,released_date,retired_date,retail,value
0,75144-1,Snowspeeder,Star Wars,2017,1703,2,Exclusive,True,05/05/2017,15/01/2019,199;99 €,330;40 €
1,10251-1,Brick Bank,Icons,2016,2380,6,RetailLimited,True,02/01/2016,14/11/2018,149;99 €,494;81 €
2,10252-1,Volkswagen Beetle,Icons,2016,1167,0,Exclusive,True,01/08/2016,03/12/2020,89;99 €,113;14 €
3,10253-1,Big Ben,Icons,2016,4163,0,Exclusive,True,02/07/2016,25/11/2018,219;99 €,327;12 €
4,10702-1,Creative Building Set,Classic,2016,583,0,Retail,True,02/01/2016,24/11/2017,24;99 €,30;14 €


In [7]:
# Check for missing values in the dataframe
df.isnull().sum()

number            0
name              1
theme             0
year              0
pieces            0
minifigs          0
availability      0
retired           0
released_date    19
retired_date     15
retail            0
value             0
dtype: int64

In [8]:
# Function to clean and convert currency values
def clean_currency(value):
    try:
        # Remove the currency symbol and replace ';' with '.'
        value = value.replace('€', '').replace(';', '.').strip()
        # Convert to float
        return float(value)
    except ValueError:
        # Return None if conversion is not possible
        return None

# Apply the function to the 'retail' and 'value' columns
df['retail'] = df['retail'].apply(clean_currency)
df['value'] = df['value'].apply(clean_currency)

# Drop rows where conversion to float was not possible
df = df.dropna(subset=['retail', 'value']).reset_index(drop=True)

df.head()

Unnamed: 0,number,name,theme,year,pieces,minifigs,availability,retired,released_date,retired_date,retail,value
0,75144-1,Snowspeeder,Star Wars,2017,1703,2,Exclusive,True,05/05/2017,15/01/2019,199.99,330.4
1,10251-1,Brick Bank,Icons,2016,2380,6,RetailLimited,True,02/01/2016,14/11/2018,149.99,494.81
2,10252-1,Volkswagen Beetle,Icons,2016,1167,0,Exclusive,True,01/08/2016,03/12/2020,89.99,113.14
3,10253-1,Big Ben,Icons,2016,4163,0,Exclusive,True,02/07/2016,25/11/2018,219.99,327.12
4,10702-1,Creative Building Set,Classic,2016,583,0,Retail,True,02/01/2016,24/11/2017,24.99,30.14


In [9]:
# Function to convert date format and handle invalid dates
def convert_date_format(date_str):
    try:
        return pd.to_datetime(date_str, format="%d/%m/%Y")
    except ValueError:
        return pd.NaT

# Apply the function to the date columns using .loc to avoid the SettingWithCopyWarning
df.loc[:, 'released_date'] = df['released_date'].apply(convert_date_format)
df.loc[:, 'retired_date'] = df['retired_date'].apply(convert_date_format)

# Remove rows with invalid dates
df = df.dropna(subset=['released_date', 'retired_date']).reset_index(drop=True)

df.head()

Unnamed: 0,number,name,theme,year,pieces,minifigs,availability,retired,released_date,retired_date,retail,value
0,75144-1,Snowspeeder,Star Wars,2017,1703,2,Exclusive,True,2017-05-05 00:00:00,2019-01-15 00:00:00,199.99,330.4
1,10251-1,Brick Bank,Icons,2016,2380,6,RetailLimited,True,2016-01-02 00:00:00,2018-11-14 00:00:00,149.99,494.81
2,10252-1,Volkswagen Beetle,Icons,2016,1167,0,Exclusive,True,2016-08-01 00:00:00,2020-12-03 00:00:00,89.99,113.14
3,10253-1,Big Ben,Icons,2016,4163,0,Exclusive,True,2016-07-02 00:00:00,2018-11-25 00:00:00,219.99,327.12
4,10702-1,Creative Building Set,Classic,2016,583,0,Retail,True,2016-01-02 00:00:00,2017-11-24 00:00:00,24.99,30.14


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17310 entries, 0 to 17309
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   number         17310 non-null  object 
 1   name           17309 non-null  object 
 2   theme          17310 non-null  object 
 3   year           17310 non-null  int64  
 4   pieces         17310 non-null  int64  
 5   minifigs       17310 non-null  int64  
 6   availability   17310 non-null  object 
 7   retired        17310 non-null  bool   
 8   released_date  17310 non-null  object 
 9   retired_date   17310 non-null  object 
 10  retail         17310 non-null  float64
 11  value          17310 non-null  float64
dtypes: bool(1), float64(2), int64(3), object(6)
memory usage: 1.5+ MB


In [11]:
df.to_csv(csv_path / 'brickeconomy_new.csv')