In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import yfinance as yf
from pandas_datareader import data as pdr

In [None]:
# Load the two input tables from the working directory.
# sets.csv is semicolon-delimited; themes.csv uses the default comma delimiter.
sets = pd.read_csv("sets.csv", delimiter=";")
themes = pd.read_csv("themes.csv")

In [None]:
# Combine sets with their theme row (matched on sets.theme_id == themes.id).
# A left join keeps every set even when no theme matches; overlapping column
# names get the '_sets' / '_themes' suffixes. The theme key columns are then
# discarded.
sets_themes = (
    sets.merge(
        themes,
        how='left',
        left_on='theme_id',
        right_on='id',
        suffixes=('_sets', '_themes'),
    )
    .drop(columns=['id', 'parent_id'])
)

In [None]:
# Number of sets released per year
sets_by_year = sets_themes.groupby("year").size().reset_index(name="set_num")

# The final year is left out of the plotted series (presumably because its data
# is incomplete -- TODO confirm). Use the same truncated series for the fit so the
# trend line is not pulled by a point that the chart does not show.
years = sets_by_year["year"].iloc[:-1]
set_counts = sets_by_year["set_num"].iloc[:-1]

# Plot actual data
plt.plot(years, set_counts, label="Actual")

# Add a quadratic (degree 2) trend line fitted to the plotted points
trendline = np.polyfit(years, set_counts, 2)
p = np.poly1d(trendline)
plt.plot(years, p(years), "r--", label="Linia trendu")

# Configure plot
plt.legend()
plt.title('Liczba wydawanych zestawów LEGO rok do roku')
plt.xlabel("Rok")
plt.ylabel("Liczba zestawów LEGO")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Sets released per year (table).
# groupby(...).count() gives, per year, the number of non-null values in each column.
sets_by_year_table = sets_themes.groupby("year").count()

# Display the per-year set counts, leaving out the final year.
set_counts_by_year = sets_by_year_table["set_num"]
set_counts_by_year[:-1]

In [None]:
# Number of distinct theme ids that appear in each year
themes_by_year = sets_themes.groupby("year")["theme_id"].nunique().reset_index(name="nr_themes")

# Plain arrays with the final year left out; shared by the fit and both curves
theme_years = themes_by_year["year"].values[:-1]
theme_counts = themes_by_year["nr_themes"].values[:-1]

# Quadratic (degree 2) trend fitted to those points
z = np.polyfit(theme_years, theme_counts, 2)
p = np.poly1d(z)

# Draw the observed counts and the fitted trend
plt.plot(theme_years, theme_counts, label="Actual")
plt.plot(theme_years, p(theme_years), "r--", label="Linia trendu")

# Labels, legend and layout
plt.title('Liczba wydawanych serii tematycznych LEGO rok do roku')
plt.xlabel('Rok')
plt.ylabel('Liczba serii tematycznych')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Complexity trend in LEGO sets per year
# Group the data by year and calculate the average number of parts per set
parts_per_set = sets.groupby('year')['num_parts'].mean()

# The final year is left out of the chart (presumably incomplete -- TODO confirm).
# The same points are used for the scatter, the regression and the drawn line;
# previously the regression dropped one more year than the scatter showed.
years = np.array(parts_per_set.index[:-1])
mean_parts = parts_per_set.values[:-1]

# Create a scatter plot of the data
plt.scatter(years, mean_parts)

# Least-squares straight line through the plotted points
slope, intercept, r_value, p_value, std_err = stats.linregress(years, mean_parts)

# Calculate the x and y values for the best fit line
x = years
y = intercept + slope * x

# Plot the best fit line
plt.plot(x, y, 'r--', label='Linia trendu')

# Add a legend to the plot
plt.legend(loc='upper left')

# Show the plot
plt.title('Złozoność zestawów LEGO')
plt.xlabel('Rok')
plt.ylabel('Średnia liczba elementów w zestawie')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Biggest sets ever by number of parts: sort descending on num_parts and show
# the first five rows.
sets_themes.sort_values(by="num_parts", ascending=False).head(5)

In [None]:
# Filter the database.
# Theme ids to retain (per the original note: the 22 most popular themes).
themes_to_keep = [252, 610, 52, 621, 22, 608, 579, 494, 246, 721, 603, 576, 577, 155, 693, 435, 601, 158, 684, 596, 690, 453]

# Order rows by release year first, then apply all row filters at once.
ordered = sets_themes.sort_values(by='year')
keep = (
    (ordered['year'] >= 1990)
    & (ordered['year'] <= 2023)          # years 1990..2023 inclusive
    & (ordered['num_parts'] >= 25)       # at least 25 parts
    & ordered['theme_id'].isin(themes_to_keep)
)

# Renumber rows 0..n-1 after filtering.
sets_themes = ordered[keep].reset_index(drop=True)

In [None]:
# Getting prices from Brickeconomy.com.
# Build the upload file in memory and write it once: drop the descriptive columns
# and add a constant quantity of 1 per set. (The table is no longer written to
# disk and read straight back before being overwritten.)
brickeconomy = (
    sets_themes
    .drop(columns=['name_sets', 'year', 'theme_id', 'num_parts', 'name_themes'])
    .assign(quantity=1)
)
brickeconomy.to_csv('brickeconomy.csv', index=False)

# Read the downloaded, semicolon-delimited price file and attach it to the sets.
# The left join keeps every set; sets without a price row get NaN in the price columns.
prices = pd.read_csv('prices.csv', sep=";")
df = (
    pd.merge(sets_themes, prices, how='left', left_on='set_num', right_on='Number')
    .drop(columns=['Number', 'Name'])  # duplicates of the set-side key / name columns
)

In [None]:
# Clean the merged table in one pass. Filtering with masks avoids assigning a
# scratch column onto a filtered frame, which raised SettingWithCopyWarning.
#   1. drop rows where any value is missing
#   2. make Retail and Value floats
#   3. drop rows whose Retail is 0
#   4. keep rows whose retirement year is 2023 or earlier; the year is read as
#      everything from character 6 of Retired_date onward
#      (assumes a fixed-width date such as DD.MM.YYYY -- TODO confirm the format)
#   5. renumber rows 0..n-1
df = (
    df.dropna()
    .astype({'Retail': float, 'Value': float})
    .loc[lambda d: d['Retail'] != 0]
    .loc[lambda d: d['Retired_date'].str[6:].astype(int) <= 2023]
    .reset_index(drop=True)
)
df