In [96]:
# Import dependencies
import pandas as pd
import numpy as np

In [97]:
# Import and store the data into a DataFrame
df = pd.read_csv('lego_sets.csv')

df.head()

Unnamed: 0,set_id,name,year,theme,subtheme,themeGroup,category,pieces,minifigs,agerange_min,US_retailPrice,bricksetURL,thumbnailURL,imageURL
0,1-8,Small house set,1970,Minitalia,,Vintage,Normal,67.0,,,,https://brickset.com/sets/1-8,https://images.brickset.com/sets/small/1-8.jpg,https://images.brickset.com/sets/images/1-8.jpg
1,2-8,Medium house set,1970,Minitalia,,Vintage,Normal,109.0,,,,https://brickset.com/sets/2-8,https://images.brickset.com/sets/small/2-8.jpg,https://images.brickset.com/sets/images/2-8.jpg
2,3-6,Medium house set,1970,Minitalia,,Vintage,Normal,158.0,,,,https://brickset.com/sets/3-6,https://images.brickset.com/sets/small/3-6.jpg,https://images.brickset.com/sets/images/3-6.jpg
3,4-4,Large house set,1970,Minitalia,,Vintage,Normal,233.0,,,,https://brickset.com/sets/4-4,https://images.brickset.com/sets/small/4-4.jpg,https://images.brickset.com/sets/images/4-4.jpg
4,4-6,Mini House and Vehicles,1970,Samsonite,Model Maker,Vintage,Normal,,,,,https://brickset.com/sets/4-6,,


In [98]:
# Check the number of rows and columns in the DataFrame
print(df.shape)
print(f'\nThere are {df.shape[0]} rows and {df.shape[1]} columns in the DataFrame.')

(18457, 14)

There are 18457 rows and 14 columns in the DataFrame.


In [99]:
# Check the count of non-null values and the data type of each column
df.info()

# Rename the columns
df.rename(columns={'set_id':'Set ID', 
                   'name':'Name', 
                   'year':'Year', 
                   'theme': 'Theme', 
                   'subtheme':'Sub-Theme', 
                   'themegroup':'Theme Group', 
                   'category':'Category', 
                   'pieces':'Piece Count', 
                   'minifigs': 'Minifig Count', 
                   'agerange_min': 'Age Range (Min)', 
                   'US_retailPrice': 'US Retail Price',
                   'bricksetURL': 'Brick Set URL', 
                   'thumbnailURL':'Thumbnail URL', 
                   'imageURL':'Image URL'}, inplace=True)

print(f'\n\nColumn "{df.columns[7]}" is in "{df[df.columns[7]].dtype}" and needs to be converted to "Integer".')
print(f'Column "{df.columns[9]}" is in "{df[df.columns[9]].dtype}" and needs to be converted to "Integer".')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18457 entries, 0 to 18456
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   set_id          18457 non-null  object 
 1   name            18457 non-null  object 
 2   year            18457 non-null  int64  
 3   theme           18457 non-null  object 
 4   subtheme        14901 non-null  object 
 5   themeGroup      18455 non-null  object 
 6   category        18457 non-null  object 
 7   pieces          14533 non-null  float64
 8   minifigs        8399 non-null   float64
 9   agerange_min    6787 non-null   float64
 10  US_retailPrice  6982 non-null   float64
 11  bricksetURL     18457 non-null  object 
 12  thumbnailURL    17451 non-null  object 
 13  imageURL        17451 non-null  object 
dtypes: float64(4), int64(1), object(9)
memory usage: 2.0+ MB


Column "Piece Count" is in "float64" and needs to be converted to "Integer".
Column "Age Range (Min)" is in

In [100]:
# Take a deeper look at the values in 'Piece Count'
print(df['Piece Count'].value_counts(ascending=False).head(10))

# In order to change the data type of the column, we need to fill NaN with values first
df['Piece Count'] = df['Piece Count'].fillna(-100)

# Now, we can convert the data type into Integer
df['Piece Count'] = df['Piece Count'].astype(int)

print(f"\nData type has been converted to Integer.")

6.0     341
1.0     309
4.0     268
5.0     256
7.0     249
8.0     225
2.0     199
9.0     176
10.0    171
25.0    140
Name: Piece Count, dtype: int64

Data type has been converted to Integer.


In [101]:
# Change the pseudo-values back to NaN
df['Piece Count'] = df['Piece Count'].replace(-100, np.nan)

In [102]:
# Take a deeper look at the values in 'Minifig Count'
print(df['Minifig Count'].value_counts(ascending=False).head(10))

# In order to change the data type of the column, we need to fill NaN with values first
df['Minifig Count'] = df['Minifig Count'].fillna(-100)

# Now, we can convert the data type into Integer
df['Minifig Count'] = df['Minifig Count'].astype(int)

print(f"\nData type has been converted to Integer.")

1.0     3772
2.0     1670
3.0     1058
4.0      733
5.0      381
6.0      259
7.0      149
8.0      110
9.0       55
10.0      40
Name: Minifig Count, dtype: int64

Data type has been converted to Integer.


In [103]:
# Change the pseudo-values back to NaN
df['Minifig Count'] = df['Minifig Count'].replace(-100, np.nan)

In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18457 entries, 0 to 18456
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Set ID           18457 non-null  object 
 1   Name             18457 non-null  object 
 2   Year             18457 non-null  int64  
 3   Theme            18457 non-null  object 
 4   Sub-Theme        14901 non-null  object 
 5   themeGroup       18455 non-null  object 
 6   Category         18457 non-null  object 
 7   Piece Count      14533 non-null  float64
 8   Minifig Count    8399 non-null   float64
 9   Age Range (Min)  6787 non-null   float64
 10  US Retail Price  6982 non-null   float64
 11  Brick Set URL    18457 non-null  object 
 12  Thumbnail URL    17451 non-null  object 
 13  Image URL        17451 non-null  object 
dtypes: float64(4), int64(1), object(9)
memory usage: 2.0+ MB


In [6]:
df.describe()

Unnamed: 0,year,pieces,minifigs,agerange_min,US_retailPrice
count,18457.0,14533.0,8399.0,6787.0,6982.0
mean,2007.960611,226.473749,2.66365,6.637542,37.534817
std,11.948666,469.988785,2.897857,2.780091,54.382712
min,1970.0,0.0,1.0,1.0,1.49
25%,2001.0,23.0,1.0,5.0,9.99
50%,2011.0,70.0,2.0,6.0,19.99
75%,2017.0,242.0,3.0,8.0,39.99
max,2022.0,11695.0,80.0,18.0,849.99
