In [189]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [190]:
df = pd.read_csv("lego_data/Data/lego_cleaned.csv", sep = ",", encoding = "latin1")
df

Unnamed: 0,Set_Name,Theme,Pieces,Price,Pages,Unique_Pieces
0,Creative Blue Bricks,Classic,52.0,4.99,37.0,28.0
1,Creative Green Bricks,Classic,60.0,4.99,37.0,36.0
2,Fire Truck,DUPLO,6.0,6.99,3.0,6.0
3,Tow Truck,DUPLO,7.0,6.99,3.0,7.0
4,Stephanie's Summer Heart Box,Friends,95.0,7.99,40.0,52.0
...,...,...,...,...,...,...
917,Welcome to Apocalypseburg!,THE LEGO MOVIE 2,3178.0,299.99,452.0,692.0
918,Jurassic Park: T. rex Rampage,Jurassic World,3120.0,249.99,464.0,525.0
919,Monkie Kid's Team Secret HQ,Monkie Kid,1105.0,169.99,556.0,622.0
920,Grand Piano,Ideas,3662.0,349.99,564.0,345.0


In [191]:
# Define keywords for transport and building
transport_keywords = [
    'car', 'truck', 'bus', 'train', 'plane', 'helicopter', 'boat', 'ship', 'submarine', 'bike', 'motorcycle', 
    'scooter', 'ambulance', 'fire truck', 'police car', 'taxi', 'tractor', 'bulldozer', 'crane', 'forklift', 
    'spaceship', 'rocket', 'hovercraft', 'jet', 'yacht', 'canoe', 'kayak', 'glider', 'tanker', 'ferry', 
    'limousine', 'van', 'pickup', 'wagon', 'cart', 'gondola', 'shuttle', 'trolley', 'cab', 'sedan', 'coupe', 
    'convertible', 'roadster', 'minivan', 'SUV', 'RV', 'camper', 'trailer', 'snowmobile', 'ATV', 'golf cart', 
    'skateboard', 'rollerblades', 'hoverboard', 'segway', 'unicycle', 'monorail', 'zeppelin', 'blimp', 'dirigible', 
    'drone', 'UAV', 'quad bike', 'dirt bike', 'speedboat', 'dinghy', 'lifeboat', 'rescue boat', 'patrol boat', 
    'fishing boat', 'sailboat', 'catamaran', 'trimaran', 'hydrofoil', 'jet ski', 'water scooter', 'snowcat', 
    'snowplow', 'snow groomer', 'icebreaker', 'hovercraft', 'amphibious vehicle', 'submersible', 'diving bell', 
    'bathyscaphe', 'ROV', 'AUV', 'glider', 'hang glider', 'paraglider', 'paramotor', 'microlight', 'ultralight',
    'lamborghini', 'porsche', 'ferrari', 'mobile', 'microfighter', 'RC', 'remote control', 'speeder', 'buggy',
    'kart', 'racer', 'racing', 'dragster', 'hot rod', 'chopper', 'harley', 'motorbike', 'motorbike', 'motorbike',
    'mclaren', 'bugatti','audi', 'bmw', 'mercedes', 'volkswagen', 'ford', 'starfighter', 'fighter', 'bomber',
    'interceptor', 'airliner', 'aircraft', 'airplane', 'jetliner', 'biplane', 'seaplane', 'helicopter', 'chopper',
    'cargo plane', 'fighter jet', 'bomber jet', 'stealth jet', 'space shuttle', 'spacecraft', 'rocket', 'shuttle',
    'chevrolet', 'cadillac', 'buick', 'dodge', 'chrysler', 'jeep', 'ram', 'tesla', 'nissan', 'toyota', 'honda',
    'AAT', 'AT-AT', 'AT-ST', 'snowspeeder', 'speeder bike', 'podracer', 'landspeeder', 'starship', 'star destroyer',
    'wing'

]

building_keywords = [
    'house', 'home', 'apartment', 'condo', 'mansion', 'villa', 'cottage', 'bungalow', 'chalet', 'castle', 'fortress', 
    'palace', 'tower', 'skyscraper', 'office', 'building', 'hospital', 'school', 'university', 'college', 'library', 
    'museum', 'church', 'cathedral', 'temple', 'mosque', 'synagogue', 'shrine', 'monastery', 'abbey', 'convent', 
    'chapel', 'basilica', 'pagoda', 'stupa', 'pyramid', 'ziggurat', 'lighthouse', 'windmill', 'barn', 'stable', 
    'shed', 'garage', 'warehouse', 'factory', 'plant', 'mill', 'workshop', 'studio', 'lab', 'laboratory', 'clinic', 
    'pharmacy', 'store', 'shop', 'market', 'mall', 'plaza', 'center', 'stadium', 'arena', 'gym', 'gymnasium', 
    'theater', 'cinema', 'auditorium', 'hall', 'court', 'station', 'terminal', 'depot', 'port', 'harbor', 'dock', 
    'pier', 'wharf', 'jetty', 'marina', 'boathouse', 'boatyard', 'shipyard', 'dry dock', 'slipway', 'quay', 'quayside',
    'resort', 'HQ'
]

# Function to categorize build type
def categorize_build_type(set_name):
    set_name_lower = set_name.lower()
    if any(keyword in set_name_lower for keyword in transport_keywords):
        return 'transport'
    elif any(keyword in set_name_lower for keyword in building_keywords):
        return 'building'
    else:
        return 'other'

# Apply the function to create the new column
df['build_type'] = df['Set_Name'].apply(categorize_build_type)

df

Unnamed: 0,Set_Name,Theme,Pieces,Price,Pages,Unique_Pieces,build_type
0,Creative Blue Bricks,Classic,52.0,4.99,37.0,28.0,other
1,Creative Green Bricks,Classic,60.0,4.99,37.0,36.0,other
2,Fire Truck,DUPLO,6.0,6.99,3.0,6.0,transport
3,Tow Truck,DUPLO,7.0,6.99,3.0,7.0,transport
4,Stephanie's Summer Heart Box,Friends,95.0,7.99,40.0,52.0,other
...,...,...,...,...,...,...,...
917,Welcome to Apocalypseburg!,THE LEGO MOVIE 2,3178.0,299.99,452.0,692.0,other
918,Jurassic Park: T. rex Rampage,Jurassic World,3120.0,249.99,464.0,525.0,transport
919,Monkie Kid's Team Secret HQ,Monkie Kid,1105.0,169.99,556.0,622.0,other
920,Grand Piano,Ideas,3662.0,349.99,564.0,345.0,other


In [192]:
build_type_counts = df['build_type'].value_counts()
print(build_type_counts)

other        530
transport    281
building     111
Name: build_type, dtype: int64


In [193]:
select_build_type = df[df['build_type'] == 'other']['Set_Name']
print(select_build_type)

# write all other to own csv
select_build_type.to_csv("lego_data/Data/lego_other.csv", index = False)


0              Creative Blue Bricks
1             Creative Green Bricks
4      Stephanie's Summer Heart Box
5                   White Baseplate
7                        Woody & RC
                   ...             
910        Monkey King Warrior Mech
912                Destiny's Bounty
917      Welcome to Apocalypseburg!
919     Monkie Kid's Team Secret HQ
920                     Grand Piano
Name: Set_Name, Length: 530, dtype: object
