In [59]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import random as rand

In [223]:
# Sample 16 of the 72 available result pages (page numbers are 1-based and
# are stored as strings because they are sent as query parameters).
# A dedicated, seeded generator keeps the sample identical across
# Restart & Run All without touching the global `random` state; change SEED
# to draw a different sample.
PAGE_COUNT = 72
SAMPLE_SIZE = 16
SEED = 42
pages = [str(page) for page in rand.Random(SEED).sample(range(1, PAGE_COUNT + 1), SAMPLE_SIZE)]

In [224]:
# Get 25 stolen items per page from the sample.
# Each request carries a timeout so a stalled connection cannot hang the
# notebook, and raise_for_status() fails here, with the HTTP status, instead
# of later when the response body is parsed.
PAGE_SIZE = "25"
REQUEST_TIMEOUT = 30  # seconds
params = [{"pageSize": PAGE_SIZE, "page": page} for page in pages]
base_url = "https://api.fbi.gov/@artcrimes"
rs = []
for param in params:
    response = requests.get(base_url, params=param, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    rs.append(response)

In [261]:
# Build one frame per sampled page from the 'items' list in its JSON body,
# then stack the frames into a single table with a fresh 0..n-1 index.
page_frames = []
for response in rs:
    page_frames.append(pd.DataFrame(response.json()['items']))
thievery = pd.concat(page_frames, ignore_index=True)

In [262]:
# Narrow the table to the columns that the cleaning steps below work on
columns_of_interest = [
    'title',
    'maker',
    'crimeCategory',
    'materials',
    'period',
    'measurements',
]
thievery = thievery.loc[:, columns_of_interest]

In [None]:
# Crime category: standardize description separation method.
# The cleaned column is assigned back explicitly: calling
# .replace(..., inplace=True) on thievery['crimeCategory'] acts on a
# selection of the frame, which triggers a chained-assignment warning in
# recent pandas and does not update `thievery` under Copy-on-Write.
# Special cases: two categories arrive with no separator at all, so they are
# mapped (whole-value match) to their hyphenated spelling.
thievery['crimeCategory'] = (
    thievery['crimeCategory']
    .replace(',', '-', regex=True)
    .replace({
        "dollsandfigurines": "dolls-and-figurines",
        "stringinstruments": "string-instruments",
    })
)

# Check value counts for other special cases (won't worry about converting None to NA)
dict(thievery['crimeCategory'].value_counts())

In [None]:
# Materials: put every entry in Title Case so spellings that differ only by
# capitalisation fall into the same category
materials_titled = thievery['materials'].str.title()
thievery['materials'] = materials_titled

# Inspect the resulting categories; no further changes are made to this
# column (don't know enough about art to merge materials by hand)
materials_counts = thievery['materials'].value_counts()
dict(materials_counts)

In [231]:
# Period: group by century when able, move to other category if not

# 1. standardize to lowercase
thievery['period'] = thievery['period'].str.lower()

# 2. take out qualifiers within century (mid, late, early)
#    \b anchors the qualifier at a word start so the tail of a longer word
#    (e.g. the "late " in "plate ") is not stripped; \s+ also removes runs of
#    more than one space after the qualifier.
thievery['period'] = thievery['period'].str.replace(r'\b(?:mid|late|early)\s+', '', regex=True)

# 3. TODO: move misc. and date ranges that cover multiple centuries to other category
#    (not implemented in this cell)

# 4. TODO: move years to their respective centuries (treat circa as exact year)
#    (not implemented in this cell)

In [236]:
# Frequency of each distinct period string, shown as a plain dict
period_counts = thievery['period'].value_counts()
dict(period_counts)

{'20th century': 50,
 '13th century': 6,
 '18th century': 6,
 '17th century': 6,
 '19th century': 5,
 '1989': 5,
 '1983': 5,
 '1986': 5,
 '1980': 4,
 '1974': 4,
 'circa 1935': 4,
 '1999': 3,
 '1979': 3,
 '1977': 3,
 '1599': 3,
 '1972': 3,
 '1940': 3,
 '1969': 3,
 '1997': 3,
 "90's": 3,
 '1991': 3,
 '1985': 2,
 '1600-1800': 2,
 '1890': 2,
 '1914': 2,
 '1950': 2,
 '1967': 2,
 '1935': 2,
 '1971': 2,
 '2000': 2,
 '1960': 2,
 '1950-1970': 2,
 '1992': 2,
 '1984': 2,
 '600 ad': 2,
 '1990s': 2,
 '1965-1969': 2,
 '10 in': 1,
 '1890-1905': 1,
 '1954': 1,
 'modern': 1,
 '1976': 1,
 '1970': 1,
 '1927': 1,
 '1995-1999': 1,
 '1855': 1,
 '1807': 1,
 '1920s': 1,
 'pre-classic': 1,
 '1951': 1,
 '1996': 1,
 '1524-1606': 1,
 'circa 1950': 1,
 '1650': 1,
 '1801-1828': 1,
 '1124': 1,
 '1829': 1,
 '1115': 1,
 '16th-17th century': 1,
 '1978': 1,
 'copyright 1985': 1,
 '1644-1646': 1,
 'circa 1951': 1,
 '1612': 1,
 '1900-1965': 1,
 '1200-1300': 1,
 '2008': 1,
 'prior to 1934': 1,
 '1906-1907': 1,
 '1900-1966'

In [176]:
# Relabel entries whose four-digit years are all 19xx as '20th century'.
# - Raw-string patterns avoid the invalid '\d' escape of a plain literal.
# - na=False keeps missing periods out of the mask (a NaN in the mask would
#   count as True in np.where and relabel the missing entry).
# - Entries that also mention a year from another century (e.g. '1890-1905')
#   are left untouched so they can be treated as multi-century ranges.
period = thievery['period']
mentions_19xx = period.str.contains(r'(?<!\d)19\d{2}(?!\d)', na=False)
mentions_other_century = period.str.contains(r'(?<!\d)(?!19)\d{4}(?!\d)', na=False)
thievery['period'] = period.mask(mentions_19xx & ~mentions_other_century, '20th century')

In [177]:
# Relabel entries whose four-digit years are all 18xx as '19th century'.
# - Raw-string patterns avoid the invalid '\d' escape of a plain literal.
# - na=False keeps missing periods out of the mask (a NaN in the mask would
#   count as True in np.where and relabel the missing entry).
# - Entries that also mention a year from another century (e.g. '1890-1905')
#   are left untouched so they can be treated as multi-century ranges.
period = thievery['period']
mentions_18xx = period.str.contains(r'(?<!\d)18\d{2}(?!\d)', na=False)
mentions_other_century = period.str.contains(r'(?<!\d)(?!18)\d{4}(?!\d)', na=False)
thievery['period'] = period.mask(mentions_18xx & ~mentions_other_century, '19th century')

In [178]:
# Relabel entries whose four-digit years are all 20xx as '21st century'.
# - Raw-string patterns avoid the invalid '\d' escape of a plain literal.
# - na=False keeps missing periods out of the mask (a NaN in the mask would
#   count as True in np.where and relabel the missing entry).
# - Entries that also mention a year from another century are left untouched
#   so they can be treated as multi-century ranges.
period = thievery['period']
mentions_20xx = period.str.contains(r'(?<!\d)20\d{2}(?!\d)', na=False)
mentions_other_century = period.str.contains(r'(?<!\d)(?!20)\d{4}(?!\d)', na=False)
thievery['period'] = period.mask(mentions_20xx & ~mentions_other_century, '21st century')

In [None]:
# NOTE(review): this cell repeats the 20xx -> '21st century' relabel of the
# cell before it. The relabel is idempotent (a relabelled entry no longer
# contains a four-digit year), so running it again changes nothing; the cell
# can probably be deleted.
# - Raw-string patterns avoid the invalid '\d' escape of a plain literal.
# - na=False keeps missing periods out of the mask (a NaN in the mask would
#   count as True in np.where and relabel the missing entry).
# - Entries that also mention a year from another century are left untouched
#   so they can be treated as multi-century ranges.
period = thievery['period']
mentions_20xx = period.str.contains(r'(?<!\d)20\d{2}(?!\d)', na=False)
mentions_other_century = period.str.contains(r'(?<!\d)(?!20)\d{4}(?!\d)', na=False)
thievery['period'] = period.mask(mentions_20xx & ~mentions_other_century, '21st century')

In [237]:
# Show the column labels currently present in the frame
thievery.columns

Index(['title', 'maker', 'crimeCategory', 'materials', 'period',
       'measurements'],
      dtype='object')

In [None]:
# Maker: combine unknown and anonymous into one category.
# Any maker that starts with "unknown" or "anon" (case-insensitive; str.match
# anchors at the start of the string) is relabelled "Unknown".
# na=False keeps missing makers out of the mask so they stay missing; without
# it a NaN in the mask would count as True in np.where and be relabelled.
is_unknown_maker = thievery['maker'].str.match(r'(?:unknown|anon)', flags=re.IGNORECASE, na=False)
thievery['maker'] = thievery['maker'].mask(is_unknown_maker, "Unknown")

# Check that it worked
dict(thievery['maker'].value_counts())

In [268]:
# Measurements: dump the raw size strings to see which formats occur before
# deciding how to parse a size out of them
measurements_raw = thievery['measurements']
measurements_raw.tolist()

[None,
 '30 x 24 in',
 None,
 '21 x 30 in',
 '19 x 24 in',
 '23 1/8 x 19 1/8 in',
 '36 x 39 in, frame',
 '27 x 24 x 9.5 in',
 'Height: 5 in.; Diameter: 11 in.',
 '109.20 x 151.20 cm',
 '22 x 18 in',
 '20 x 30 in',
 '4.5 x 5.75 in',
 '12 in',
 '41.00 x 28.00 cm',
 '60.00 x 81.00 cm',
 '23 7/8 in',
 '25 x 34 in',
 '16 x 8 in',
 '27 x 40 in',
 None,
 '4.60 x 2.20 cm',
 '6 in',
 '56.50 x 76.30 cm',
 'Diameter: 3 3/8 in.',
 '19 x 24 in, frame: 26 x 31 in',
 '40 x 30 in',
 '24 x 30 in',
 None,
 '21 x 16.5 in',
 '71.10 x 104.10 cm',
 '5" height x 4.5" square',
 '16 x 12 in',
 '48.5 x 35.75 in',
 None,
 'y: 5.25 in',
 'Height: 30 in; Width: 40 in.',
 None,
 None,
 None,
 None,
 '26 x 34 in',
 '12 x 16 in',
 '12 x 9 in',
 '8 x 7 in',
 '11 x 10 in',
 None,
 '83" x 68.5"',
 '73.75 x 108.12 cm',
 '21.50 x 13.00 cm',
 '19.00 cm',
 '16 x 20 in',
 'Early 17th Century',
 None,
 '26 x 36 in',
 '26 x 34 in',
 None,
 None,
 '33 x 25 3/16 in',
 None,
 '18 in',
 'Height: 24 cm; Length: 21 cm; Width: 10 cm'