In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import random as rand

# API Sampling

In [2]:
# Get 16 random pages from the 72 available
# pages = [str(x) for x in rand.sample(range(1,73), 16)]

# Keep this set to aid with data cleaning and analysis consistency
pages_ex = ['31','21','54','22','27','26','6','18','55','9','68','15','40','53','62','43']

In [3]:
# Get 25 stolen items per page from sample
params = [{"pageSize" : "25", "page" : x} for x in pages_ex]
base_url = "https://api.fbi.gov/@artcrimes"
rs = [requests.get(base_url, param) for param in params]

In [4]:
# Concatenate all json items dicts from sampled pages
thievery = pd.concat([pd.DataFrame(r.json()['items']) for r in rs], ignore_index=True)

In [5]:
# Keep only columns of interest
thievery = thievery.loc[:,['title', 'maker', 'crimeCategory', 'materials', 'period', 'measurements']]

# Data Cleaning

## Maker: Anonymous and Unknown are both unknown

In [6]:
# Combine unknown and anonymous into one category
thievery['maker'] = np.where(thievery['maker'].str.match('(unknown)|(anon.*?)', flags=re.IGNORECASE), "Unknown", thievery['maker'])

# Check for special cases (spelling, etc.)
# dict(thievery['maker'].value_counts())

## Crime Category: Punctuation doesn't matter

In [7]:
# Standardize description separation method
thievery['crimeCategory'].replace(',', '-', regex=True, inplace=True)

# Special cases 
thievery['crimeCategory'] = np.where(thievery['crimeCategory'] == "dollsandfigurines", "dolls-and-figurines", thievery['crimeCategory'])
thievery['crimeCategory'] = np.where(thievery['crimeCategory'] == "stringinstruments", "string-instruments", thievery['crimeCategory'])

# Check value counts for other special cases (won't worry about converting None to NA)
# dict(thievery['crimeCategory'].value_counts())

## Materials: Not case sensitive

In [8]:
# Standardize to propercase
thievery['materials'] = thievery['materials'].str.title()

# Check for special cases
# dict(thievery['materials'].value_counts())

## Period: Group by century

In [9]:
# 1. Standardize to lowercase
thievery['period'] = thievery['period'].str.lower()

# 2. Take out qualifiers within century (mid, late, early)
thievery['period'] = thievery['period'].str.replace(r'(mid\s)|(late\s)|(early\s)', '', regex=True)

# 3. Move misc. and date ranges that cover multiple centuries to other category
# a. Extract centuries from ranges to dataframe (e.g., 1850-1970 would return 18 and 19)
ranges = thievery['period'].str.extract(r'([0-9]{2}).*\-([0-9]{2}).*')

# b. For each range, return "__th century" if the range is within one century, send to other category if not
thievery['period'] = np.where(thievery['period'].str.contains('\-'), \
    np.where(ranges[0]==ranges[1], ranges[0]+'th century', 'Other'), \
    thievery['period'])

# 4. Move years to their respective centuries (treat circa as exact year)
# For loop over 13th-20th centuries (before 1200 it's unlikely to have a specific year; 21st century has unique suffix)
for i in range(12, 20):
    thievery['period'] = np.where(thievery['period'].str.match('.*?'+ str(i) + '\d{2}'), str(i+1) + 'th century', thievery['period'])

# Special cases (some hardcoding but oh well...), with BC specified but AD considered "default"
thievery['period'] = np.where(thievery['period'].str.match(r'.*20[0-9]{2}'), '21st century', thievery['period']) 
thievery['period'] = np.where(thievery['period'] == '19th c.', '19th century', thievery['period'])
thievery['period'] = np.where(thievery['period'] == '600', '6th century', thievery['period'])
thievery['period'] = np.where(thievery['period'] == '500 bc', '5th century bc', thievery['period'])
thievery['period'] = np.where(thievery['period'] == '1st century ad', '1st century', thievery['period'])

# 5. Move remaining non-century periods to Other (era like contemporary, coded incorrectly, etc.)
thievery['period'] = np.where(thievery['period'].str.contains('century'), thievery['period'], 'Other')

# Check for special cases
# dict(thievery['period'].value_counts())

## Measurements: Forced quantitative variable
**($in$, $in^2$, or $in^3$)**

In [10]:
# 1. Take care of special cases
# entire strings (there are a lot, but I couldn't get everything in these with just regex)
thievery['measurements'].replace('Approximately 7.5 inches maximum width along blade; approximately 14 inches overall length', '14 in', inplace=True)
thievery['measurements'].replace('Sheet size: 53.00 x 35.00 cm; Print size: 35.00 x 27.00 cm', inplace=True)
thievery['measurements'].replace('Height: 7 5/8 in;  Base Diameter: 2.5 in; Max Diameter: 3 7/8 in; Rim Diameter: 3.5 in', '7.625 x Diameter: 2.5 in', inplace=True)
thievery['measurements'].replace('Height: 5 in\r\n\r\nDiameter: 4in.', '5 in x Diameter: 4 in', inplace=True)
thievery['measurements'].replace('Height: 7.5in; Width: 7in', '7.5 x 7 in', inplace=True)
thievery['measurements'].replace('2.88" x 2.88\'', '2.88 x 2.88 in', inplace=True)
thievery['measurements'].replace('Height: 108 in; Length: 11.5 in; Width: 1.5 in; Weight: 250 lbs', '108 x 11.5 x 1.5 in', inplace=True)
thievery['measurements'].replace('Height: 4in; Width: 11in', '4 x 11 in', inplace=True)
thievery['measurements'].replace('17 x 28in.', '17 x 28 in', inplace=True)
thievery['measurements'].replace('2.88" x 2.88\'', '2.88 x 2.88 in', inplace=True)
thievery['measurements'].replace('4\' 5"', '53 in', inplace=True)
thievery['measurements'].replace('Height: 16.5 in; Width: 12 in; Diameter: 7.5 in; Weight 5.55 lbs', '16.5 x 12 x Diameter: 7.5 in', inplace=True)
thievery['measurements'].replace('9.5 x 7 .5 in', '9.5 x 7.5 in', inplace=True)
thievery['measurements'].replace('20\'" x 24"', '20 x 24 in', inplace=True)
thievery['measurements'].replace('3 x 5 1/2 x 3/4 in', '3 x 5.5 x .75 in', inplace=True)

# not enough info to get length, area, or volume (data entered incorrectly or ambiguous measurements)
thievery['measurements'].replace('Base Diameter: 2.25"; Middle Diameter: 3.5"; Neck Diameter: 1.5 "; Top Diameter: 3.5"', 'NaN', inplace=True)
thievery['measurements'].replace('83.6 grams', 'NaN', inplace=True)
thievery['measurements'].replace('Height: 15.5 in.; Width: 19.5 in.; Diameter: 24 in.', 'NaN', inplace=True)
thievery['measurements'].replace('18.75 x 17 in, y: 11 in', 'NaN', inplace=True)
thievery['measurements'].replace('7.00 cm, y: 68 cm', 'NaN', inplace=True)
thievery['measurements'].replace('64 1/8 x 33 7/16 in, Y: 19 5/8 in', 'NaN', inplace=True)
thievery['measurements'].replace('Bowl', 'NaN', inplace=True)
thievery['measurements'].replace('Scroll', 'NaN', inplace=True)
thievery['measurements'].replace('Panel', 'NaN', inplace=True)
thievery['measurements'].replace('Not including frame', 'NaN', inplace=True)
thievery['measurements'].replace('Early 17th Century', 'NaN', inplace=True)

# 2. Standardize how units are reported to "__ x __ x __ in/cm"
# Change " to in
thievery['measurements'].replace('\"', ' in', regex=True, inplace=True)

# Change __; __ to __ x __
thievery['measurements'].replace(';', ' x', regex=True, inplace=True)

# Remove words/phrases besides in/cm/etc.
thievery['measurements'].replace(r'([Dd]epth:?\s)|((\sin\s)?[Hh]eight:?\s?)|([Ww]idth:?\s)|([Ll]ength:?\s)|(\s?\(?[Aa]pproximately\)?\s?)|(\,\sframe$)', '', regex=True, inplace=True)

# Remove in and cm except at end of string
thievery['measurements'].replace(r'(in\.?|cm\.?)\sx', 'x', regex=True, inplace=True)

# 3. Replace fractions with decimals, case-by-case (not the best method but best I can do for now)
thievery['measurements'].replace(r'\s1/8', '.125', regex=True, inplace=True)
thievery['measurements'].replace(r'\s3/16', '.1875', regex=True, inplace=True)
thievery['measurements'].replace(r'\s1/5', '.2', regex=True, inplace=True)
thievery['measurements'].replace(r'\s1/4', '.25', regex=True, inplace=True)
thievery['measurements'].replace(r'\s3/8', '.375', regex=True, inplace=True)
thievery['measurements'].replace(r'\s1/2', '.5', regex=True, inplace=True)
thievery['measurements'].replace(r'\s5/8', '.625', regex=True, inplace=True)
thievery['measurements'].replace(r'\s7/8', '.875', regex=True, inplace=True)

# 4. Change diameters to the respective number for area of a circle
# Get list of all diameters in column and convert to numeric area
diameters = list((3.14159*(thievery['measurements'].str.extract(r'Diameter:\s([0-9]+\.?([0-9]+)?)')[0].astype(float)/2)**2).astype(str))
measurements = list(thievery['measurements'].copy())

for i in range(len(thievery)):
    # Filter out NoneType
    if isinstance(thievery.loc[i, 'measurements'], str):
        if thievery.loc[i, 'measurements'].find('Diameter') != -1:
            # Sub out "Diameter: __" with diameter from list
            thievery.loc[i, 'measurements'] = re.sub(r'(Diameter:\s[0-9]+(\.[0-9]+)?)', list(diameters)[i], measurements[i])

# 5. Separate standardized measurements into numbers and words
m_split = thievery['measurements'].str.split(' ', expand=True)
print(m_split.head(20))

# 6. Report whether measurement is length, area, or volume
# Get counts of not-nan values
counts = m_split.count(axis=1)

# 2  => __ in, i.e., one-dimensional or inches; 4 => __ x __ square; 6 => cubic
thievery['units'] = ['Cubic inches' if i == 6 else 'Square inches' if i == 4 else 'Inches' if i == 2 else None for i in counts]

# 7. Report size of object
# Convert to float and multiply across rows
m_split[0] = m_split[0].astype(float)
m_split[2] = m_split[2].astype(float)
m_split[4] = m_split[4].astype(float)
m_split[6] = m_split[0].multiply(m_split[2], axis=0, fill_value=1).multiply(m_split[4], axis=0, fill_value=1)

# convert to centimeters as needed
thievery['size'] = np.where(m_split[1].str.match('cm.*') | m_split[3].str.match('cm.*') | m_split[5].str.match('cm.*'), round(m_split[6]/2.54,3), round(m_split[6],3))


# Final Data

In [11]:
# drop measurements column, finalize column order
thievery = thievery.loc[:,['title', 'maker', 'crimeCategory', 'materials', 'period', 'size', 'units']]

In [12]:
# display first 20 rows of final table
from IPython.core.display import HTML
display(HTML(thievery.head(20).to_html()))

Unnamed: 0,title,maker,crimeCategory,materials,period,size,units
0,History of the Jing Dynasty in 130 Chapters,Unknown,books,Book,Other,105.13,Square inches
1,Interior of Fort Malakoff,James Robertson,photograph,Photograph,Other,80.0,Square inches
2,Abstract Pink Onyx Sculpture on Black Marble,Henry Klar; Gloria Klar,sculpture,"Onyx, Marble",20th century,1152.0,Square inches
3,Red Clay Dog,,sculpture,Clay,Other,1.654,Inches
4,Keno Trailed Bottle,Caddo Nation,ornamental-ceramic-wares,Ceramic,Other,4.291,Inches
5,Burned Black Ikebana Drawing (2),Dale Chihuly,drawing-watercolour,,Other,1260.0,Square inches
6,Ancient and Lost Rivers: Aknill,Terence La Noue,paintings,Mixed Media On Wood,Other,,
7,Arizona Trails,Olaf Wieghorst,paintings,Oil On Board,20th century,320.0,Square inches
8,Saint Anthony Miracles,Unknown,paintings,Oil On Canvas,18th century,324.803,Square inches
9,Saint Bartholomew,Rembrandt,paintings,Oil On Panel,17th century,634.25,Square inches


# Export to CSV

In [13]:
thievery.to_csv("stolen_art.csv", index=False)