In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import random as rand

In [2]:
# The API exposes 72 pages; 16 of them are sampled.
# A fresh random draw would be:
# pages = [str(x) for x in rand.sample(range(1,73), 16)]

# The draw below is frozen so that cleaning and analysis stay consistent between runs.
pages_ex = [
    str(page_number)
    for page_number in (31, 21, 54, 22, 27, 26, 6, 18, 55, 9, 68, 15, 40, 53, 62, 43)
]

In [3]:
# Get 25 stolen items per page from sample
base_url = "https://api.fbi.gov/@artcrimes"
params = [{"pageSize": "25", "page": x} for x in pages_ex]


def _fetch_page(query):
    """Request one page of results and fail loudly on a network or HTTP error.

    query: dict of query-string parameters sent to base_url.
    Returns the requests.Response for that page.
    Raises requests.RequestException on timeout, connection failure or a 4xx/5xx status.
    """
    # Without a timeout a stalled connection would hang the notebook indefinitely.
    response = requests.get(base_url, params=query, timeout=30)
    # Surface a bad status here instead of as a confusing JSON/KeyError further down.
    response.raise_for_status()
    return response


rs = [_fetch_page(param) for param in params]

In [258]:
# Turn the 'items' list of every sampled page into its own frame,
# then stack the frames into one table with a fresh 0..n-1 index.
page_frames = list(map(lambda response: pd.DataFrame(response.json()['items']), rs))
thievery = pd.concat(page_frames, ignore_index=True)

In [259]:
# Restrict the frame to the columns used in the rest of the notebook
columns_of_interest = [
    'title',
    'maker',
    'crimeCategory',
    'materials',
    'period',
    'measurements',
]
thievery = thievery.loc[:, columns_of_interest]

In [None]:
# Crime Category: standardize description separation method
# The result is assigned back instead of using replace(..., inplace=True) on the
# selected column: that chained in-place form does not reliably update the frame
# (it is a silent no-op under pandas Copy-on-Write).
crime_category = thievery['crimeCategory'].replace(',', '-', regex=True)

# Special cases: category names that arrive with no separator at all
crime_category = crime_category.replace({
    "dollsandfigurines": "dolls-and-figurines",
    "stringinstruments": "string-instruments",
})

thievery['crimeCategory'] = crime_category

# Check value counts for other special cases (won't worry about converting None to NA)
# dict(thievery['crimeCategory'].value_counts())

In [7]:
# Materials: give every description the same capitalisation (Title Case)
materials_title_case = thievery['materials'].str.title()
thievery['materials'] = materials_title_case

# To look for remaining special cases:
# dict(thievery['materials'].value_counts())

In [None]:
# Period: group by century when able, move to other category if not
def _century_label(century):
    """Return the label used for a century number, e.g. 19 -> '19th century', 21 -> '21st century'."""
    century = int(century)
    if 10 <= century % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(century % 10, 'th')
    return str(century) + suffix + ' century'


# 1. standardize to lowercase
period = thievery['period'].str.lower()

# 2. take out qualifiers within century (mid, late, early)
period = period.str.replace(r'(mid\s)|(late\s)|(early\s)', '', regex=True)

# 3. move misc. and date ranges that cover multiple centuries to other category
# a. extract the leading two digits on each side of the dash
#    (e.g., 1850-1970 gives 18 and 19). Column 1 is only filled when the first
#    number has four digits, i.e. when it is a year rather than an ordinal like "18th".
bounds = period.str.extract(r'([0-9]{2})([0-9]{2})?.*-([0-9]{2}).*')
is_range = period.str.contains('-', regex=False, na=False)
same_century = bounds[0] == bounds[2]

# b. a range inside one century gets that century's label, anything else goes to Other.
#    Years starting with "18" belong to the 19th century, so year ranges are shifted by
#    one; ordinal ranges ("18th-18th century") already name the century.
century_number = pd.to_numeric(bounds[0], errors='coerce') + bounds[1].notna().astype(int)
range_label = century_number.map(_century_label, na_action='ignore')
period = period.mask(is_range & same_century, range_label)
period = period.mask(is_range & ~same_century, 'Other')

# 4. move years to their respective centuries (treat circa as exact year)
# loop over 13th-20th centuries (before 1200 it's unlikely to have a specific year;
# the 21st century is handled separately below)
for i in range(12, 20):
    # na=False keeps missing periods out of the match instead of relying on NaN truthiness
    is_year_in_century = period.str.match(r'.*?' + str(i) + r'\d{2}', na=False)
    period = period.mask(is_year_in_century, _century_label(i + 1))

# special cases (some hardcoding), with BC specified but AD considered "default"
period = period.mask(period.str.match(r'.*20[0-9]{2}', na=False), '21st century')
period = period.replace({
    '19th c.': '19th century',
    '600': '6th century',
    '500 bc': '5th century bc',
    '1st century ad': '1st century',
})

# 5. move remaining non-century periods to Other (era like contemporary, coded incorrectly,
#    missing, etc.)
thievery['period'] = period.where(period.str.contains('century', na=False), 'Other')

# Check for special cases
# dict(thievery['period'].value_counts())

In [None]:
# Maker: combine unknown and anonymous into one category
# str.match anchors at the start of the string, so this catches values that BEGIN with
# "unknown" or "anon" in any capitalisation.
# na=True: a missing maker is also counted as "Unknown". The previous np.where form did
# this only implicitly (NaN in the condition is truthy), which depends on the pandas
# version/dtype; stating it makes the outcome deterministic.
is_unattributed = thievery['maker'].str.match(r'(unknown)|(anon.*?)', flags=re.IGNORECASE, na=True)
thievery['maker'] = thievery['maker'].mask(is_unattributed, "Unknown")

# Check for special cases (spelling, etc.)
# dict(thievery['maker'].value_counts())

In [260]:
# Measurements: get size of piece in inches
# Every step below builds a new Series and the result is assigned back once at the end.
# Calling replace(..., inplace=True) on thievery['measurements'] is chained in-place
# mutation and does not reliably update the frame (silent no-op under Copy-on-Write).


def _mixed_fraction_to_decimal(match):
    """Convert a matched '7 5/8' or bare '3/4' into decimal text ('7.625', '0.75')."""
    whole = match.group(1)
    numerator = int(match.group(2))
    denominator = int(match.group(3))
    if denominator == 0:
        # not a usable fraction; leave the original text alone
        return match.group(0)
    return str(round(int(whole or 0) + numerator / denominator, 6))


def _diameter_to_area(match):
    """Convert a matched 'Diameter: d' into the area of a circle of diameter d (3 decimals)."""
    radius = float(match.group(1)) / 2
    return str(round(float(np.pi * radius ** 2), 3))


# Hand-corrected entries: raw text -> "<number> [x <number> ...] <unit>" form
special_cases = {
    'Approximately 7.5 inches maximum width along blade; approximately 14 inches overall length': '14 in',
    # NOTE(review): the original call gave no replacement value for this entry, which makes
    # pandas pad-fill it with the PREVIOUS row's measurement. The sheet size is used here as
    # the physical size of the piece -- confirm this is the intended value.
    'Sheet size: 53.00 x 35.00 cm; Print size: 35.00 x 27.00 cm': '53.00 x 35.00 cm',
    'Height: 7 5/8 in;  Base Diameter: 2.5 in; Max Diameter: 3 7/8 in; Rim Diameter: 3.5 in': '7.625 x Diameter: 2.5 in',
    'Height: 5 in\r\n\r\nDiameter: 4in.': '5 in x Diameter: 4 in',
    'Height: 7.5in; Width: 7in': '7.5 x 7 in',
    '2.88" x 2.88\'': '2.88 x 2.88 in',
    'Height: 108 in; Length: 11.5 in; Width: 1.5 in; Weight: 250 lbs': '108 x 11.5 x 1.5 in',
    'Height: 4in; Width: 11in': '4 x 11 in',
    '17 x 28in.': '17 x 28 in',
    '4\' 5"': '53 in',
    'Height: 16.5 in; Width: 12 in; Diameter: 7.5 in; Weight 5.55 lbs': '16.5 x 12 x Diameter: 7.5 in',
    '9.5 x 7 .5 in': '9.5 x 7.5 in',
    '20\'" x 24"': '20 x 24 in',
    '3 x 5 1/2 x 3/4 in': '3 x 5.5 x .75 in',
}

# Entries without enough info to get length, area, or volume.
# They are set to the text 'nan' (which float conversion later reads as NaN).
not_enough_info = [
    'Base Diameter: 2.25"; Middle Diameter: 3.5"; Neck Diameter: 1.5 "; Top Diameter: 3.5"',
    '83.6 grams',
    'Height: 15.5 in.; Width: 19.5 in.; Diameter: 24 in.',
    '18.75 x 17 in, y: 11 in',
    '7.00 cm, y: 68 cm',
    '64 1/8 x 33 7/16 in, Y: 19 5/8 in',
    'Bowl',
    'Scroll',
    'Panel',
    'Not including frame',
    'Early 17th Century',
]

measure_txt = thievery['measurements'].replace(special_cases)
measure_txt = measure_txt.replace(not_enough_info, 'nan')

# 1. Change " to in to standardize how units are reported
measure_txt = measure_txt.replace('"', ' in', regex=True)

# 2. Replace all ";" with " x"
measure_txt = measure_txt.replace(';', ' x', regex=True)

# 3. Remove dimension names and filler words so only numbers, "x" and units remain
measure_txt = measure_txt.replace(
    r'([Dd]epth:?\s)|((\sin\s)?[Hh]eight:?\s?)|([Ww]idth:?\s)|([Ll]ength:?\s)|(\s?\(?[Aa]pproximately\)?\s?)|(\,\sframe$)',
    '',
    regex=True,
)

# 4. Replace fractions with decimals. Any numerator/denominator is handled, and a bare
#    fraction ("x 1/2") becomes "x 0.5" instead of being glued to the previous token.
measure_txt = measure_txt.str.replace(
    r'(?<![\d.])(?:(\d+)\s+)?(\d+)/(\d+)', _mixed_fraction_to_decimal, regex=True
)

# 5. Remove in and cm except at end of string, so the unit is the last word
measure_txt = measure_txt.replace(r'(in\.?|cm\.?)\sx', 'x', regex=True)

# 6. Change each diameter to the area of the circle with that diameter
measure_txt = measure_txt.str.replace(
    r'Diameter:\s([0-9]+(?:\.[0-9]+)?)', _diameter_to_area, regex=True
)

thievery['measurements'] = measure_txt

# TODO: derive thievery['length'], thievery['area'] and thievery['volume'];
# first case: one measurement given -> extract number, convert to inches if needed, set as length


In [None]:
# Frequency of each cleaned measurement string, shown as a plain dict to spot leftovers
measurement_counts = thievery['measurements'].value_counts()
dict(measurement_counts)

In [305]:
# Split each cleaned measurement on whitespace: one token per column
m_split = thievery['measurements'].str.split(r'\s', expand=True)

# Columns 0, 2 and 4 are converted to numbers (first, second and -- presumably -- third
# dimension; the columns in between hold "x" / the unit). astype raises on a token that
# is not numeric, which flags a measurement that still needs a special case.
m_split = m_split.astype({0: float, 2: float, 4: float})

# NOTE(review): an unfinished statement (`m_split[0] = ` with no right-hand side) stood
# here and made the cell a syntax error; it was removed.

# A piece with a single measurement (no second dimension) uses it as its size;
# pieces with more dimensions get 0 for now. Missing measurements stay NaN.
thievery['size'] = np.where(m_split[2].isna(), m_split[0], 0)

# Placeholder column; not filled in yet
thievery['unit'] = ''



In [309]:
# Flag the rows whose first measurement token is missing and print the flags as text
first_token_missing = m_split[0].isna()
print(first_token_missing)

0      False
1      False
2      False
3      False
4      False
       ...  
395    False
396    False
397    False
398    False
399    False
Name: 0, Length: 400, dtype: bool


In [307]:
# Show the token table produced by splitting the measurements (cell output only)
m_split

Unnamed: 0,0,1,2,3,4,5
0,20.7,x,12.900,cm,,
1,10.0,x,8.000,in,,
2,48.0,x,24.000,in,,
3,4.2,cm,,,,
4,10.9,cm,,,,
...,...,...,...,...,...,...
395,43.0,x,31.000,in,,
396,17.0,x,13.250,in,,
397,2.0,x,0.785,in,,
398,18.0,x,32.000,in,,


In [306]:
# Render the entire cleaned frame as one HTML table (no row truncation) for inspection.
# HTML and display come from the public IPython.display module; IPython.core.display is
# an internal path, and `display` was previously used without being imported.
from IPython.display import HTML, display

display(HTML(thievery.to_html()))

Unnamed: 0,title,maker,crimeCategory,materials,period,measurements,size
0,History of the Jing Dynasty in 130 Chapters,Unknown (Chinese in origin),books,Book,Song period,20.70 x 12.90 cm,0.0
1,Interior of Fort Malakoff,James Robertson,photograph,Photograph,,10 x 8 in,0.0
2,Abstract Pink Onyx Sculpture on Black Marble,Henry Klar; Gloria Klar,sculpture,"Onyx, marble",20th Century,48 x 24 in,0.0
3,Red Clay Dog,,sculpture,Clay,,4.20 cm,4.2
4,Keno Trailed Bottle,Caddo Nation,ornamental-ceramic-wares,Ceramic,600 AD,10.90 cm,10.9
5,Burned Black Ikebana Drawing (2),Dale Chihuly,drawing-watercolour,,,42 x 30 in,0.0
6,Ancient and Lost Rivers: Aknill,Terence La Noue,paintings,Mixed media on wood,1998-2000,,
7,Arizona Trails,Olaf Wieghorst,paintings,Oil on board,1981,16 x 20 in,0.0
8,Saint Anthony Miracles,Anonymous,paintings,Oil on canvas,18th Century,25.00 x 33.00 cm,0.0
9,Saint Bartholomew,Rembrandt,paintings,Oil on panel,17th Century,29.5 x 21.5 in,0.0
