In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import random as rand

In [None]:
# The sample was originally drawn at random: 16 of the 72 available pages, via
# pages = [str(x) for x in rand.sample(range(1,73), 16)]

# That draw is frozen below so data cleaning and analysis stay consistent between runs
pages_ex = "31 21 54 22 27 26 6 18 55 9 68 15 40 53 62 43".split()

In [None]:
# Get 25 stolen items per page from sample
params = [{"pageSize": "25", "page": x} for x in pages_ex]
base_url = "https://api.fbi.gov/@artcrimes"

rs = []
for param in params:
    # A timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(base_url, params=param, timeout=30)
    # Fail here, with the HTTP status, rather than later when the JSON body is parsed
    response.raise_for_status()
    rs.append(response)

In [71]:
# Build one frame per sampled page from its JSON 'items' list, then stack the
# frames into a single table with a fresh, continuous index
page_frames = [pd.DataFrame(r.json()['items']) for r in rs]
thievery = pd.concat(page_frames, ignore_index=True)

In [72]:
# Narrow the table down to the fields used in the cleaning steps
columns_of_interest = ['title', 'maker', 'crimeCategory', 'materials', 'period', 'measurements']
thievery = thievery.loc[:, columns_of_interest]

In [None]:
# Crime Category: standardize description separation method ("a,b" -> "a-b").
# The cleaned column is assigned back instead of calling replace(..., inplace=True)
# on thievery['crimeCategory']: that chained in-place form warns in recent pandas
# and does not modify the frame when Copy-on-Write is enabled.
crime_category = thievery['crimeCategory'].replace(',', '-', regex=True)

# Special cases: run-together category names get their separators back
# (whole-value matches only, since this replace is not a regex replace)
crime_category = crime_category.replace({
    "dollsandfigurines": "dolls-and-figurines",
    "stringinstruments": "string-instruments",
})
thievery['crimeCategory'] = crime_category

# Check value counts for other special cases (won't worry about converting None to NA)
dict(thievery['crimeCategory'].value_counts())

In [None]:
# Materials: convert every description to title case so casing variants collapse together
materials_titled = thievery['materials'].str.title()
thievery['materials'] = materials_titled

# Tally the distinct values to look for special cases
material_counts = thievery['materials'].value_counts()
dict(material_counts)

In [None]:
# Period: group by century when able, move to other category if not

def _century_label(century):
    """Return the label for a 1-based century number, e.g. 19 -> '19th century', 21 -> '21st century'."""
    century = int(century)
    if 10 <= century % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(century % 10, 'th')
    return f'{century}{suffix} century'


# 1. standardize to lowercase
# 2. take out qualifiers within century (mid, late, early)
period = (
    thievery['period']
    .str.lower()
    .str.replace(r'(mid\s)|(late\s)|(early\s)', '', regex=True)
)

# 3. move misc. and date ranges that cover multiple centuries to other category
# a. pull the leading two digits of both ends of a range (e.g., 1850-1970 gives 18 and 19);
#    the optional middle group is only filled when the start is a four-digit year
is_range = period.str.contains('-', regex=False, na=False)
range_parts = period.str.extract(r'([0-9]{2})([0-9]{2})?.*-([0-9]{2}).*')
same_century = is_range & (range_parts[0] == range_parts[2])

# b. a range that stays within one century gets that century's label, anything else goes to Other.
#    A year such as 1850 lies in the 19th century, so a year prefix is bumped by one
#    (consistent with step 4); an ordinal start ("18th-18th") is used as-is.
range_century = pd.to_numeric(range_parts[0], errors='coerce') + range_parts[1].notna()
range_label = range_century.map(_century_label, na_action='ignore').astype(object)
range_label = range_label.where(same_century, 'Other')
period = period.mask(is_range, range_label)

# 4. move years to their respective centuries (treat circa as exact year)
# loop over 13th-20th centuries (before 1200 it's unlikely to have a specific year; 21st century has unique suffix)
# na=False: a missing period must not count as a match
for prefix in range(12, 20):
    is_year = period.str.match(rf'.*?{prefix}\d{{2}}', na=False)
    period = period.mask(is_year, _century_label(prefix + 1))

# special cases (some hardcoding but oh well...), with BC specified but AD considered "default"
period = period.mask(period.str.match(r'.*20[0-9]{2}', na=False), '21st century')
special_cases = {
    '19th c.': '19th century',
    '600': '6th century',
    '500 bc': '5th century bc',
    '1st century ad': '1st century',
}
for raw_value, label in special_cases.items():
    period = period.mask(period == raw_value, label)

# 5. move remaining non-century periods to Other (era like contemporary, coded incorrectly, etc.);
#    missing periods are sent to Other explicitly via na=False
thievery['period'] = period.where(period.str.contains('century', regex=False, na=False), 'Other')

# Check for special cases
dict(thievery['period'].value_counts())

In [None]:
# Maker: fold makers whose value starts with "unknown" or "anon" (any casing) into one "Unknown" label
unknown_or_anon = thievery['maker'].str.match('(unknown)|(anon.*?)', flags=re.IGNORECASE)
# NOTE(review): for rows with a missing maker the mask holds a missing value rather than
# True/False; whether np.where then treats the row as a match depends on how that value
# is represented - confirm the intended handling of missing makers.
thievery['maker'] = np.where(unknown_or_anon, "Unknown", thievery['maker'])

# Tally the remaining makers to spot special cases (spelling, etc.)
maker_counts = thievery['maker'].value_counts()
dict(maker_counts)

In [79]:
# Measurements: get size of piece in inches
# Every step builds a new series and the result is assigned back once at the end.
# Calling replace(..., inplace=True) on thievery['measurements'] is chained in-place
# assignment: it warns in recent pandas and leaves the frame untouched under Copy-on-Write.
measurements = thievery['measurements']

# 1. Change 17" to 17 in to standardize how units are reported
measurements = measurements.replace('"', ' in', regex=True)

# special case (whole-value match): feet and inches given together
measurements = measurements.replace("4' 5 in", '53 in')

# 2. Remove cases of height, width, and length; replace all ";" with " x"
measurements = measurements.replace(';', ' x', regex=True)
measurements = measurements.replace(r'[Hh]eight:?\s|[Ww]idth:?\s|[Ll]ength:?\s', '', regex=True)

# 3. Replace fractions with decimals, case-by-case: " 7/8" -> ".875", so "6 7/8" -> "6.875"
fraction_to_decimal = {
    '7/8': '.875',
    '1/8': '.125',
    '3/16': '.1875',
    '1/4': '.25',
    '5/8': '.625',
    '3/8': '.375',
    '1/5': '.2',
    '1/2': '.5',
    '3/4': '.75',
    '7/16': '.4375',
}
for fraction, decimal in fraction_to_decimal.items():
    measurements = measurements.replace(rf'\s{fraction}', decimal, regex=True)

# 4. Remove in and cm except at end of string, make sure in and cm are last word in string
# special case (whole-value match)
measurements = measurements.replace(
    'Approximately 7.5 inches maximum along blade x approximately 14 inches overall length',
    '7.5 in',
)

# drop a unit that sits directly before an " x" separator ("14.5 cm x 18 in" -> "14.5 x 18 in")
measurements = measurements.replace(r'(in|cm)\sx', 'x', regex=True)

thievery['measurements'] = measurements

# 5. FIGURE OUT WHAT TO DO WITH DIAMETER

# Case 1: measurements separated by x
# 1. one measurement given: extract number, convert to inches if needed, set as length
# thievery['length']

# thievery['area']

# thievery['volume']

In [78]:
# Scratch check of the "unit followed by ' x '" pattern on a sample measurement.
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
unit_before_x = re.findall(r'(in|cm)\sx\s', '14.5 cm x 18 in')
unit_before_x

['cm']

In [80]:
# Tally the cleaned measurement strings to see which formats still need handling
measurement_counts = thievery['measurements'].value_counts()
dict(measurement_counts)

{'16 x 20 in': 5,
 '12 in': 4,
 '19.00 cm': 3,
 '10 in': 3,
 '24 x 30 in': 3,
 '12 x 15 in': 2,
 '53 in': 2,
 '37 x 47 in': 2,
 '18 in': 2,
 '26 x 34 in': 2,
 '8 x 10 in': 2,
 '15.5 x 10 in': 2,
 '30 x 24 in': 2,
 '8 in': 2,
 '18 cm': 2,
 '20 x 20 in': 2,
 '22 cm': 2,
 '38 x 38 in': 2,
 '13 in': 2,
 '40 x 30 in': 2,
 '10.5 in': 2,
 '25 in': 2,
 '8 x 6 in': 2,
 '24 x 18 in': 2,
 '42 x 60 in': 2,
 '14 x 18 in': 2,
 '30.00 x 26.50 cm': 1,
 '14 x 9 in': 1,
 '49.00 x 60.20 cm': 1,
 '78 x 58 in': 1,
 '26.70 x 19.40 cm': 1,
 '30 x 17 in': 1,
 '20.70 x 12.90 cm': 1,
 '6.5 x Diameter: 10.5 in': 1,
 '105.00 x 68.00 cm': 1,
 '14 x 11 in': 1,
 '41.25 x 29.625 in': 1,
 '60 x 36 in.': 1,
 '21 x 16 in, frame': 1,
 '14 x 11.5 in': 1,
 '13.5 x 10 in': 1,
 '7 x 9 in': 1,
 '36 x 24 in': 1,
 '83.6 grams': 1,
 '13.5 x 11 x 7 in': 1,
 '32 x 40 in': 1,
 '110.00 x 50.00 cm': 1,
 '4 x 2.5 in': 1,
 '23.75 x 29.5 in': 1,
 '14.5 x 18 in': 1,
 '111.00 x 51.00 x 33.00 cm': 1,
 '6.75 x 5.5 in': 1,
 '36 x 28 in': 1,


In [31]:
# Split every measurement string on single whitespace characters, one token per column.
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
m_split = thievery['measurements'].str.split(r'\s', expand=True)

In [None]:
# Show the cleaned table through its HTML export.
# HTML and display are imported from the public IPython.display module rather than
# the internal IPython.core.display, and display is imported explicitly instead of
# relying on the name being injected into the notebook namespace.
from IPython.display import HTML, display
display(HTML(thievery.to_html()))