# Data Processing

In [1]:
import numpy as np
import pandas as pd

import pickle as pk

## Data

In [133]:
# US identified serial killers from Wikipedia

# index_col=0 uses the first CSV column as the row index instead of a data column.
identified_raw_csv = 'data/serial_killers_identified_RAW.csv'
us_sk_id = pd.read_csv(identified_raw_csv, index_col=0)
us_sk_id.head()

Unnamed: 0,Name,Years active,Proven Victims,Possible Victims,Status,Notes,Ref
0,"Ables, Tony",1970–1990,4,4+,Sentenced to death; commuted to life imprisonment,"Murdered robbery victim in 1970, and at least ...",[4]
1,"Acevedo, Francisco",1989–1996,3,3,Sentenced to 75 years to life,Strangled three prostitutes in New York betwee...,[5]
2,"Adams, Edward James",1920–1921,7,7,Killed by police during shootout,"Murdered seven people, including three policemen",[6]
3,"Agrue, John",1966–1982,3,3+,Died in 2009,Killed his sister-in-law in Illinois; paroled ...,[7]
4,"Albanese, Charles",1980–1981,3,3,Executed 1995,Poisoned family members with arsenic in Fox La...,[8]


In [134]:
# US unidentified serial killers from Wikipedia

# index_col=0 uses the first CSV column as the row index instead of a data column.
unidentified_raw_csv = 'data/serial_killers_unidentified_RAW.csv'
us_sk_unid = pd.read_csv(unidentified_raw_csv, index_col=0)
us_sk_unid.head()

Unnamed: 0,Name,Years active,Proven Victims,Possible Victims,Region where active,Notes,Ref
0,Albuquerque serial killer,2021–2022,2,4,NM,Suspected of killing four Muslim men in drive-...,[786]
1,Alphabet murders,1971–1973,3,3,NY,"Also known as the ""Double Initial Murders""; mu...",[787]
2,Ann Arbor Hospital murders,1975,10,10,MI,Poisonings of 10 patients at the Veteran's Adm...,[788][789]
3,Atlanta child murders,1979–1981,28,30,GA,"A series of murders committed in Atlanta, Geor...",
4,Atlanta Lover's Lane Murders,1977,3,3,GA,A series of unsolved shootings on couples in A...,


In [135]:
# Remove the 'Ref' column (citation markers such as "[4]") from both frames.
# errors='ignore' plus rebinding keeps this cell re-runnable: the in-place
# version raised KeyError the second time because 'Ref' was already gone.
us_sk_id = us_sk_id.drop(columns='Ref', errors='ignore')
us_sk_unid = us_sk_unid.drop(columns='Ref', errors='ignore')

In [136]:
# Add an all-NaN 'Location' placeholder column to both frames.
for frame in (us_sk_id, us_sk_unid):
    frame['Location'] = np.nan

In [137]:
# Load the state -> cities lookup that find_location iterates over.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
with open('data/US_states_and_cities.pickle', 'rb') as pickle_file:
    US = pk.load(pickle_file)

In [138]:
def find_location(row, locations=None):
    """Extract the states (and a city, when one is named) mentioned in a note.

    Parameters
    ----------
    row : str
        Free-text note to scan. Any non-string value (for example the NaN that
        pandas uses for a missing note) yields an empty result instead of
        raising ``TypeError``.
    locations : mapping, optional
        Maps each state name to an iterable of its city names. Defaults to the
        notebook-level ``US`` lookup, so ``Series.apply(find_location)`` keeps
        working unchanged.

    Returns
    -------
    list of [state, city]
        One pair per state named in the text, in the lookup's iteration order.
        ``city`` is the last city of that state found in the text, or
        ``np.nan`` when none of its cities is mentioned.
    """
    import re  # local import keeps this cell self-contained

    if locations is None:
        locations = US
    if not isinstance(row, str):
        return []

    def mentions(name):
        # Cheap substring test first; the regex only runs on candidates.
        if name not in row:
            return False
        # Require a whole-name match so that "Kansas" is not found inside
        # "Arkansas". Lookarounds are used instead of \b because names such as
        # "St. Louis" contain punctuation.
        pattern = r'(?<!\w)' + re.escape(name) + r'(?!\w)'
        return re.search(pattern, row) is not None

    result = []
    for state in locations.keys():
        if not mentions(state):
            continue
        matched_city = np.nan
        for city in locations[state]:
            # The lookup spells out "Saint"; match the abbreviated "St." form.
            city = city.replace('Saint', 'St.')
            if mentions(city):
                matched_city = city
        result.append([state, matched_city])
    return result



In [139]:
# Fill 'Location' in both frames by scanning each row's 'Notes' text.
for frame in (us_sk_id, us_sk_unid):
    frame['Location'] = frame['Notes'].apply(find_location)

In [140]:
# Preview the identified frame to spot-check the 'Location' column.
us_sk_id.head()

Unnamed: 0,Name,Years active,Proven Victims,Possible Victims,Status,Notes,Location
0,"Ables, Tony",1970–1990,4,4+,Sentenced to death; commuted to life imprisonment,"Murdered robbery victim in 1970, and at least ...","[[Florida, St. Petersburg]]"
1,"Acevedo, Francisco",1989–1996,3,3,Sentenced to 75 years to life,Strangled three prostitutes in New York betwee...,"[[New York, nan]]"
2,"Adams, Edward James",1920–1921,7,7,Killed by police during shootout,"Murdered seven people, including three policemen",[]
3,"Agrue, John",1966–1982,3,3+,Died in 2009,Killed his sister-in-law in Illinois; paroled ...,"[[Colorado, nan], [Illinois, nan]]"
4,"Albanese, Charles",1980–1981,3,3,Executed 1995,Poisoned family members with arsenic in Fox La...,"[[Illinois, nan]]"


In [141]:
# First matched state for row 3, read with one scalar lookup.
us_sk_id.at[3, 'Location'][0][0]

'Colorado'

In [142]:
# Preview the unidentified frame to spot-check the 'Location' column.
us_sk_unid.head()

Unnamed: 0,Name,Years active,Proven Victims,Possible Victims,Region where active,Notes,Location
0,Albuquerque serial killer,2021–2022,2,4,NM,Suspected of killing four Muslim men in drive-...,[]
1,Alphabet murders,1971–1973,3,3,NY,"Also known as the ""Double Initial Murders""; mu...","[[New York, Rochester]]"
2,Ann Arbor Hospital murders,1975,10,10,MI,Poisonings of 10 patients at the Veteran's Adm...,[]
3,Atlanta child murders,1979–1981,28,30,GA,"A series of murders committed in Atlanta, Geor...","[[Georgia, Atlanta]]"
4,Atlanta Lover's Lane Murders,1977,3,3,GA,A series of unsolved shootings on couples in A...,[]


In [144]:
# City of the first matched location for row 3, read with one scalar lookup.
us_sk_unid.at[3, 'Location'][0][1]

'Atlanta'

In [146]:
# Write both frames, including the 'Location' column, to CSV.
location_outputs = {
    'data/serial_killers_identified_LOCATION.csv': us_sk_id,
    'data/serial_killers_unidentified_LOCATION.csv': us_sk_unid,
}
for csv_path, frame in location_outputs.items():
    frame.to_csv(csv_path)

In [154]:
# Reload the frames from the LOCATION CSV files.
# NOTE(review): the CSV round-trip appears to turn each 'Location' list into
# its string repr (previews after this reload show quoted names such as
# "[['New York', 'Rochester']]"), so Location[0][0] would index characters
# rather than list items here. Confirm before relying on it downstream.
us_sk_id = pd.read_csv('data/serial_killers_identified_LOCATION.csv', index_col=0)
us_sk_unid = pd.read_csv('data/serial_killers_unidentified_LOCATION.csv', index_col=0)

In [155]:
def define_years(df):
    """Split 'Years active' into 'Start Year' and 'Year End' columns, in place.

    Each value is split once on the first dash (en dash, em dash or plain
    hyphen). The part before it becomes 'Start Year' and the part after it
    becomes 'Year End'. A single year such as "1975" gets NaN as its end
    year, and a missing or non-string value gets NaN in both columns.

    Both columns hold strings (object dtype); numeric conversion is left to
    the caller. The frame's index may be anything, because values are
    assigned by position rather than by label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Years active' column. It is modified in place.

    Returns
    -------
    None
    """
    starts, ends = [], []
    for value in df['Years active']:
        if not isinstance(value, str):
            starts.append(np.nan)
            ends.append(np.nan)
            continue
        # Normalise the dash variants so one partition() handles all of them.
        normalized = value.replace('—', '–').replace('-', '–')
        start, _, end = normalized.partition('–')
        start, end = start.strip(), end.strip()
        starts.append(start if start else np.nan)
        ends.append(end if end else np.nan)

    # Object arrays are assigned positionally and keep the dtype stable even
    # when a column turns out to be entirely NaN.
    df['Start Year'] = np.array(starts, dtype=object)
    df['Year End'] = np.array(ends, dtype=object)

In [156]:
# Add the 'Start Year' / 'Year End' columns to both frames.
for frame in (us_sk_id, us_sk_unid):
    define_years(frame)

In [157]:
# Preview the unidentified frame with the 'Start Year' / 'Year End' columns.
us_sk_unid.head()

Unnamed: 0,Name,Years active,Proven Victims,Possible Victims,Region where active,Notes,Location,Start Year,Year End
0,Albuquerque serial killer,2021–2022,2,4,NM,Suspected of killing four Muslim men in drive-...,[],2021,2022.0
1,Alphabet murders,1971–1973,3,3,NY,"Also known as the ""Double Initial Murders""; mu...","[['New York', 'Rochester']]",1971,1973.0
2,Ann Arbor Hospital murders,1975,10,10,MI,Poisonings of 10 patients at the Veteran's Adm...,[],1975,
3,Atlanta child murders,1979–1981,28,30,GA,"A series of murders committed in Atlanta, Geor...","[['Georgia', 'Atlanta']]",1979,1981.0
4,Atlanta Lover's Lane Murders,1977,3,3,GA,A series of unsolved shootings on couples in A...,[],1977,


In [158]:
# Preview the identified frame with the 'Start Year' / 'Year End' columns.
us_sk_id.head()

Unnamed: 0,Name,Years active,Proven Victims,Possible Victims,Status,Notes,Location,Start Year,Year End
0,"Ables, Tony",1970–1990,4,4+,Sentenced to death; commuted to life imprisonment,"Murdered robbery victim in 1970, and at least ...","[['Florida', 'St. Petersburg']]",1970,1990
1,"Acevedo, Francisco",1989–1996,3,3,Sentenced to 75 years to life,Strangled three prostitutes in New York betwee...,"[['New York', nan]]",1989,1996
2,"Adams, Edward James",1920–1921,7,7,Killed by police during shootout,"Murdered seven people, including three policemen",[],1920,1921
3,"Agrue, John",1966–1982,3,3+,Died in 2009,Killed his sister-in-law in Illinois; paroled ...,"[['Colorado', nan], ['Illinois', nan]]",1966,1982
4,"Albanese, Charles",1980–1981,3,3,Executed 1995,Poisoned family members with arsenic in Fox La...,"[['Illinois', nan]]",1980,1981


In [160]:
# Inspect the column dtypes of the identified frame.
us_sk_id.dtypes

Name                object
Years active        object
Proven Victims      object
Possible Victims    object
Status              object
Notes               object
Location            object
Start Year          object
Year End            object
dtype: object

In [147]:
# us_sk_id['Start Year'] = pd.to_numeric(us_sk_id['Start Year'], downcast='integer', errors='coerce')
# us_sk_id['Year End'] = pd.to_numeric(us_sk_id['Year End'], downcast='integer', errors='coerce')