This project was inspired by this university paper written back in 2015. I tried finding the datasets online but was unable to, so I decided to replicate the work done in this paper. 

http://uu.diva-portal.org/smash/get/diva2:846981/FULLTEXT01.pdf 

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
prefix = "http://www.perverted-justice.com"
webpage_response = requests.get('http://www.perverted-justice.com/?archive=byUserVotes')

In [3]:
#stores the content of webpage we're scraping for links
webpage = webpage_response.content
soup = BeautifulSoup(webpage, "lxml") #lxml is faster than html.parser"

In [4]:
#stores all 'a' tags with attribute "id" equal to "pedoLink"
pedo_links = soup.findAll("a", {"id": "pedoLink"})
count = len(pedo_links)
print(count)

627


Bingo! That's the number of pedo links provided on website! Making progress.

In [5]:
links = []
# We take the href part of the a tag, then marry that to the prefix, which is the base part for each link
for a in pedo_links:
    links.append(prefix+a["href"])

In [6]:
pedo_data = {}

metadata_list = []
# I want to pull all the span tags with attribute inText
# Then I want to pull the entire conversation 
for link in links:
    subpage = requests.get(link)
    subsoup = BeautifulSoup(subpage.content, "lxml")
    metadata= subsoup.findAll("span", {"class": "inText"}) #finds all the span tags that have class = inText
    str1 = ''.join(str(e) for e in metadata) #converts metadata into strings
    temp_list = re.compile('<span class="inText">(.*?)</span>', re.DOTALL).findall(str1) #pulls only the text between the span tags
    if len(temp_list) == 6:
        #From looking at the data, I noticed there were 5 rows where the data was shifted by one place as there was 
        #no age provided for these individuals. I chose to insert age 26 as this is the most frequent age.
        temp_list.insert(1, '26')
        print(temp_list)
    metadata_list.append(temp_list)

['Loren', '26', 'magic4isl2002', 'Poway, San Diego, California', 'Tiffanyjoy2cute', '13 year old', 'girl']
['Jacob', '26', 'jacob_cason2003', 'Nashville, Georgia', 'sassicassi93', '13 year old', 'girl']
['William', '26', 'welfare_isforwhitefolks', ', North Carolina', 'jewelykool', '14 year old', 'girl']
['Timothy', '26', 'baddboyysweetheart', 'Beaver Dam, Wisconsin', 'lil_suzie_qt', '13 year old', 'girl']
['Brian Richard Williams', '26', 'bmichigan69', 'Clarkston, Michigan', 'yayitskc95', '14 year old', 'girl']


In [7]:
#Convert list of list to dataframe
df = pd.DataFrame(metadata_list, columns=["pedo_name", "pedo_age", "pedo_userid", "city_state", "victim_userid", "victim_age", "victim_gender"])
print(df.head())

                        pedo_name   pedo_age                pedo_userid  \
0                      Paul Short         34  fleet_captain_jaime_wolfe   
1               Michael J. Coffey         54                DavieWants2   
2                             Joe         28              vamale_692005   
3                   Kalonji Woods         21               kalowoodsman   
4  Keith Prather, Timothy Thompso  44 and 26             thenewperson62   

                            city_state   victim_userid   victim_age  \
0                    Chicago, Illinois      sadlilgrrl  13 year old   
1             Suffolk County, New York   SnapShotDeath  14 year old   
2                 Alexandria, Virginia    sweet_erin78  14 year old   
3                 Radford VA, Virginia            keri     underage   
4  Conroe / Willis / Montgomery, Texas  jackies2cool4u  13 year old   

  victim_gender  
0          girl  
1           boy  
2          girl  
3          girl  
4          girl  


In [8]:
new = df["city_state"].str.split(",", expand=True)
print(new)

                                0            1     2
0                         Chicago     Illinois  None
1                  Suffolk County     New York  None
2                      Alexandria     Virginia  None
3                      Radford VA     Virginia  None
4    Conroe / Willis / Montgomery        Texas  None
..                            ...          ...   ...
622              Saint Petersburg      Florida  None
623                     Coldwater     Michigan  None
624                        Queens     New York  None
625                  Garden Grove   California  None
626                 West Millford   New Jersey  None

[627 rows x 3 columns]


In [9]:
df["city"]= new[0] 
df["state"]= new[1] 

In [10]:
print(df.head())

                        pedo_name   pedo_age                pedo_userid  \
0                      Paul Short         34  fleet_captain_jaime_wolfe   
1               Michael J. Coffey         54                DavieWants2   
2                             Joe         28              vamale_692005   
3                   Kalonji Woods         21               kalowoodsman   
4  Keith Prather, Timothy Thompso  44 and 26             thenewperson62   

                            city_state   victim_userid   victim_age  \
0                    Chicago, Illinois      sadlilgrrl  13 year old   
1             Suffolk County, New York   SnapShotDeath  14 year old   
2                 Alexandria, Virginia    sweet_erin78  14 year old   
3                 Radford VA, Virginia            keri     underage   
4  Conroe / Willis / Montgomery, Texas  jackies2cool4u  13 year old   

  victim_gender                          city      state  
0          girl                       Chicago   Illinois  
1   

In [11]:
#remove the "year old" from age column
new2 = df["victim_age"].str.split(" ",expand=True)
df["child_age"] = new2[0]
print(df.head())

                        pedo_name   pedo_age                pedo_userid  \
0                      Paul Short         34  fleet_captain_jaime_wolfe   
1               Michael J. Coffey         54                DavieWants2   
2                             Joe         28              vamale_692005   
3                   Kalonji Woods         21               kalowoodsman   
4  Keith Prather, Timothy Thompso  44 and 26             thenewperson62   

                            city_state   victim_userid   victim_age  \
0                    Chicago, Illinois      sadlilgrrl  13 year old   
1             Suffolk County, New York   SnapShotDeath  14 year old   
2                 Alexandria, Virginia    sweet_erin78  14 year old   
3                 Radford VA, Virginia            keri     underage   
4  Conroe / Willis / Montgomery, Texas  jackies2cool4u  13 year old   

  victim_gender                          city      state child_age  
0          girl                       Chicago   Illin

In [12]:
df = df.drop(['city_state', 'victim_age'], axis=1) #axis=1 specifies this is applied to columns (column operation)

In [13]:
df.groupby('victim_gender').count()

Unnamed: 0_level_0,pedo_name,pedo_age,pedo_userid,victim_userid,city,state,child_age
victim_gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
boy,52,52,52,52,52,52,52
girl,549,549,549,549,549,549,549


In [14]:
df.groupby('child_age').count()

Unnamed: 0_level_0,pedo_name,pedo_age,pedo_userid,victim_userid,victim_gender,city,state
child_age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
11,2,2,2,2,2,2,2
12,46,46,46,46,46,46,46
12/13,1,1,1,1,1,1,1
13,332,332,332,332,332,332,332
13/14,1,1,1,1,1,1,1
14,184,184,184,184,184,184,184
15,23,23,23,23,23,23,23
underage,12,12,12,12,12,12,12


In [15]:
underaged = df.loc[df['child_age'] == 'underage']
print(underaged)

                   pedo_name pedo_age                pedo_userid  \
3              Kalonji Woods       21               kalowoodsman   
5            Thomas Humphrey       28               jackman_9682   
162            Tyson Jenkins       35                 ghost27_73   
165  Eduardo Ernesto Guillen       34                 tito_92503   
256                      Ted       33             bigdaddy68iou1   
292         Billy Wayne Rife       38               bwblueeyes04   
300                  Anthony       26  yp_anthony_louisville_284   
314                      Jed       22      ich_bin_der_eggman_67   
337                  Russell       34               cooperisaaca   
343                     Seda       23                  hey_malen   
509                     Jose       21                 jagjen2003   
591            Anurag Tawari       29                indianman76   

         victim_userid victim_gender          city        state child_age  
3                 keri          girl   

In [16]:
#I'm going to use the number in the victim_userid as a hint of the age of the individual
df.loc[5,'child_age'] = 13
df.loc[162,'child_age'] = 14
df.loc[256,'child_age'] = 14
df.loc[337,'child_age'] = 14
df.loc[343,'child_age'] = 13

In [17]:
# Since 13 is by far the most common child_age I'm going to replace underage with 13. 
df['child_age'] = df.child_age.str.replace('underage', '13') 
df['child_age'] = df.pedo_age.str.replace('12/13', '13') 
df['child_age'] = df.pedo_age.str.replace('13/14', '13') 

In [18]:
underaged = df.loc[df['child_age'] == 'underage']
print(underaged)

Empty DataFrame
Columns: [pedo_name, pedo_age, pedo_userid, victim_userid, victim_gender, city, state, child_age]
Index: []


In [20]:
df.groupby('child_age').count()

Unnamed: 0_level_0,pedo_name,pedo_age,pedo_userid,victim_userid,victim_gender,city,state
child_age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"""28""",1,1,1,1,1,1,1
19,6,6,6,6,6,6,6
20,8,8,8,7,7,8,8
21,25,25,25,24,24,25,25
22,28,28,28,26,26,28,28
23,34,34,34,34,34,34,34
24,30,30,30,29,29,30,30
25,26,26,26,23,23,26,26
26,43,43,43,43,43,43,43
27,30,30,30,29,29,30,30


In [22]:
df.groupby('pedo_age').count()

Unnamed: 0_level_0,pedo_name,pedo_userid,victim_userid,victim_gender,city,state,child_age
pedo_age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"""28""",1,1,1,1,1,1,1
19,6,6,6,6,6,6,6
20,8,8,7,7,8,8,8
21,25,25,24,24,25,25,25
22,28,28,26,26,28,28,28
23,34,34,34,34,34,34,34
24,30,30,29,29,30,30,30
25,26,26,23,23,26,26,26
26,43,43,43,43,43,43,43
27,30,30,29,29,30,30,30


In [29]:
df['pedo_age'] = df.pedo_age.str.replace('"28"', '28') 
#I decided to drop this row as it appears two individuals worked together and this was a one off situation. 
df = df.drop(df.index[4])

AttributeError: Can only use .str accessor with string values!

In [30]:
df.groupby('pedo_age').count()

Unnamed: 0_level_0,pedo_name,pedo_userid,victim_userid,victim_gender,city,state,child_age
pedo_age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
19,6,6,6,6,6,6,6
20,8,8,7,7,8,8,8
21,25,25,24,24,25,25,25
22,28,28,26,26,28,28,28
23,34,34,34,34,34,34,34
24,30,30,29,29,30,30,30
25,26,26,23,23,26,26,26
26,43,43,43,43,43,43,43
27,30,30,29,29,30,30,30
28,26,26,26,26,26,26,26


In [31]:
#Checkout datatypes
df.dtypes

pedo_name        object
pedo_age          int64
pedo_userid      object
victim_userid    object
victim_gender    object
city             object
state            object
child_age        object
dtype: object

In [28]:
#Convert objects to integers
df['pedo_age']=df['pedo_age'].astype(str).astype(int)
df['child_age']=df['child_age'].astype(str).astype(int)

ValueError: invalid literal for int() with base 10: '"28"'

In [34]:
df.dtypes

pedo_name        object
pedo_age          int64
pedo_userid      object
victim_userid    object
victim_gender    object
city             object
state            object
child_age         int64
dtype: object

In [36]:
df.groupby('state').count()

Unnamed: 0_level_0,pedo_name,pedo_age,pedo_userid,victim_userid,victim_gender,city,child_age
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AL,1,1,1,1,1,1,1
Alabama,15,15,15,14,14,15,15
Arizona,4,4,4,4,4,4,4
Arkansas,5,5,5,5,5,5,5
CA,5,5,5,4,4,5,5
...,...,...,...,...,...,...,...
Washington,12,12,12,12,12,12,12
West Virginia,1,1,1,1,1,1,1
Wisconsin,21,21,21,21,21,21,21
Indiana,1,1,1,0,0,1,1


In [37]:
df.state.unique()

array([' Illinois', ' New York', ' Virginia', ' Oklahoma', ' Washington',
       ' Texas', ' California', ' Georgia', ' Florida', ' Maryland',
       ' Arizona', 'Oklahoma', ' Pennsylvania', ' Arkansas', ' MA',
       ' Missouri', ' Alabama', ' Connecticut', ' Minnesota', ' Oregon',
       ' Rhode Island', ' Michigan', ' Idaho', ' Ohio', ' Mississippi',
       ' Indiana', ' South Carolina', ' Wisconsin', ' Nevada',
       ' Kentucky', ' Colorado', ' Massachusetts', ' WI', ' Kansas',
       ' GA', ' District of Columbia', ' Utah', ' Louisiana', ' WA',
       ' Maine', ' North Dakota', ' New Hampshire', ' MI', ' CA', ' AL',
       ' KY', ' Nebraska', ' San Diego', ' Tennessee', ' New Jersey',
       ' Ok', ' FL', ' TN', ' OK', ' North Carolina', ' Calif.',
       'Indiana', ' OH', ' New Mexico', ' Vermont', ' West Virginia'],
      dtype=object)

In [40]:
df['state'] = df.state.str.replace('WI', 'Wisconsin') 
df['state'] = df.state.str.replace('MA', 'Massachusetts') 
df['state'] = df.state.str.replace('GA', 'Georgia') 
df['state'] = df.state.str.replace('WA', 'Washington') 
df['state'] = df.state.str.replace('MI', 'Michigan') 
df['state'] = df.state.str.replace('CA', 'California') 
df['state'] = df.state.str.replace('AL', 'Alabama') 
df['state'] = df.state.str.replace('KY', 'Kentucky') 
df['state'] = df.state.str.replace('San Diego', 'California') 
df['state'] = df.state.str.replace('Ok', 'Oklahoma') 
df['state'] = df.state.str.replace('FL', 'Florida') 
df['state'] = df.state.str.replace('TN', 'Tennessee') 
df['state'] = df.state.str.replace('OK', 'Oklahoma') 
df['state'] = df.state.str.replace('Calif.', 'California') 
df['state'] = df.state.str.replace('OH', 'Ohio') 
df['state'] = df.state.str.replace('Californiarnia', 'California') 
df['state'] = df.state.str.replace('Oklahomalahoma', 'Oklahoma') 

In [51]:
print(sorted(df['state'].unique()))

[' Alabama', ' Arizona', ' Arkansas', ' California', ' Californiarnia', ' Colorado', ' Connecticut', ' District of Columbia', ' Florida', ' Georgia', ' Idaho', ' Illinois', ' Indiana', ' Kansas', ' Kentucky', ' Kentuky', ' Louisiana', ' Maine', ' Maryland', ' Massachusetts', ' Michigan', ' Minnesota', ' Mississippi', ' Missouri', ' Nebraska', ' Nevada', ' New Hampshire', ' New Jersey', ' New Mexico', ' New York', ' North Carolina', ' North Dakota', ' Ohio', ' Oklahoma', ' Oklahomalahoma', ' Oregon', ' Pennsylvania', ' Rhode Island', ' South Carolina', ' Tennessee', ' Texas', ' Utah', ' Vermont', ' Virginia', ' Washington', ' West Virginia', ' Wisconsin', 'Indiana', 'Oklahomalahoma']


In [38]:
print(sorted(df['city'].unique()))

array(['Chicago', 'Suffolk County', 'Alexandria', 'Radford VA', 'Tulsa',
       'Blaine', 'Houston', 'Long Beach', 'Tifton', 'Coconut Creek',
       'Rockville', 'Tucson', 'Lawton', 'Oakhurst', 'Wilkes Barre',
       'Fayetteville', 'Utica', 'Cambridge', 'Orland Park',
       'Falls Church', 'Hannibal', 'Phoenix', 'Cusseta', 'Enfield',
       'Northfield', 'Astoria', 'Warwick', 'Palm Springs', 'Vacaville',
       'Reading', 'Stanton', 'Killeen', 'Detroit', 'Richland Township PA',
       'Fortuna', 'Naples', 'Idaho Falls', 'Cincinnati', 'Valley',
       'Conyers', 'Jackson', 'Putnam City (a suburb of Oklahoma City)',
       '', 'Lawton/Ft. Sill', 'Corona', 'Bridgeport', 'Anaheim',
       'Portsmouth', 'Mastic Beach', 'San Diego', 'Canton', 'St Louis',
       'Trenton', 'Schereville', 'Ellensburg', 'Puyallup', 'Santa Rosa',
       'San Antonio', 'Fullerton', 'Walterboro', 'Columbus', 'Gilroy',
       'Cinncinatti', 'Whittier', 'Dodgeville', 'Everett', 'Eureka',
       'Port Huron', 'Las 

In [31]:
#Graphs of ages, cities, states
#NLP? - break out words from usernames, sentiment analysis?

In [32]:
#https://stackoverflow.com/questions/57484530/beautifulsoup-find-all-and-get-text for conversation part of website