In [1]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import datetime
import re
import csv

# Understanding the training data

In [2]:
# Folders holding the PAN12 training and test corpora, relative to the notebook.
train_data_path = "data/pan12-sexual-predator-identification-training-corpus-2012-05-01/"
test_data_path = "data/pan12-sexual-predator-identification-test-corpus-2012-05-21/"

# Parse the full training corpus; `root` is the top-level element whose
# children are iterated as conversations by the cells below.
training_xml = ET.parse(
    f"{train_data_path}pan12-sexual-predator-identification-training-corpus-2012-05-01.xml"
)
root = training_xml.getroot()

In [4]:
# Load the known predator author IDs into a dict used as a lookup table
# (key = first column of each row, value = 1 as a presence marker).
all_predators = {}
# newline='' is what the csv module recommends for files handed to csv.reader.
with open(train_data_path + 'pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt', 'r', newline='') as f:
    for row in csv.reader(f):
        # csv.reader yields [] for a blank line; skip it instead of failing
        # with IndexError on row[0].
        if row:
            all_predators[row[0]] = 1

In [6]:
# Statistics of the corpus before any filtering: number of conversations,
# conversations involving a known predator, distinct authors and distinct
# predators.
#
# The corpus file is parsed a single time; both tree names are bound to that
# one parse result so code referring to either name keeps working.
unfiltered_xml = training_xml = ET.parse(train_data_path + 'pan12-sexual-predator-identification-training-corpus-2012-05-01.xml')
unfiltered_root = unfiltered_xml.getroot()

unfiltered_num_conv = len(unfiltered_root)
unfiltered_num_suspicious_conv = 0
unfiltered_authors = {}     # author ID -> 1 for every author seen
unfiltered_predators = {}   # author ID -> 1 for authors listed in all_predators

for conversation in unfiltered_root:
    is_suspicious = False
    for message in conversation:
        author = message.find('author').text
        # Re-assigning an existing key is a no-op, so no membership test is needed.
        unfiltered_authors[author] = 1
        if author in all_predators:
            is_suspicious = True
            unfiltered_predators[author] = 1
    # A conversation counts once, however many predator messages it holds.
    if is_suspicious:
        unfiltered_num_suspicious_conv += 1

unfiltered_num_authors = len(unfiltered_authors)
unfiltered_num_predators = len(unfiltered_predators)

print("Num of Conv: {}\r\nNum of suspicious conv: {}\r\nNum authors: {}\r\nNum predators: {}".format(
    unfiltered_num_conv,
    unfiltered_num_suspicious_conv,
    unfiltered_num_authors,
    unfiltered_num_predators))

Num of Conv: 66927
Num of suspicious conv: 2016
Num authors: 97689
Num predators: 142


### Prefiltering Training Data
First, we apply the required pre-filtering to the training data. Any conversation that meets at least one of the following criteria is removed:
1. conversations with only one participant
2. conversations in which every user has fewer than 6 messages (i.e. 5 or fewer)
3. conversations with long sequences of unrecognized characters

In [7]:
# Corpus locations (same values as at the top of the notebook).
train_data_path = "data/pan12-sexual-predator-identification-training-corpus-2012-05-01/"
test_data_path = "data/pan12-sexual-predator-identification-test-corpus-2012-05-21/"

# Read the training corpus again so that `root` refers to a freshly parsed
# tree for the filtering cells that follow.
training_xml = ET.parse(
    train_data_path
    + 'pan12-sexual-predator-identification-training-corpus-2012-05-01.xml'
)
root = training_xml.getroot()

1. remove conversations with only one participant

In [None]:
# Filter 1: mark every conversation that has at most one distinct author.
# This cell only records conversation IDs; the tree itself is not modified.
conv_2_remove = []      # IDs marked for removal, in document order
marked_ids = set()      # mirror of conv_2_remove for O(1) duplicate checks
authors = set()
init_len = len(root)

for conversation in root:
    # Distinct authors taking part in this conversation.
    authors = {message.find('author').text for message in conversation}

    conv_id = conversation.get('id')
    if len(authors) <= 1 and conv_id not in marked_ids:
        marked_ids.add(conv_id)
        conv_2_remove.append(conv_id)

print("Removing {} out of {} conversations".format(len(conv_2_remove), init_len))

2. remove conversations in which every user has fewer than 6 messages

In [None]:
# Filter 2: mark conversations in which no author wrote more than 5 messages
# (every participant has fewer than 6). Conversations that are already
# marked are skipped.
marked_ids = set(conv_2_remove)   # O(1) membership instead of scanning the list

for conversation in root:
    conv_id = conversation.get('id')
    if conv_id in marked_ids:
        continue

    # Number of messages written by each author in this conversation.
    authors = {}
    for message in conversation:
        author = message.find('author').text
        authors[author] = authors.get(author, 0) + 1

    # One author with more than 5 messages is enough to keep the conversation.
    remove = not any(count > 5 for count in authors.values())

    if remove:
        marked_ids.add(conv_id)
        conv_2_remove.append(conv_id)

print("Removing {} out of {} conversations".format(len(conv_2_remove), init_len))

3. remove any conversations with messages containing long sequences of unrecognized characters

In [None]:

# Filter 3: mark conversations that contain at least one message made up
# mostly of non-alphanumeric characters.
MIN_TEXT_LEN = 20        # messages shorter than this are not inspected
MAX_SYMBOL_RATIO = 0.6   # share of matched characters above which a message counts as noise

# Raw string: "\W" inside a normal string literal is an invalid escape sequence.
# NOTE(review): \W also matches whitespace, so spaces count towards the ratio.
symbol_pattern = re.compile(r"[\W_]")
marked_ids = set(conv_2_remove)   # O(1) membership instead of scanning the list

for conversation in root:
    conv_id = conversation.get('id')
    if conv_id in marked_ids:
        continue

    remove = False
    for message in conversation:
        text = message.find("text").text
        # Empty (None) and short messages cannot trigger the filter.
        if text is None or len(text) < MIN_TEXT_LEN:
            continue
        if len(symbol_pattern.findall(text)) / len(text) > MAX_SYMBOL_RATIO:
            remove = True
            break   # one offending message is enough

    if remove:
        marked_ids.add(conv_id)
        conv_2_remove.append(conv_id)

print("Removing {} out of {} conversations".format(len(conv_2_remove), init_len))

We have now marked 52224 out of 66927 conversations for removal.<br>
Next, we remove the marked conversations from `root` itself and write the result to a new XML file.

In [None]:
# Drop the marked conversations from the tree.
# A set gives O(1) ID lookups, and rebuilding the child list with a single
# slice assignment avoids one root.remove() call (a linear scan) per
# conversation. Children that are not <conversation> elements are kept.
ids_to_drop = set(conv_2_remove)
root[:] = [
    child for child in root
    if not (child.tag == 'conversation' and child.get('id') in ids_to_drop)
]
print("The new root has a length of {}.".format(len(root)))


In [None]:
from xml.etree.ElementTree import ElementTree

# Persist the filtered tree. The output file is opened in a `with` block so
# the handle is flushed and closed as soon as the write finishes.
tree = ElementTree(root)
with open('data/training_data.xml', 'wb') as out_file:
    tree.write(out_file)
print("Filtered data written!")

### Labeling training data

train_data_path = "data/pan12-sexual-predator-identification-training-corpus-2012-05-01/"  # same data/ root as the rest of the notebook
test_data_path = "data/pan12-sexual-predator-identification-test-corpus-2012-05-21/"

training_xml = ET.parse(train_data_path + 'pan12-sexual-predator-identification-training-corpus-2012-05-01.xml')  # NOTE(review): replaced by the filtered-data parse in the next cell

In [None]:
# Load the pre-filtered corpus from the location the filtering step writes
# to (data/training_data.xml).
training_xml = ET.parse('data/training_data.xml')
root = training_xml.getroot()

# Known predator author IDs: each line of the file becomes one list entry.
# The file is read inside a `with` block so the handle is closed afterwards.
pred_id_file = 'data/pan12-sexual-predator-identification-training-corpus-2012-05-01/'
with open(pred_id_file + 'pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt', 'r') as file:
    predators_id = file.read().splitlines()

In [None]:
# Build one [conversation id, label] row per conversation: label 1 when any
# participant appears in predators_id, otherwise 0.
csv_labels = []
authors = []
for conversation in root:
    # Collect the unique authors of this conversation in order of first appearance.
    authors[:] = []
    for message in conversation:
        name = message.find('author').text
        if name in authors:
            continue
        authors.append(name)

    suspicious = any(name in predators_id for name in authors)
    csv_labels.append([conversation.get('id'), 1 if suspicious else 0])

# Store the labels in the training-corpus folder.
with open(train_data_path + 'sci_labels.csv', 'w', newline='') as f:
    csv.writer(f).writerows(csv_labels)
print(f'Done writing labels for {len(csv_labels)} conversations!')

In [14]:
# Statistics of the filtered corpus: number of conversations, conversations
# labelled suspicious, distinct authors and distinct predators.
filtered_xml = ET.parse('data/training_data.xml')
filtered_root = filtered_xml.getroot()
filtered_num_conv = len(filtered_root)
unfiltered_data_path = 'data/pan12-sexual-predator-identification-training-corpus-2012-05-01/'

# conversation ID -> integer label read from the second column of sci_labels.csv
csv_dict = {}
with open(unfiltered_data_path + 'sci_labels.csv', 'r', newline='') as f:
    for row in csv.reader(f):
        # csv.reader yields [] for a blank line; skip it rather than crash.
        if row:
            csv_dict[row[0]] = int(row[1])

# The label file must describe as many conversations as the filtered corpus holds.
assert len(csv_dict) == filtered_num_conv

# Known predator author IDs (first column of each row), value 1 = present.
all_predators = {}
with open(unfiltered_data_path + 'pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt', 'r', newline='') as f:
    for row in csv.reader(f):
        if row:
            all_predators[row[0]] = 1

filtered_authors = {}       # author ID -> 1 for every author seen
filtered_predators = {}     # author ID -> 1 for authors listed in all_predators
filtered_num_suspicious_conv = 0

for conversation in filtered_root:
    conv_id = conversation.get('id')
    if csv_dict[conv_id] == 1:
        filtered_num_suspicious_conv += 1
    for message in conversation:
        author = message.find('author').text
        # Re-assigning an existing key is a no-op, so no membership test is needed.
        filtered_authors[author] = 1
        if author in all_predators:
            filtered_predators[author] = 1

filtered_num_authors = len(filtered_authors)
filtered_num_predators = len(filtered_predators)

print("Num of Conv: {}\r\nNum of suspicious conv: {}\r\nNum authors: {}\r\nNum predators: {}".format(
    filtered_num_conv,
    filtered_num_suspicious_conv,
    filtered_num_authors,
    filtered_num_predators))

Num of Conv: 14703
Num of suspicious conv: 901
Num authors: 25099
Num predators: 137


### Labeling test data

In [8]:
# Label every conversation of the test corpus (1 = at least one message by a
# ground-truth predator, 0 = none), write the labels to sci_labels.csv and
# print summary statistics.
# NOTE(review): this cell labels the test corpus as parsed; no pre-filtering
# step is applied here — confirm that is intended.
test_data_src = "data/pan12-sexual-predator-identification-test-corpus-2012-05-21/"
test_xml = ET.parse(test_data_src + 'pan12-sexual-predator-identification-test-corpus-2012-05-17.xml')
root = test_xml.getroot()

# Ground-truth predator IDs (first column of each row), value 1 = present.
all_predators = {}
with open(test_data_src + 'pan12-sexual-predator-identification-groundtruth-problem1.txt', 'r', newline='') as f:
    for row in csv.reader(f):
        # csv.reader yields [] for a blank line; skip it rather than crash.
        if row:
            all_predators[row[0]] = 1

# metric
test_num_conv = len(root)
test_num_suspicious_conv = 0
test_num_predators = len(all_predators)

csv_labels = []   # rows of [conversation id, label]
authors = {}      # author ID -> 1 for every author seen in the test corpus
for conversation in root:
    suspicious = False
    for message in conversation:
        author = message.find('author').text
        authors[author] = 1
        if author in all_predators:
            suspicious = True
    label = 1 if suspicious else 0
    csv_labels.append([conversation.get('id'), label])
    test_num_suspicious_conv += label

with open(test_data_src + 'sci_labels.csv', 'w', newline='') as f:
    csv.writer(f).writerows(csv_labels)

test_num_authors = len(authors)
print("Num of Conv: {}\r\nNum of suspicious conv: {}\r\nNum authors: {}\r\nNum predators: {}".format(
    test_num_conv,
    test_num_suspicious_conv,
    test_num_authors,
    test_num_predators))

Num of Conv: 155128
Num of suspicious conv: 3737
Num authors: 218702
Num predators: 254
