In [7]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import datetime
import re
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.svm import LinearSVC

# Building a corpus of concatenated conversations between users
Each row in the data corpus represents one complete chat between users.

In [3]:
def get_labels_dict(data_path):
    """Load the id -> label mapping stored in ``sci_labels.csv``.

    Args:
        data_path: Path prefix that is concatenated directly with the file
            name ``sci_labels.csv``, so it must end with a path separator.

    Returns:
        dict mapping the first CSV column of each row to the second column.
        Both are kept as the strings read from the file.

    Raises:
        IndexError: if a non-empty row has fewer than two columns.
    """
    labels_dict = {}
    # The csv module asks for newline='' so that line endings embedded in
    # quoted fields are handled by the reader rather than by open().
    with open(data_path + 'sci_labels.csv', 'r', newline='') as f:
        for row in csv.reader(f):
            # Blank lines (for example a trailing newline) parse as an empty
            # list; skip them instead of failing on row[0].
            if not row:
                continue
            labels_dict[row[0]] = row[1]
    return labels_dict


def get_features_labels(root, labels_dict):
    """Build one document string and one integer label per conversation.

    Args:
        root: Iterable of conversation elements; each conversation is itself
            iterated to get its message elements, and its ``id`` attribute is
            used as the key into ``labels_dict``.
        labels_dict: Mapping from conversation id to a value accepted by
            ``int()``.

    Returns:
        (corpus, labels): ``corpus[i]`` is a single space followed by every
        non-missing message text of conversation ``i``, each prefixed with
        ``"\\r\\n"``; ``labels[i]`` is the integer label of that conversation.

    Raises:
        KeyError: if a conversation id is not present in ``labels_dict``.
    """
    corpus = []  # each row is a string formed from all messages in a conversation
    labels = []  # each row is the int label for the same row in corpus

    for conversation in root:
        # Collect the pieces and join once instead of growing a string with
        # repeated concatenation.
        parts = [" "]
        for message in conversation:
            text_node = message.find('text')
            # A message may have no <text> child, or an empty one whose
            # .text is None; both contribute nothing to the document.
            if text_node is None or text_node.text is None:
                continue
            parts.append("\r\n" + text_node.text)
        corpus.append("".join(parts))
        labels.append(int(labels_dict[conversation.get('id')]))
    return corpus, labels

In [4]:
# Directory prefixes of the training and test corpora (relative paths; the
# trailing slash matters because file names are appended by concatenation).
train_data_path = "data/pan12-sexual-predator-identification-training-corpus-2012-05-01/"
test_data_path = 'data/pan12-sexual-predator-identification-test-corpus-2012-05-21/'

# Parse both XML files and keep the root elements, whose children are
# iterated as conversations by get_features_labels.
training_xml = ET.parse(f"{train_data_path}training_data.xml")
test_xml = ET.parse(f"{test_data_path}pan12-sexual-predator-identification-test-corpus-2012-05-17.xml")
train_root = training_xml.getroot()
test_root = test_xml.getroot()

# One document string and one label per conversation, for each split.
train_corpus, train_labels = get_features_labels(
    root=train_root,
    labels_dict=get_labels_dict(train_data_path),
)
test_corpus, test_labels = get_features_labels(
    root=test_root,
    labels_dict=get_labels_dict(test_data_path),
)

# TF-IDF on data
We now represent every conversation as a bag of words (BOW) using the TF-IDF weighting scheme.

In [5]:
# Fit the TF-IDF vectorizer on the training conversations only, then reuse
# the fitted vectorizer to transform the test conversations.
vectorizer = TfidfVectorizer()
X_train = scipy.sparse.csr_matrix(vectorizer.fit_transform(train_corpus))
X_test = scipy.sparse.csr_matrix(vectorizer.transform(test_corpus))

# Labels as NumPy arrays, row-aligned with the matrices above.
y_train = np.array(train_labels)
y_test = np.array(test_labels)

In [9]:
# SVM: train a LinearSVC on the TF-IDF features and report test accuracy.
model = LinearSVC(C=2.9, loss='squared_hinge', dual=True, random_state=0)
model.fit(X_train, y_train)
pred_y = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, pred_y)
print(accuracy)

0.9851026249290908


In [10]:
from sklearn.metrics import classification_report

In [11]:
# Display names for the report rows; presumably label 0 maps to the first
# name and label 1 to the second (verify against the label file).
target_names = ['Victim', 'Predator']
report = classification_report(y_test, pred_y, target_names=target_names)
print(report)

              precision    recall  f1-score   support

      Victim       0.99      1.00      0.99    151391
    Predator       0.76      0.55      0.64      3737

    accuracy                           0.99    155128
   macro avg       0.88      0.77      0.82    155128
weighted avg       0.98      0.99      0.98    155128



In [12]:
# Linear-kernel SVM (svm.SVC), evaluated the same way as the LinearSVC above.
# gamma is not passed: it is the kernel coefficient for the rbf/poly/sigmoid
# kernels only, so it has no effect when kernel='linear'.
# NOTE: this rebinds `model` and `pred_y`; later cells see the SVC, not the
# LinearSVC.
model = svm.SVC(kernel='linear', C=1, random_state=0)
model.fit(X_train, y_train)
pred_y = model.predict(X_test)
print(metrics.accuracy_score(y_test, pred_y))
print(classification_report(y_test, pred_y, target_names=target_names))

0.9856054355113197
              precision    recall  f1-score   support

      Victim       0.99      1.00      0.99    151391
    Predator       0.79      0.55      0.65      3737

    accuracy                           0.99    155128
   macro avg       0.89      0.77      0.82    155128
weighted avg       0.98      0.99      0.98    155128



# Evaluate the model trained on the PAN12 corpus using the PJ dataset as a test set

In [17]:
from os import listdir
from os.path import isfile, join
import warnings
# Directory containing the PJ chat-log files.
mypath = 'GeneralData'
# Keep regular files only and leave out the DTD, which is not a chat log.
# Filtering here (rather than list.remove) means a directory without
# 'chatlog.dtd' does not raise ValueError. listdir order is arbitrary, so the
# names are sorted to make the row order of the corpus reproducible.
onlyfiles = sorted(
    f for f in listdir(mypath)
    if isfile(join(mypath, f)) and f != 'chatlog.dtd'
)


In [31]:
corpus_pj = []  # each row is a string formed from all message bodies in one chat-log file
labels_pj = []  # label for the same row in corpus_pj; every PJ file is labelled 1
for file in onlyfiles:
    # Read from the same directory that was listed to build `onlyfiles`.
    # The COMMENT column is never used, so it is simply ignored rather than
    # dropped (dropping raised KeyError for files without that column).
    log_df = pd.read_xml(join(mypath, file))
    # Keep only rows that have both a message body and a username.
    log_df = log_df[log_df['BODY'].notna() & log_df['USERNAME'].notna()]
    # Same document layout as get_features_labels: a leading space, then each
    # message prefixed with "\r\n". str() guards against bodies that the
    # parser inferred as numbers.
    document = " " + "".join("\r\n" + str(body) for body in log_df['BODY'])
    corpus_pj.append(document)
    labels_pj.append(1)

In [34]:
# Project the PJ documents with the already-fitted vectorizer (no refit).
# NOTE: this rebinds X_test / y_test, replacing the earlier test split;
# re-running the earlier evaluation cells afterwards will use the PJ data.
X_test = scipy.sparse.csr_matrix(vectorizer.transform(corpus_pj))
y_test = np.array(labels_pj)

In [42]:
# Predict with whichever `model` was fitted last. All PJ labels are 1, so
# accuracy here equals the fraction of PJ logs predicted as 1.
pred_y = model.predict(X_test)
pj_accuracy = metrics.accuracy_score(y_test, pred_y)
print(pj_accuracy)
# Count of predictions per class (displayed as the cell output).
np.unique(pred_y, return_counts=True)

0.9642857142857143


(array([0, 1]), array([ 2, 54], dtype=int64))