In [2]:
# Let's start by writing a function for reading the PAN shared task data. The data is in XML form,
# one file per author/bot, and each file contains several tweets. There are also two files containing
# the ground truth: 'truth-train.txt' and 'truth-dev.txt'.

import tqdm 
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from tfidf_kingdom import *
import json
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from os import listdir
from os.path import isfile, join
import pandas as pd
import argparse
import os
from scipy import io

def _read_truth_file(truth_path):
    """Parse a PAN ground-truth file into ``{author_id: (type, gender)}``.

    Every non-blank line is split on ``:::``; the first field is used as the
    author id, the second and third (whitespace-stripped) as type and gender.
    """
    truth = {}
    # The context manager closes the handle even if a malformed line raises.
    with open(truth_path) as truth_file:
        for line in truth_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            fields = line.split(':::')
            truth[fields[0]] = (fields[1].strip(), fields[2].strip())
    return truth


def read_gender(text_path):
    """Read one language folder of the PAN corpus into documents and labels.

    The folder must contain ``truth-train.txt`` and ``truth-dev.txt`` plus one
    ``*.xml`` file per author. All ``<document>`` texts of a file are cleaned
    with ``beautify``, flattened to one line each and joined with newlines
    into a single document per author.

    Args:
        text_path: path of the folder holding the truth files and XML files.

    Returns:
        A tuple of eight lists:
        ``(train_documents, train_labels, test_documents, test_labels,
        gender_train_documents, gender_train_labels,
        gender_test_documents, gender_test_labels)``.
        Documents are strings; labels are ``{'type': ..., 'gender': ...}``
        dicts. The ``gender_*`` lists hold only the authors whose type is not
        ``'bot'``. Authors listed in neither truth file are left out.

    Notes:
        Files that cannot be read or parsed are reported and skipped. Authors
        whose concatenated text is empty after stripping whitespace are
        skipped and their id is printed.
    """
    train_dict = _read_truth_file(join(text_path, 'truth-train.txt'))
    test_dict = _read_truth_file(join(text_path, 'truth-dev.txt'))

    # Sorted so that document order does not depend on the arbitrary order
    # in which the operating system lists the directory.
    files = sorted(
        join(text_path, f)
        for f in listdir(text_path)
        if isfile(join(text_path, f)) and f.endswith('xml')
    )

    train_documents = []
    train_labels = []
    test_documents = []
    test_labels = []

    gender_train_documents = []
    gender_train_labels = []
    gender_test_documents = []
    gender_test_labels = []

    # Loop over the XML files, one per author.
    for file in files:
        # The author id is the file name without directory and extension;
        # os.path handles ids containing dots and platform path separators.
        name = os.path.splitext(os.path.basename(file))[0]
        try:
            tree = ET.parse(file)
        except (ET.ParseError, OSError) as err:
            # Best effort: a broken file must not abort the whole run, but it
            # should not disappear silently either.
            print('Skipping unreadable file {}: {}'.format(file, err))
            continue
        root = tree.getroot()

        in_train = name in train_dict
        in_test = name in test_dict
        # If an id is listed in both truth files, the dev entry wins.
        if in_test:
            author_type, gender = test_dict[name]
        elif in_train:
            author_type, gender = train_dict[name]

        # Concatenate all tweets in the file into a single text document,
        # one tweet per line.
        tweets = []
        for document in root.iter('document'):
            if document.text:
                txt = beautify(document.text)
                tweets.append(txt.replace("\n", " ").replace("\t", " "))
        document_text = "".join(tweet + "\n" for tweet in tweets).strip()

        # Skip authors without any text. Checking the stripped text also
        # drops whitespace-only documents.
        if not document_text:
            print(name)
            continue

        # Build the train and development sets. Human-written data is also
        # collected separately for training the gender classifier.
        if in_train:
            if author_type != 'bot':
                gender_train_documents.append(document_text)
                gender_train_labels.append({'type': author_type, 'gender': gender})
            train_documents.append(document_text)
            train_labels.append({'type': author_type, 'gender': gender})
        if in_test:
            if author_type != 'bot':
                gender_test_documents.append(document_text)
                gender_test_labels.append({'type': author_type, 'gender': gender})
            test_documents.append(document_text)
            test_labels.append({'type': author_type, 'gender': gender})

    print('Train size: ', len(train_documents))
    print('Val size: ', len(test_documents))
    return train_documents, train_labels, test_documents, test_labels, gender_train_documents, gender_train_labels, gender_test_documents, gender_test_labels

# remove html tags, used in PAN corpora
def beautify(text):
    """Return the plain text of ``text`` with markup tags removed.

    The string is parsed with BeautifulSoup's ``html.parser`` backend and the
    text content of the resulting tree is returned.
    """
    soup = BeautifulSoup(text, 'html.parser')
    plain_text = soup.get_text()
    return plain_text

In [3]:
# Next, let's write a function that takes the train and development documents and labels
# produced by the previous function and converts them into feature matrices that we
# can feed to the classifier.


def parse_feeds(fname):
    """Turn one corpus folder into feature matrices for both tasks.

    The folder is read with ``read_gender``. The same pipeline is then run
    twice: once on all documents (bot detection) and once on the documents
    that ``read_gender`` returned for the gender task.

    Args:
        fname: path of the corpus folder, passed unchanged to ``read_gender``.

    Returns:
        Two 5-tuples ``(bot, gender)``, each laid out as
        ``(train_matrix, test_matrix, train_labels, test_labels, vectorizer)``.
    """
    def _vectorize(banner, train_docs, test_docs):
        # Preprocess both splits, fit the feature extractor on the training
        # split only, then transform both splits with it. Preprocessing and
        # features live in the tfidf_kingdom module (build_dataframe and
        # get_tfidf_features).
        print(banner)
        print("Computing MM")
        train_frame = build_dataframe(train_docs)
        test_frame = build_dataframe(test_docs)
        print('Dataframe built')

        fitted = get_tfidf_features(train_frame)
        train_matrix = fitted.transform(train_frame)
        test_matrix = fitted.transform(test_frame)
        print("Num features: ", train_matrix.shape[1])
        print('Dataframe vectorized')
        return train_matrix, test_matrix, fitted

    (all_train_docs, all_train_labels,
     all_test_docs, all_test_labels,
     human_train_docs, human_train_labels,
     human_test_docs, human_test_labels) = read_gender(fname)

    # Bot detection uses every document.
    bot_train, bot_test, bot_vectorizer = _vectorize(
        "Bot vectorizer", all_train_docs, all_test_docs)

    # Gender classification uses only the human-written subsample.
    gender_train, gender_test, gender_vectorizer = _vectorize(
        "Gender vectorizer", human_train_docs, human_test_docs)

    bot_bundle = (bot_train, bot_test, all_train_labels, all_test_labels, bot_vectorizer)
    gender_bundle = (gender_train, gender_test, human_train_labels, human_test_labels, gender_vectorizer)
    return bot_bundle, gender_bundle
 

In [9]:
# We have everything we need for feature generation, so let's run it. First, define the path to the
# train corpus and the output folder that will contain all the feature files.
train_corpus = "../../data/pan19-author-profiling-training-2019-02-18/"
feature_folder = "../train_data"

# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(feature_folder, exist_ok=True)

# PAN data comes in two languages; build separate features for each and save them in the feature folder.
# This might take a while, so be patient. Also, ignore the Beautiful Soup warnings :)
for lang in ['en', 'es']:
    data_path = os.path.join(train_corpus, lang)
    bot, gender = parse_feeds(data_path)

    # Both tasks are written with the same file layout, so handle them in one loop.
    for task, task_data in (("bot", bot), ("gender", gender)):
        train_instances, test_instances, train_labels, test_labels, vectorizer = task_data
        out_obj = {"train_features": train_instances, "test_features": test_instances}
        suffix = "_" + task + "_" + lang

        # "with" closes each file even if pickling fails part-way through.
        with open(feature_folder + "/train_labels" + suffix + ".pickle", 'wb') as outfile:
            pickle.dump(train_labels, outfile)

        with open(feature_folder + "/test_labels" + suffix + ".pickle", 'wb') as outfile:
            pickle.dump(test_labels, outfile)

        with open(feature_folder + "/vectorizer" + suffix + ".pickle", 'wb') as outfile:
            pickle.dump(vectorizer, outfile)

        # Train and test feature matrices go together into a single .mat file.
        io.savemat(feature_folder + "/train_instances" + suffix + ".mat", out_obj)






































































..." looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.












Train size:  2880
Val size:  1240
Bot vectorizer
Computing MM
Dataframe built
Num features:  126748
Dataframe vectorized
Gender vectorizer
Computing MM
Dataframe built
Num features:  74510
Dataframe vectorized


















































🙃🙃🙃" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.



@attaque77ok" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.




































🤣" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.








#kudai" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.














Train size:  2080
Val size:  920
Bot vectorizer
Computing MM
Dataframe built
Num features:  103564
Dataframe vectorized
Gender vectorizer
Computing MM
Dataframe built
Num features:  62778
Dataframe vectorized


In [13]:
# Great, feature engineering is done and the features have been written to the feature folder.
# Now, let's write the code that will read the features and labels and then feed them to the classifier.
# We will need to import some libraries for that, so let's do that first.
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import numpy as np
from sklearn.metrics import accuracy_score
import logging
from sklearn.externals import joblib
# Show timestamped log messages at INFO level; the evaluation loop reports accuracies via logging.info.
logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
logging.getLogger().setLevel(logging.INFO)

import numpy
# Seed numpy's global generator with a fixed value. Calling seed() without an
# argument re-seeds from fresh entropy, which gives a different state on every run.
RANDOM_SEED = 42
numpy.random.seed(RANDOM_SEED)


In [14]:
# We will also need a function for reading the labels from the files we saved during feature engineering.
def load_labels(label_pickle,test_labels):
    """Load pickled train/test labels and encode them as integer vectors.

    Both files are expected to hold an iterable of dicts with the keys
    ``'type'`` and ``'gender'``.

    Args:
        label_pickle: path of the pickle file with the training labels.
        test_labels: path of the pickle file with the test labels.

    Returns:
        ``(label_object, encoders)`` where ``label_object`` maps ``'gender'``
        and ``'type'`` to a ``(train_vector, test_vector)`` pair of encoded
        labels, and ``encoders`` is ``(encoder_type, encoder_gender)``.
    """
    def _split_fields(records):
        # One pass over the records, collecting both label fields in order.
        types, genders = [], []
        for record in records:
            types.append(record['type'])
            genders.append(record['gender'])
        return types, genders

    # NOTE: pickle.load executes arbitrary code; only use files written by this notebook.
    with open(label_pickle, "rb") as train_handle:
        train_records = pickle.load(train_handle)

    with open(test_labels, "rb") as test_handle:
        test_records = pickle.load(test_handle)

    type_train, gender_train = _split_fields(train_records)
    type_test, gender_test = _split_fields(test_records)

    # Each encoder is fitted on train and test labels together so that both
    # splits share a single label-to-integer mapping.
    encoder_type = preprocessing.LabelEncoder().fit(type_train + type_test)
    encoder_gender = preprocessing.LabelEncoder().fit(gender_train + gender_test)

    label_object = {
        'gender': (encoder_gender.transform(gender_train),
                   encoder_gender.transform(gender_test)),
        'type': (encoder_type.transform(type_train),
                 encoder_type.transform(type_test)),
    }

    return label_object, (encoder_type, encoder_gender)

In [16]:
# Finally, train the models on the train set and evaluate them on the validation set.
# Loop through both languages (English and Spanish) and both tasks (bot identification and
# gender classification).
for lang in ['en', 'es']:
    for task in ['bot', 'gender']:
        # For each task in a specific language, read the labels and data.
        suffix = "_" + task + "_" + lang
        fname = feature_folder + "/train_instances" + suffix + ".mat"
        labels_train = feature_folder + "/train_labels" + suffix + ".pickle"
        labels_test = feature_folder + "/test_labels" + suffix + ".pickle"

        dmat = sio.loadmat(fname)
        train_features = dmat['train_features']
        test_features = dmat['test_features']
        label_vectors, encoders = load_labels(labels_train, labels_test)
        encoder_type, encoder_gender = encoders

        # The bot task is trained on the 'type' labels, the gender task on the 'gender' labels.
        # Save only the encoder that matches the task. Writing both encoders to this one path
        # would leave the gender encoder on disk even for the bot model.
        task_encoder = encoder_type if task == 'bot' else encoder_gender
        with open(feature_folder + "/encoder" + suffix + ".pickle", 'wb') as outfile:
            pickle.dump(task_encoder, outfile)

        preds = {}
        print('Evaluation on task and language: ', task, lang )
        # Train the classification model and test it on the evaluation set.
        for target, vals in label_vectors.items():

            if target == task or (task == 'bot' and target == "type"):
                train_labels = vals[0]
                test_labels = vals[1]

                # Train the model
                clf = LogisticRegression(C=1e2, fit_intercept=False)
                clf.fit(train_features, train_labels)
                joblib.dump(clf, feature_folder + '/trained_LR_' + task + "_" + lang + '.pkl')

                # Generate predictions on the evaluation set
                predictions = clf.predict(test_features)

                # Get the accuracy (scikit-learn's argument order is y_true, y_pred)
                accuracy = accuracy_score(test_labels, predictions)
                logging.info("{} Performed with {}".format(target, accuracy))
                preds[target] = accuracy

        # Reciprocal of the summed reciprocal accuracies; guarded so that an
        # accuracy of zero cannot trigger a division by zero.
        if preds and all(sc > 0 for sc in preds.values()):
            total_score = 1/np.sum([1/sc for sc in preds.values()])
        else:
            total_score = 0.0

Evaluation on task and language:  bot en


14-Jan-21 12:21:49 - type Performed with 0.9016129032258065


Evaluation on task and language:  gender en


14-Jan-21 12:21:51 - gender Performed with 0.7951612903225806


Evaluation on task and language:  bot es


14-Jan-21 12:21:54 - type Performed with 0.8804347826086957


Evaluation on task and language:  gender es


14-Jan-21 12:21:55 - gender Performed with 0.6695652173913044
