In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import torch
import torch.nn as nn
from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer
from collections import Counter

In [2]:
df = pd.read_csv('blogtext.csv')

# Build the word tokenizer once; constructing it inside the lambda would
# re-create it for every post.
word_tokenizer = RegexpTokenizer(r'\w+')

# Number of \w+ tokens in each blog post
text_len = df['text'].apply(lambda post: len(word_tokenizer.tokenize(post))).to_numpy()

# Store the word count alongside the post
df['text_len'] = text_len

# Keep only posts with more than 100 words (a post of exactly 100 is dropped)
df = df[df['text_len'] > 100]

# Count the remaining posts of each author
authors = df['id'].to_numpy()
author_id, counts = np.unique(authors, return_counts=True)

# Keep only authors with more than 10 remaining posts (exactly 10 is dropped)
valid_authors = author_id[counts > 10]

# .copy() gives an independent frame: df is assigned into below and in later
# cells, and assigning into a boolean-mask selection can trigger pandas'
# SettingWithCopyWarning.
df = df[df['id'].isin(valid_authors)].copy()

# Strip leading/trailing whitespace from the post text
df['text'] = df['text'].str.strip()

In [3]:
# Characters whose relative frequencies are used as features. The order of
# each list fixes the column order of the corresponding frequency matrix.
punctuation = ['.', ',', '!', '?', ':', ';', '-', '(', ')', '[', ']', '{', '}', "'", '"']
letters = list('abcdefghijklmnopqrstuvwxyz')
digits = list('0123456789')
# Words strictly shorter than this many characters count as "short"
short_word_len = 4

# Stylometric features, one entry (or row) per post, in the positional order of df
n_posts = len(df)
word_len = np.zeros(n_posts)            # mean word length
sentence_len = np.zeros(n_posts)        # mean number of space-separated pieces per sentence
short_words = np.zeros(n_posts)         # count of short words
digit_prop = np.zeros(n_posts)          # share of all-digit words
captialized_prop = np.zeros(n_posts)    # share of words starting with an uppercase character
letter_freq = np.zeros((n_posts, len(letters)))
digit_freq = np.zeros((n_posts, len(digits)))
punctuation_freq = np.zeros((n_posts, len(punctuation)))
hapax_legomena = np.zeros(n_posts)      # number of words occurring exactly once

# Loop-invariant work hoisted out of the loop: a single tokenizer instance and
# the stored word counts as a plain array (avoids a per-row .iloc lookup).
word_tokenizer = RegexpTokenizer(r'\w+')
post_word_counts = df['text_len'].to_numpy()

for i, text in enumerate(tqdm(df['text'], total=n_posts)):
    sentences = nltk.sent_tokenize(text)
    words = word_tokenizer.tokenize(text)

    # Sanity guards: on the first post without words or sentences the loop
    # stops, and every row from i onward keeps its zero initial value.
    if len(words) == 0:
        print(f"Empty word length in post {i}")
        break

    if len(sentences) == 0:
        print(f"Empty sentence in post {i}")
        break

    word_lengths = [len(word) for word in words]
    # split(' ') is kept as-is: consecutive spaces produce empty pieces that are counted
    sentence_lengths = [len(sentence.split(' ')) for sentence in sentences]

    word_len[i] = np.mean(word_lengths)
    short_words[i] = sum(1 for length in word_lengths if length < short_word_len)
    sentence_len[i] = np.mean(sentence_lengths)

    # Case-insensitive character counts; Counter yields 0 for characters that
    # never occur, so no membership test is needed.
    character_counts = Counter(text.lower())
    letter_freq[i] = [character_counts[letter] for letter in letters]
    digit_freq[i] = [character_counts[digit] for digit in digits]
    punctuation_freq[i] = [character_counts[punct] for punct in punctuation]

    # Turn counts into within-group proportions; the epsilon avoids 0/0 when a
    # post contains none of the characters in a group.
    letter_freq[i] /= letter_freq[i].sum() + 1e-10
    digit_freq[i] /= digit_freq[i].sum() + 1e-10
    punctuation_freq[i] /= punctuation_freq[i].sum() + 1e-10

    hapax_legomena[i] = sum(1 for count in Counter(words).values() if count == 1)

    # Use a distinct name here: rebinding `text_len` would overwrite the
    # module-level per-post word-count array with a scalar.
    n_words = post_word_counts[i]

    digit_prop[i] = sum(1 for word in words if word.isdigit()) / n_words
    captialized_prop[i] = sum(1 for word in words if word[0].isupper()) / n_words

100%|██████████| 316594/316594 [02:14<00:00, 2350.58it/s]


In [4]:
# Mean and standard deviation of each feature that gets standardised.
# NOTE(review): these statistics are taken over every post, i.e. before the
# train/test split performed in a later cell - confirm that this is intended.
mean_word_len, std_word_len = word_len.mean(), word_len.std()
mean_sentence_len, std_sentence_len = sentence_len.mean(), sentence_len.std()
mean_short_words, std_short_words = short_words.mean(), short_words.std()
mean_hapax_legomena, std_hapax_legomena = hapax_legomena.mean(), hapax_legomena.std()

# Z-score each feature: subtract its mean, then divide by its standard deviation
word_len = (word_len - mean_word_len) / std_word_len
sentence_len = (sentence_len - mean_sentence_len) / std_sentence_len
short_words = (short_words - mean_short_words) / std_short_words
hapax_legomena = (hapax_legomena - mean_hapax_legomena) / std_hapax_legomena

In [5]:
# Attach the per-post scalar features as dataframe columns.
# Dict insertion order determines the order in which the columns are added.
scalar_features = {
    'word_len': word_len,
    'sentence_len': sentence_len,
    'short_words': short_words,
    'digit_prop': digit_prop,
    'captialized_prop': captialized_prop,
    'hapax_legomena': hapax_legomena,
}
for column_name, values in scalar_features.items():
    df[column_name] = values

# One column per tracked character, named after the character itself and
# filled from the matching column of the frequency matrix.
for col, letter in enumerate(letters):
    df[f'letter_freq_{letter}'] = letter_freq[:, col]
for col, digit in enumerate(digits):
    df[f'digit_freq_{digit}'] = digit_freq[:, col]
for col, punct in enumerate(punctuation):
    df[f'punctuation_freq_{punct}'] = punctuation_freq[:, col]

In [6]:
# Remove stop words
# English stop-word list from NLTK, held in a set for fast membership tests
stop_words = {*stopwords.words('english')}
def remove_stop_words(text, stop_word_set=None):
    """Return `text` with stop words removed.

    The text is split on whitespace, and the surviving words are re-joined
    with single spaces. Words are compared case-insensitively: the stop-word
    list is expected to be lowercase, so a case-sensitive test would leave
    words such as "The" or "I" in place.

    Parameters
    ----------
    text : str
        The text to filter.
    stop_word_set : collection of str, optional
        Lowercase stop words to remove. Defaults to the module-level
        `stop_words` set.

    Returns
    -------
    str
        The remaining words, in their original case, separated by single spaces.
    """
    vocabulary = stop_words if stop_word_set is None else stop_word_set
    return ' '.join(word for word in text.split() if word.lower() not in vocabulary)
# Filter the stop words out of every post, element by element
df['text'] = df['text'].map(remove_stop_words)

In [7]:
# Drop the metadata columns that are not needed.
# errors='ignore' keeps this cell re-runnable: df is reassigned here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['gender', 'age', 'topic', 'sign', 'date'], errors='ignore')

# Split the dataset into training and testing sets by author, so that every
# author's posts end up entirely in one of the two sets.
authors = df['id'].unique()

# Fixed seed so the shuffled author order is reproducible
np.random.seed(42)
np.random.shuffle(authors)

train_size = 0.8
# Index of the first test author, computed once and used for both slices
split_at = int(len(authors) * train_size)
train_authors = authors[:split_at]
test_authors = authors[split_at:]

train_df = df[df['id'].isin(train_authors)]
test_df = df[df['id'].isin(test_authors)]

# Save the training and testing sets
train_df.to_csv('blogtext_train.csv', index=False)
test_df.to_csv('blogtext_test.csv', index=False)

In [8]:
# # Get top 50 authors with the most posts
# top_authors = df['id'].value_counts().nlargest(50).index

# # Only keep the posts from the top 50 authors
# df_50 = df[df['id'].isin(top_authors)]

# # Create training and testing sets for the top 50 authors
# # Shuffle the dataframe
# df_50 = df_50.sample(frac=1, random_state=42).reset_index(drop=True)
# # Split the dataset into training and testing sets
# train_size = 0.8
# train_df_50 = df_50[:int(len(df_50) * train_size)]
# test_df_50 = df_50[int(len(df_50) * train_size):]

# # Save the training and testing sets
# train_df_50.to_csv('blogtext_train_50.csv', index=False)
# test_df_50.to_csv('blogtext_test_50.csv', index=False)

In [9]:
# print(mean_word_len, std_word_len)
# print(mean_sentence_len, std_sentence_len)
# print(mean_short_words, std_short_words)
# print(mean_hapax_legomena, std_hapax_legomena)