In [1]:
# Mount Google Drive inside the Colab VM so that files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive/')
# Change into the project folder; later cells open "blogtext.csv" and "styles.pt"
# by relative path, so they resolve against this directory.
%cd /content/drive/MyDrive/NLP

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/NLP


In [2]:
!pip install textstat

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import SyllableTokenizer
from textstat import flesch_reading_ease, gunning_fog
import string

In [4]:
# Fetch the NLTK data used by sent_tokenize / word_tokenize and nltk.pos_tag.
# Recent NLTK releases (3.9+) load 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the older pickled 'punkt' / 'averaged_perceptron_tagger' packages, so
# both generations are requested to keep the notebook runnable across versions.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.cluster import KMeans
from sklearn.metrics import v_measure_score

In [6]:
def extract_stylistic_features(text):
    """Compute a 9-element stylometric feature vector for one document.

    The returned values are, in order:

        0. average sentence length, in tokens
        1. vocabulary richness (distinct tokens / total tokens, case-sensitive)
        2. number of characters that are in ``string.punctuation``
        3. number of tokens whose POS tag starts with ``NN`` (nouns)
        4. number of tokens whose POS tag starts with ``VB`` (verbs)
        5. number of tokens whose POS tag starts with ``JJ`` (adjectives)
        6. number of tokens whose POS tag starts with ``RB`` (adverbs)
        7. Flesch reading-ease score (textstat)
        8. Gunning fog index (textstat)

    Args:
        text: The document to analyse, as a string.

    Returns:
        A 1-D ``torch.Tensor`` holding the nine features listed above.
    """
    # Small constant added to every denominator so that text with no sentences
    # or tokens yields 0 instead of raising ZeroDivisionError.
    epsilon = 0.001

    # Sentence length feature: each sentence is tokenized on its own.
    sentence_lengths = [len(word_tokenize(sentence)) for sentence in sent_tokenize(text)]
    avg_sentence_length = sum(sentence_lengths) / (len(sentence_lengths) + epsilon)

    # Tokenize the full text once; the same token list feeds both the
    # vocabulary-richness ratio and the POS tagger.
    words = word_tokenize(text)
    vocabulary_richness = len(set(words)) / (len(words) + epsilon)

    # Punctuation usage feature
    punctuation_chars = set(string.punctuation)
    num_punctuation = sum(1 for char in text if char in punctuation_chars)

    # Syntactic patterns feature: count tags by their two-letter prefix in a
    # single pass. Comparing tag[:2] is equivalent to tag.startswith(prefix)
    # because every prefix here is exactly two characters long.
    prefix_counts = {'NN': 0, 'VB': 0, 'JJ': 0, 'RB': 0}
    for _, tag in nltk.pos_tag(words):
        prefix = tag[:2]
        if prefix in prefix_counts:
            prefix_counts[prefix] += 1

    # Readability measures
    flesch_score = flesch_reading_ease(text)
    gunning_fog_score = gunning_fog(text)

    return torch.tensor((
        avg_sentence_length,
        vocabulary_richness,
        num_punctuation,
        prefix_counts['NN'],
        prefix_counts['VB'],
        prefix_counts['JJ'],
        prefix_counts['RB'],
        flesch_score,
        gunning_fog_score,
    ))


In [7]:
# Read the blog corpus from the working directory.
data = pd.read_csv("blogtext.csv")
# Work on the first 6593 rows only.
sample = data.iloc[:6593]
# Feature matrix: one row per sampled document, one column for each of the
# 9 values returned by extract_stylistic_features.
styles = torch.zeros(len(sample.text), 9)

In [8]:
# When True, reuse the feature matrix cached in "styles.pt"; set to False to
# recompute the features from the raw text and refresh the cache.
pretrained = True
if pretrained:
    # NOTE: torch.load unpickles the file, so only load a styles.pt that was
    # produced by this notebook (or another trusted source).
    styles = torch.load("styles.pt")
else:
    # Walk the texts by position rather than by index label, so the loop stays
    # aligned with the rows of `styles` even if `sample` does not start at label 0.
    for i, text in enumerate(tqdm(sample.text)):
        styles[i] = extract_stylistic_features(text)
    torch.save(styles, "styles.pt")

In [9]:
# Scale each feature column (dim=0 runs across documents) to unit L2 norm
# before the features are clustered.
normalized = F.normalize(styles, p=2.0, dim=0)

In [10]:
# Cluster the normalized style vectors into 10 groups and score the clustering
# against the `id` column of the sample with the V-measure.
# random_state pins the centroid initialisation so reruns reproduce the same score;
# n_init is given explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(normalized)
clusters = kmeans.labels_
v_measure_score(sample.id, clusters)



0.13320694019341872