In [1]:
## Imports
import pandas as pd

import spacy
from collections import Counter



**Read files**

In [2]:
## Get the aligned sentences

original_sentences = '../../../datasets_original/ew-sew_v2_sentence-aligned/sentence-aligned.v2/normal.aligned'
simple_sentences = '../../../datasets_original/ew-sew_v2_sentence-aligned/sentence-aligned.v2/simple.aligned'

In [3]:
# Read the first TSV file
original_df = pd.read_csv(original_sentences, sep='\t', header=None, names=['unimportant_string_1', 'unimportant_number_1', 'original_sentence'])

# Read the second TSV file
simple_df = pd.read_csv(simple_sentences, sep='\t', header=None, names=['unimportant_string_2', 'unimportant_number_2', 'simple_sentence'])

# Extract the relevant columns
original_column = original_df[['original_sentence']]
simple_column = simple_df[['simple_sentence']]

# Combine the DataFrames into one
result_df = pd.concat([original_column, simple_column], axis=1)

result_df


Unnamed: 0,original_sentence,simple_sentence
0,It is the county seat of Alfalfa County .,It is the county seat of Alfalfa County .
1,"Cherokee is a city in Alfalfa County , Oklahom...",Cherokee is a city of Oklahoma in the United S...
2,Skateboard decks are usually between 28 and 33...,Skateboard decks are normally between 28 and 3...
3,The underside of the deck can be printed with ...,The bottom of the deck can be printed with a d...
4,This was created by two surfers ; Ben Whatson ...,The longboard was made by two surfers ; Ben Wh...
...,...,...
167684,Caffiers is a commune in the Pas-de-Calais dep...,Caffiers is a commune . It is found in the reg...
167685,The population was 549 at the 2000 census .,549 people were living in Orange as of 2000 .
167686,"Orange is a town in Juneau County , Wisconsin ...",Orange is a town of Juneau County in the state...
167687,Orainville is a commune in the Aisne departmen...,Orainville is a commune . It is found in the r...


*Get 50 original sentences*

In [4]:
for s in result_df["original_sentence"][:50]:
    print(s)

It is the county seat of Alfalfa County .
Cherokee is a city in Alfalfa County , Oklahoma , United States .
Skateboard decks are usually between 28 and 33 inches long .
The underside of the deck can be printed with a design by the manufacturer , blank , or decorated by any other means .
This was created by two surfers ; Ben Whatson and Jonny Drapper .
Some of them have special materials that help to keep the deck from breaking : such as fiberglass , bamboo , resin , Kevlar , carbon fiber , aluminum , and plastic .
`` Old school '' boards -LRB- those made in the 1970s â `` 80s or modern boards that mimic their shape -RRB- are generally wider and often have only one kicktail .
One of the first deck companies was called `` Drapped '' taken from Jonny 's second name .
Grip tape , when applied to the top surface of a skateboard , gives a skater 's feet grip on the deck .
Modern decks vary in size , but most are 7 to 10.5 inches wide .
Variants of the 1970s often have little or no concavity 

In [10]:
# Load the English model
nlp = spacy.load('en_core_web_trf')

def get_dependency_tree(doc):
    # Get the dependency structure in terms of (head, relation, dependent)
    return [(token.head.dep_, token.dep_, token.dep_) for token in doc if token.head != token]

def calculate_tree_diversity(parse_trees):
    # Flatten the list of trees and count unique structures
    flattened_trees = [tuple(tree) for trees in parse_trees for tree in trees]
    tree_counter = Counter(flattened_trees)
    
    # Diversity score: higher score means more unique structures
    diversity_score = len(tree_counter) / sum(tree_counter.values())
    
    return diversity_score

Parse Tree Diversity Score: 0.1565778853914447


In [25]:
candidate_sets = [result_df["original_sentence"].sample(n=1000, replace=False) for _ in range(50)]

sets_w_diversity_score = []

for num, candidate_set in enumerate(candidate_sets):

    # List of sentences
    sentences = candidate_set

    print(len(sentences))

    # Parse the sentences
    docs = [nlp(sentence) for sentence in sentences]

    print(len(docs))

    # Extract parse trees for all sentences
    parse_trees = [get_dependency_tree(doc) for doc in docs]

    print("done with the parse_trees")

    # Calculate diversity score
    diversity_score = calculate_tree_diversity(parse_trees)
    sets_w_diversity_score.append((num, diversity_score))
    print("Done with candidate ", num+1)
    
max_diversity = max(sets_w_diversity_score, key=lambda tup: tup[1])

print(sets_w_diversity_score)
print(max_diversity)

    


1000
1000
done with the parse_trees
Done with candidate  1
1000
1000
done with the parse_trees
Done with candidate  2
1000
1000
done with the parse_trees
Done with candidate  3
1000


KeyboardInterrupt: 