In [2]:
## Imports
import pandas as pd
import numpy as np
import time
import re
import nltk
import matplotlib as plt

import spacy
from collections import Counter

np.random.seed(3000)

print("Imports completed")


Imports completed


In [2]:
LANGUAGE = 'en'
SOURCE_DATASET = 'ew-sew'

OUTPUT_DATASET_SIZE = 5000
NUM_OF_CANDIDATE_SETS = 10

SHORT_OUTPUT_DATASET_SIZE = 200

filter_out_problematic_instances = True
clean_up = True
measure_syntactic_diversity = True
measure_lexical_diversity = True


execution_start_time = time.time()
execution_start_time_string = time.strftime("%m_%d_%Hh")

output_file_name = "data_subset_" + LANGUAGE + "_" + str(OUTPUT_DATASET_SIZE) + "_from_" + SOURCE_DATASET + "_" + execution_start_time_string
print("Output file name is going to be: ", output_file_name)

Output file name is going to be:  data_subset_en_5000_from_ew-sew_08_24_18h


In [3]:
## Get the aligned sentences

original_sentences = '../data_orig/en/normal.aligned'
simple_sentences = '../data_orig/en/simple.aligned'

In [4]:
# Read the first TSV file
original_df = pd.read_csv(original_sentences, sep='\t', header=None, names=['unimportant_string_1', 'unimportant_number_1', 'original_sentence'],  encoding='latin1')

# Read the second TSV file
simple_df = pd.read_csv(simple_sentences, sep='\t', header=None, names=['unimportant_string_2', 'unimportant_number_2', 'simple_sentence'],  encoding='latin1')

# Extract the relevant columns
original_column = original_df[['original_sentence']]
simple_column = simple_df[['simple_sentence']]

# Combine the DataFrames into one
result_df = pd.concat([original_column, simple_column], axis=1)

df = result_df
df

Unnamed: 0,original_sentence,simple_sentence
0,It is the county seat of Alfalfa County .,It is the county seat of Alfalfa County .
1,"Cherokee is a city in Alfalfa County , Oklahom...",Cherokee is a city of Oklahoma in the United S...
2,Skateboard decks are usually between 28 and 33...,Skateboard decks are normally between 28 and 3...
3,The underside of the deck can be printed with ...,The bottom of the deck can be printed with a d...
4,This was created by two surfers ; Ben Whatson ...,The longboard was made by two surfers ; Ben Wh...
...,...,...
167684,Caffiers is a commune in the Pas-de-Calais dep...,Caffiers is a commune . It is found in the reg...
167685,The population was 549 at the 2000 census .,549 people were living in Orange as of 2000 .
167686,"Orange is a town in Juneau County , Wisconsin ...",Orange is a town of Juneau County in the state...
167687,Orainville is a commune in the Aisne departmen...,Orainville is a commune . It is found in the r...


In [5]:
if filter_out_problematic_instances:

    def contains_many_special_chars(text, threshold_base=3, threshold_increment=0.1):
        threshold = threshold_base + threshold_increment * len(text)
        special_chars = len(re.findall(r'[^a-zA-Z0-9\s]', text))
        return special_chars > threshold
            
    def contains_many_digits(text, threshold_base=8, threshold_increment=0.1):
        threshold = threshold_base + threshold_increment * len(text)
        digits = len(re.findall(r'\d', text))
        return digits > threshold
    
    def contains_artifacts(text):
        # Define a regex pattern to match common artifact patterns
        # This pattern checks for unusual characters or sequences that are likely artifacts
        artifact_pattern = re.compile(r'[^\x20-\x7E]')  # Matches non-printable or unusual ASCII characters
        return bool(artifact_pattern.search(text))
    
    filtered_indices = []
    counter = 0
    sents_w_artifacts = 0
    sents_w_many_spec_chars = 0
    sents_w_many_digits = 0
    
    
    # Iterate over the DataFrame and find rows with artifacts
    for index, row in result_df.iterrows():
        sentence = row['original_sentence']
        counter += 1
        if contains_artifacts(sentence):
            sents_w_artifacts += 1
            filtered_indices.append(index)
            if counter % 1500 == 0:
                print("|| Artifact in sentence " + str(index) + " ||")
                print(sentence)

        elif contains_many_digits(sentence):
            sents_w_many_digits += 1
            filtered_indices.append(index)
            if counter % 50 == 0:
                print("|| Too many digits in setence " + str(index) + " ||")
                print(sentence)
            
        else:
            if contains_many_special_chars(sentence):
                sents_w_many_spec_chars += 1
                filtered_indices.append(index)
                if counter % 100 == 0:
                    print("|| Too many special characters in setence " + str(index) + " ||")
                    print(sentence)         
            
            
    
    # Create a new DataFrame excluding the rows with artifacts
    clean_df = result_df.drop(index=filtered_indices)
    df = clean_df
    
    print("Filtering complete")
    print("Filtered out ", sents_w_artifacts, " sentences containing artifacts.")
    print("Then filtered out ", sents_w_many_digits, " sentences containing many digits.")
    print("Then filtered out ", sents_w_many_spec_chars, " sentences containing too many special characters.")



|| Artifact in sentence 2999 ||
Diamond meets Riley at Iron Island , and Byron asks Riley to help Diamond sharpen his PokÃ mon battle skills before Diamond goes to Lake Verity .
|| Too many digits in setence 4399 ||
In the county the population was spread out with 21.70 % under the age of 18 , 6.80 % from 18 to 24 , 29.40 % from 25 to 44 , 26.00 % from 45 to 64 , and 16.20 % who were 65 years of age or older .
|| Too many special characters in setence 7799 ||
There are small Mandaean diaspora populations in Sweden -LRB- c. 5,000 -RRB- , Australia -LRB- c. 3,500 as of 2006 -RRB- , the USA -LRB- c. 1,500 -RRB- , the UK -LRB- c. 1,000 -RRB- , and Canada .
|| Too many special characters in setence 8399 ||
The Tengwar -LRB- Quenya for `` letters '' ; singular tengwa `` letter '' -RRB- are an artificial script created by J. R. R. Tolkien .
|| Too many special characters in setence 28199 ||
Cobalt -LRB- II -RRB- oxide -LRB- CoO -RRB- and cobalt -LRB- II -RRB- fluoride -LRB- CoF2 -RRB- can al

In [6]:
print(len(result_df))
print(len(clean_df))
print(len(result_df)-len(clean_df))
print(len(clean_df)/len(result_df))

167689
148072
19617
0.8830155824174514


In [7]:
if clean_up:

    def replace_LRB_RRB(text):
        # Replace "-LRB-" with "(" and "-RRB-" with ")"
        text = text.replace('-LRB- ', '(')
        text = text.replace(' -RRB-', ')')
        return text

    def replace_wrong_quotation_marks(text):
        # Replace patterns like `` word(s) '' with "word(s)"
        return re.sub(r'``\s*(.*?)\s*\'\'', r'"\1"', text)
    
    def remove_spaces_before_punctuation(text):
        # Regex pattern to match a space between a word-like string and a punctuation mark
        return re.sub(r'(\w)\s+([.,!?;:\'])', r'\1\2', text)


    df['original_sentence'] = df['original_sentence'].apply(replace_LRB_RRB)
    df['original_sentence'] = df['original_sentence'].apply(replace_wrong_quotation_marks)
    df['original_sentence'] = df['original_sentence'].apply(remove_spaces_before_punctuation)
    
    print("Clean up performed.")


Clean up performed.


In [8]:
counter = 0
for sentence in df['original_sentence']:
    counter += 1
    if counter % 5000 == 0:
        print(sentence)

Avatar: The Last Airbender (known as Avatar: The Legend of Aang in Europe) is a video game based on the animated television series of the same name for Game Boy Advance, Microsoft Windows, Nintendo GameCube, Nintendo DS, PlayStation 2, PlayStation Portable, Wii, and XBox.
He has played for Japan national team.
The first journey took 12 minutes northbound and 10 minutes to return, even allowing for the horse-drawn vehicles also using the roads on the overground part of the route.
It is situated on the Piazza di Santa Croce, about 800 metres south east of the Duomo.
Its natural habitat is subtropical or tropical moist lowland forests.
Edmund accompanies everyone, except Susan, into Aslan's country.
After Monroe's death, DiMaggio claimed her body and arranged her funeral.
This genotype-phenotype distinction was proposed by Wilhelm Johannsen in 1911 to make clear the difference between an organism's heredity and what that heredity produces.
In 1950, General Toftoy brought German rocket sci

In [9]:
def get_dependency_tree(doc):
    # Get the dependency structure in terms of (head, relation, dependent)
    return [(token.head.dep_, token.dep_, token.dep_) for token in doc if token.head != token]

def calculate_tree_diversity(parse_trees):
    # Flatten the list of trees and count unique structures
    flattened_trees = [tuple(tree) for trees in parse_trees for tree in trees]
    tree_counter = Counter(flattened_trees)
    
    # Diversity score: higher score means more unique structures
    diversity_score = len(tree_counter) / sum(tree_counter.values())
    
    return diversity_score

In [10]:
# Load the English model
nlp = spacy.load('en_core_web_trf')

candidate_sets = [df["original_sentence"].sample(n=OUTPUT_DATASET_SIZE, replace=False) for _ in range(NUM_OF_CANDIDATE_SETS)]

sets_w_scores = []

start_time_all = time.time()

for idx, candidate_set in enumerate(candidate_sets):

    start_time = time.time()

    print("Assessing set ", idx+1, "/", NUM_OF_CANDIDATE_SETS, " of len ", OUTPUT_DATASET_SIZE)

    # List of sentences
    sentences = candidate_set

    print(type(sentences))

    # Parse the sentences
    docs = [nlp(sentence) for sentence in sentences]

    print(len(docs))

    tree_diversity_score = None
    std_clause_density_avg = None
    std_token_prob_avg = None



    if measure_syntactic_diversity:
        # Extract parse trees for all sentences
        parse_trees = [get_dependency_tree(doc) for doc in docs]
    
        elapsed_time_trees = time.time() - start_time
        print("Done with the parse_trees after ", time.strftime("%H:%M:%S", time.gmtime(elapsed_time_trees)) + f".{int((elapsed_time_trees % 1) * 1000):03d}")
    
        # Calculate syntactic diversity score
        
        tree_diversity_score = calculate_tree_diversity(parse_trees)

    
        clause_density_avg_list = []
        for doc in docs:
            num_tokens = len([token for token in doc if token.is_alpha])
            num_clauses = sum(1 for token in doc if token.dep_ in ('ROOT', 'csubj', 'csubjpass', 'advcl', 'relcl', 'xcomp', 'ccomp'))
            clause_density = num_clauses / num_tokens if num_tokens else 0
            clause_density_avg_list.append(clause_density)
        
        std_clause_density_avg = np.std(clause_density_avg_list)

    if measure_lexical_diversity:
        token_prob_avg_list = []
        for doc in docs:
            # Calculate lexical diversity
            token_freqs = [token.prob for token in doc if token.is_alpha]  # Use token.prob as a proxy for frequency score
            token_prob_avg = np.mean(token_freqs) if token_freqs else 0
            token_prob_avg_list.append(token_prob_avg)
        
        std_token_prob_avg = np.std(token_prob_avg_list)
        
    
    sets_w_scores.append((idx, tree_diversity_score, std_clause_density_avg, std_token_prob_avg))
    
    elapsed_time_set = time.time() - start_time
    print("Finished set ",idx, "after ", time.strftime("%H:%M:%S", time.gmtime(elapsed_time_set)) + f".{int((elapsed_time_set % 1) * 1000):03d}")

elapsed_time_all = time.time() - start_time_all
print("Finished all after ", time.strftime("%H:%M:%S", time.gmtime(elapsed_time_all)) + f".{int((elapsed_time_all % 1) * 1000):03d}")
print(sets_w_scores)

  model.load_state_dict(torch.load(filelike, map_location=device))


Assessing set  1 / 10  of len  5000
<class 'pandas.core.series.Series'>


  with torch.cuda.amp.autocast(self._mixed_precision):


5000
Done with the parse_trees after  00:05:09.497
Finished set  0 after  00:05:09.835
Assessing set  2 / 10  of len  5000
<class 'pandas.core.series.Series'>
5000
Done with the parse_trees after  00:05:05.195
Finished set  1 after  00:05:05.552
Assessing set  3 / 10  of len  5000
<class 'pandas.core.series.Series'>
5000
Done with the parse_trees after  00:05:06.024
Finished set  2 after  00:05:06.365
Assessing set  4 / 10  of len  5000
<class 'pandas.core.series.Series'>
5000
Done with the parse_trees after  00:05:03.510
Finished set  3 after  00:05:03.851
Assessing set  5 / 10  of len  5000
<class 'pandas.core.series.Series'>
5000
Done with the parse_trees after  00:05:06.340
Finished set  4 after  00:05:06.688
Assessing set  6 / 10  of len  5000
<class 'pandas.core.series.Series'>
5000
Done with the parse_trees after  00:05:08.410
Finished set  5 after  00:05:08.761
Assessing set  7 / 10  of len  5000
<class 'pandas.core.series.Series'>
5000
Done with the parse_trees after  00:05:06

In [11]:
# Weights
weights = [0.3, 0.3, 0.4]

# Function to calculate the weighted sum for a tuple
def weighted_sum(tup):
    return weights[0] * tup[1] + weights[1] * tup[2] + weights[2] * tup[3]

# Find the tuple with the maximum weighted sum
best_tuple = max(sets_w_scores, key=weighted_sum)

print(sets_w_scores)
print(best_tuple)

[(0, 0.0055057146368440455, 0.0511210378525873, 0.2828144267890166), (1, 0.005532961708260513, 0.051834312320686166, 0.0), (2, 0.0057512339087369985, 0.05106544069903479, 0.0), (3, 0.005712758493969866, 0.054402878822244996, 0.0), (4, 0.005690493871454914, 0.0512733907743618, 0.0), (5, 0.005548949663099485, 0.04929191436064988, 0.0), (6, 0.005772947466178058, 0.05087510062305802, 0.0), (7, 0.0055717916663217066, 0.051599956712829874, 0.0), (8, 0.0056586084510814975, 0.05079290043214167, 0.0), (9, 0.00557577763070814, 0.051107319073373965, 0.0)]
(0, 0.0055057146368440455, 0.0511210378525873, 0.2828144267890166)


In [12]:
best_set_of_sentences = candidate_sets[best_tuple[0]]
print(best_set_of_sentences)

57224     Toxicofera (Greek for "those who bear toxins")...
117533    The islands of the Caribbean Sea, collectively...
3709      Some websites do not allow typographic quotati...
50928     Ecological yield is the harvestable population...
126524    When he was 16-years-old, Davey's mother left ...
                                ...                        
103304    The Act of Parliament which founded board scho...
100714    The report discusses the effect of global warm...
65057     North American herbivorous dinosaurs from this...
121457           The winning bid will be announced in 2015.
117432    In the United Kingdom, white Christmases were ...
Name: original_sentence, Length: 5000, dtype: object


In [None]:
output_file_name_w_indices = output_file_name+"(incl indices)"

In [13]:
with open(output_file_name, "w") as file:
    counter = 0
    for index, sentence in best_set_of_sentences.items():
        counter += 1
        file.write(f"{index}|{sentence}\n\n")
        if counter % 50 == 0:
            print(f"{sentence}\n\n")

"""
with open(output_file_name_w_indices, "w") as file:
    counter = 0
    for index, sentence in best_set_of_sentences.items():
        counter += 1
        file.write(f"{index}|{sentence}\n\n")
        if counter % 50 == 0:
            print(f"{index}|{sentence}\n\n")
            """
        

52638|The season finale including the announcement of the winner and a live reunion of all the season's contestants at the Ed Sullivan Theater in New York City, on David Letterman's Late Show stage which was decorated to look like the set in the Amazon.


75303|The song peaked at number six on the Billboard alternative rock chart, making it the band's second top 10 single (the first being previous single "Prayer of the Refugee") and highest charting single from the album.


19986|Tilly-sur-Seulles is a commune in the Calvados department in the Basse-Normandie region in northwestern France.


19509|Troy Island Mell, of IGN, felt that the story "would (not) be anywhere near as good as it is without its ability to create such strong characters" .


85224|A very small portion of the historic Oakwood Cemetery, added to the National Register of Historic Places in 1984, and burial place of Samuel Wilson, a possible namesake of Uncle Sam, resides within the northwestern part of the town.


641

In [14]:
short_output_file_name = "data_subset_" + LANGUAGE + "_" + str(SHORT_OUTPUT_DATASET_SIZE) + "_from_" + SOURCE_DATASET + "_" + execution_start_time_string

short_sample = best_set_of_sentences.sample(n=SHORT_OUTPUT_DATASET_SIZE, replace=False)

with open(short_output_file_name, "w") as file:
    counter = 0
    for index, sentence in short_sample.items():
        counter += 1
        file.write(f"{index}|{sentence}\n\n")
        if counter % 100 == 0:
            print(f"{index}|{sentence}\n\n")


125103|Dronjak had already composed the song "Steel Meets Steel" , which was later included on HammerFall's debut album.


128570|Six years after they closed the doors of their final iteration restaurateur Oscar Tucci opened a revived Delmonico's at 2 South William Street, which stayed in business until 1977.




In [15]:
execution_elapsed_time = time.time() - execution_start_time
print("Code execution completed after ", time.strftime("%H:%M:%S", time.gmtime(execution_elapsed_time)) + f".{int((execution_elapsed_time % 1) * 1000):03d}")


Code execution completed after  00:51:30.860
