# Imports

In [None]:
!pip install sentence-transformers
!pip install datasets

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pathlib import Path
import random
import os
import torch
import pandas as pd
import json
import gzip

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files stored there (the working directory is moved onto the drive below).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the "External Data" folder on the mounted drive: the relative paths
# used later ("encoded_sentences", "preprocessed_data", the Simple Wiki dump)
# are all resolved against this directory.
%cd "/content/drive/My Drive/CommonLit/External Data"

/content/drive/My Drive/CommonLit/External Data


# Useful functions

In [None]:
# This code attempts to reproduce the approach used by the winner of the competition, with some improvements of our own.

In [None]:
def encode_sentences(sentences, file_name, model_name='paraphrase-TinyBERT-L6-v2', dir_name="encoded_sentences"):

  """
  Embed a list of sentences with SentenceTransformer and save the result to disk.

  All sentences are encoded in a single ``model.encode`` call with
  ``convert_to_tensor=True`` and the result is written with ``torch.save`` to
  ``<dir_name>/<file_name>.pt``. The output folder is created if it does not exist.

  Args:
      sentences (list): list of sentences we want to get the embedding from
      file_name (str): base name of the output file, without the '.pt' extension
      model_name (str): name of the model passed to SentenceTransformer
      dir_name (str): folder the embeddings are saved in. Defaults to
          "encoded_sentences", the folder this function previously always used.

  Returns:
      None
  """

  # Build the output folder once and reuse it for the file path, so the
  # directory that is created and the directory that is written to cannot drift apart.
  output_dir = Path(dir_name)
  # creates a new directory if it does not exist
  output_dir.mkdir(parents = True, exist_ok = True)

  model = SentenceTransformer(model_name)
  # get the embedding of all sentences
  encoded_sentences = model.encode(sentences, convert_to_tensor=True)

  full_path = os.path.join(output_dir, file_name + '.pt')

  # save the file
  with open(full_path, 'wb') as f:
    torch.save(encoded_sentences, f)


In [None]:
def save_data(data, file_name, dir_name = "preprocessed_data"):
  """
  Write a dataframe of sentences to a CSV file inside a folder.

  The folder (including any missing parents) is created when it does not
  exist yet, then the dataframe is written to ``<dir_name>/<file_name>``
  with the default ``to_csv`` settings.

  Args:
      data (dataframe): dataframe holding the sentences to save
      file_name (str): name of the CSV file, extension included
      dir_name (str): name of the destination folder

  """
  target_dir = Path(dir_name)
  # make sure the destination folder is there before writing into it
  target_dir.mkdir(parents = True, exist_ok = True)
  # write the dataframe out in csv format
  csv_path = os.path.join(target_dir, file_name)
  data.to_csv(csv_path)

In [None]:
def get_simple_wiki(simplewiki_path='simplewiki-2020-11-01.jsonl.gz'):

    """
    Read a gzip-compressed JSON Lines file and collect all of its paragraphs.

    Every non-blank line is parsed as a JSON object, and the items of its
    'paragraphs' entry are appended, in file order, to one flat list.

    Args:
      simplewiki_path (str): path to the .jsonl.gz file. Defaults to
        'simplewiki-2020-11-01.jsonl.gz', resolved against the current
        working directory.

    Returns:
      List of paragraphs

    Raises:
      KeyError: if a parsed line has no 'paragraphs' key
    """
    passages = []
    with gzip.open(simplewiki_path, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            record = line.strip()
            # A blank line (e.g. a trailing newline at the end of the file)
            # is not valid JSON, so skip it instead of crashing in json.loads.
            if not record:
                continue
            passages.extend(json.loads(record)['paragraphs'])
    return passages

# Preparing Wikipedia Data

In [None]:
# Fetch the WikiText-103 training split from Hugging Face and keep only the
# entries whose text is strictly between 700 and 1100 characters long, to
# reproduce the length of the excerpts from the training set.
wiki_dataset = load_dataset('wikitext', 'wikitext-103-v1', split = "train").filter(
    lambda row: 700 < len(row["text"]) < 1100
)
# Dataframe view of the filtered split, plus the plain list of texts to embed.
wiki_df = wiki_dataset.to_pandas()
wiki_sentences = wiki_df["text"].tolist()
# Persist both the raw text (CSV) and its embeddings (.pt).
save_data(wiki_df, "wikipedia.csv")
encode_sentences(wiki_sentences, file_name = "wikipedia")

# Preparing Simple Wiki Data

In [None]:
# Read every paragraph of the Simple Wikipedia dump.
simplewiki_dataset = get_simple_wiki()
# Keep the paragraphs strictly between 700 and 1100 characters long.
simplewiki_filtered = [
    paragraph for paragraph in simplewiki_dataset
    if 700 < len(paragraph) < 1100
]
# NOTE(review): built from a plain list, so the single column is named 0 here,
# whereas the CBT cell renames its column to "text" - confirm which one the
# downstream code expects before changing it.
simplewiki_df = pd.DataFrame(simplewiki_filtered)
# Persist both the raw text (CSV) and its embeddings (.pt).
save_data(simplewiki_df, "simple_wikipedia.csv")
encode_sentences(simplewiki_filtered, file_name = "simple_wikipedia")

# Preparing OneStop English Corpus

In [None]:
# Fetch the OneStop English corpus from Hugging Face and keep its 'train' split
# (the only split used here).
onestop_data = load_dataset('onestop_english')['train']
# Dataframe view of the split.
onestop_df = onestop_data.to_pandas()
# Despite the name, no length filter is applied: every text is kept.
onestop_filtered = onestop_df["text"].tolist()
# Persist both the dataframe (CSV) and the text embeddings (.pt).
save_data(onestop_df, "onestop.csv")
encode_sentences(onestop_filtered, file_name = "onestop")

# Preparing CBT data

In [None]:
# Fetch the "CN" configuration of the CBT dataset from Hugging Face.
dataset = load_dataset("cbt", "CN")
# Dataframe view of the training split.
df = dataset["train"].to_pandas()
# Each row holds a sequence of sentences: glue them into one passage.
df["sentences"] = df["sentences"].map(" ".join)
# Drop repeated passages, then cut every passage down to its first 1100 characters.
df = df.drop_duplicates(subset = "sentences")
df["sentences"] = df["sentences"].str[:1100]
# Keep passages longer than 700 characters. The upper bound of 1200 is always
# satisfied here because of the truncation to 1100 characters just above.
cbt_filtered = [
    passage for passage in df["sentences"].tolist()
    if 700 < len(passage) < 1200
]
# Single-column dataframe of the kept passages, with the column named "text".
cbt_dataset = pd.DataFrame(cbt_filtered)
cbt_dataset.columns = ["text"]
# Persist both the raw text (CSV) and its embeddings (.pt).
save_data(cbt_dataset, "cbt.csv")
encode_sentences(cbt_filtered, file_name = "cbt")