In [1]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_dataset_path = "../bert_double/data/tos_clauses_train.csv"
test_dataset_path = "../bert_double/data/tos_clauses_dev.csv"

In [3]:
train_df = pd.read_csv(train_dataset_path, header=0)
test_df = pd.read_csv(test_dataset_path, header=0)

In [4]:
train_df.head()

Unnamed: 0,label,sentences
0,0,content license and intellectual property rights
1,0,reactivated skype credit is not refundable .
2,1,spotify may change the price for the paid subs...
3,0,the term of your licenses under this eula shal...
4,0,the arbitrator may award declaratory or injunc...


In [5]:
import contractions
import unicodedata
def to_lower(data: pd.Series):
    return data.str.lower()

def remove_accented_characters(data: pd.Series):
    return data.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8", "ignore"))

def remove_html_encodings(data: pd.Series):
    return data.str.replace(r"\d+;", " ", regex=True)

def remove_html_tags(data: pd.Series):
    return data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

def remove_url(data: pd.Series):
    return data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

def remove_html_and_url(data: pd.Series):
    data.str.replace(r"\d+;", " ", regex=True)
    data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)
    data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)
    return data

def remove_extra_spaces(data: pd.Series):
    return data.str.replace(r"^\s*|\s\s*", " ", regex=True)
                     
def remove_non_alpha_characters(data: pd.Series):
    return data.str.replace(r"_+|\\|[^a-zA-Z0-9\s]", " ", regex=True)

def fix_contractions(data: pd.Series):
    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])
    return data.apply(contraction_fixer)

def remove_special_words(data: pd.Series):
    return data.str.replace(r"\-[^a-zA-Z]{3}\-", " ", regex=True)

def cleaning(df):
    data_cleaning_pipeline = {
        "sentences": [
            to_lower,
            remove_special_words,
            remove_accented_characters,
            remove_html_encodings,
            remove_html_tags,
            remove_url,
            fix_contractions,
            remove_non_alpha_characters,
            remove_extra_spaces,
        ]
    }

    cleaned_data = df.copy()

    # Process all the cleaning instructions
    for col, pipeline in data_cleaning_pipeline.items():
      # Get the column to perform cleaning on
      temp_data = cleaned_data[col].copy()

      # Perform all the cleaning functions sequencially
      for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

      # Replace the old column with cleaned one.
      cleaned_data[col] = temp_data.copy()

    return cleaned_data

In [6]:
train_df = cleaning(train_df)
test_df = cleaning(test_df)

Starting: to_lower
Ended: to_lower
Starting: remove_special_words
Ended: remove_special_words
Starting: remove_accented_characters
Ended: remove_accented_characters
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces
Starting: to_lower
Ended: to_lower
Starting: remove_special_words
Ended: remove_special_words
Starting: remove_accented_characters
Ended: remove_accented_characters
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [7]:
train_df.head()

Unnamed: 0,label,sentences
0,0,content license and intellectual property rights
1,0,reactivated skype credit is not refundable
2,1,spotify may change the price for the paid sub...
3,0,the term of your licenses under this eula sha...
4,0,the arbitrator may award declaratory or injun...


In [8]:
test_df.head()

Unnamed: 0,label,sentences
0,0,uber reserves the right to withhold or deduct...
1,0,niantic s failure to enforce any right or pro...
2,0,14 3 if you feel that any member you interact...
3,0,blizzard entertainment has the right to obtai...
4,0,myfitnesspal does not lrb i rrb guarantee the...


In [9]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [10]:
import numpy as np
def get_embeddings(sentences):
    vectors = []
    for sentence in sentences:
        clause_vector = []
        for word in sentence.split(' '):
            try:
                clause_vector.append(wv[word])
            except KeyError:
                continue
        if len(clause_vector) > 0:
            vectors.append(np.mean(clause_vector, axis=0))
    return vectors

In [11]:
embeddings_train = get_embeddings(train_df['sentences'].tolist())
embeddings_test = get_embeddings(test_df['sentences'].tolist())

In [12]:
len(embeddings_train), len(embeddings_test)

(7531, 1883)

In [13]:
import pickle

def save_embeddings(file_path, embeddings):
    with open(file_path, 'wb') as f:
        pickle.dump(embeddings, f)

In [14]:
save_embeddings('word2vec_train.pkl', embeddings_train)
save_embeddings('word2vec_test.pkl', embeddings_test)