#### SVM with Leave One Document Out Cross Validation

In [1]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('omw-1.4', quiet=True)
import pandas as pd
import numpy as np
import re
import string
import os
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GridSearchCV, cross_val_score
import nltk
from nltk.corpus import stopwords
import warnings
import contractions
import unicodedata
warnings.filterwarnings("ignore")

In [2]:
files = os.listdir("./data/Sentences/")
prefix_sentence = "./data/Sentences/"
prefix_label = "./data/Labels/"

clauses = []
for file in files:
    sentence_file_path = prefix_sentence + file 
    label_file_path = prefix_label + file
    sentences_df = pd.read_csv(sentence_file_path, sep="dummy_separator", header=None)
    sentences_df.columns = ["sentences"]
    label_df = pd.read_csv(label_file_path, sep=" ", header=None)
    label_df.columns = ["label"]
    label_df["label_converted"] = np.where(label_df["label"] == -1, 0, 1)
    sentences_df["document"] = file
    df_concat = pd.concat([label_df["label_converted"], sentences_df], axis=1)
    clauses.append(df_concat)

In [3]:
colnames = ["sentences", "label_converted", "document"]
clauses_df = pd.DataFrame(columns = colnames)
for df in clauses:
    clauses_df = clauses_df.append(df)

In [4]:
clauses_df.rename(columns={'label_converted': 'label', 'sentences': 'sentences', 'document' : 'document'}, inplace=True)

In [5]:
clauses_df

Unnamed: 0,sentences,label,document
0,thanks for sending us good vibes by using the ...,0,Viber.txt
1,"you may be surprised , but we will refer to al...",0,Viber.txt
2,"the terms of use -lrb- or , the `` terms '' -r...",0,Viber.txt
3,the language of the terms will seem legal -lrb...,0,Viber.txt
4,"when you use our services , in addition to enj...",1,Viber.txt
...,...,...,...
142,the failure of onavo to enforce any right or p...,0,Onavo.txt
143,the section headings in the agreement are incl...,0,Onavo.txt
144,"`` including '' , whether capitalized or not ,...",0,Onavo.txt
145,this agreement may not be assigned by you with...,0,Onavo.txt


In [6]:
assert (clauses_df.isnull().sum().all() == 0)

In [7]:
def to_lower(data: pd.Series):
    return data.str.lower()

def remove_accented_characters(data: pd.Series):
    return data.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8", "ignore"))

def remove_html_encodings(data: pd.Series):
    return data.str.replace(r"\d+;", " ", regex=True)

def remove_html_tags(data: pd.Series):
    return data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

def remove_url(data: pd.Series):
    return data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

def remove_html_and_url(data: pd.Series):
    data.str.replace(r"\d+;", " ", regex=True)
    data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)
    data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)
    return data

def remove_extra_spaces(data: pd.Series):
    return data.str.replace(r"^\s*|\s\s*", " ", regex=True)
                     
def remove_non_alpha_characters(data: pd.Series):
    return data.str.replace(r"_+|\\|[^a-zA-Z0-9\s]", " ", regex=True)

def fix_contractions(data: pd.Series):
    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])
    return data.apply(contraction_fixer)

def remove_special_words(data: pd.Series):
    return data.str.replace(r"\-[^a-zA-Z]{3}\-", " ", regex=True)
                     
data_cleaning_pipeline = {
    "sentences": [
        to_lower,
        remove_special_words,
        remove_accented_characters,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = clauses_df.copy()

# for col, pipeline in data_cleaning_pipeline.items():
#     temp_data = cleaned_data[col].copy()
#     for func in pipeline:
#         print(f"Starting: {func.__name__}")
#         temp_data = func(temp_data)
#         print(f"Ended: {func.__name__}")
#     cleaned_data[col] = temp_data.copy()

In [8]:
cleaned_data

Unnamed: 0,sentences,label,document
0,thanks for sending us good vibes by using the ...,0,Viber.txt
1,"you may be surprised , but we will refer to al...",0,Viber.txt
2,"the terms of use -lrb- or , the `` terms '' -r...",0,Viber.txt
3,the language of the terms will seem legal -lrb...,0,Viber.txt
4,"when you use our services , in addition to enj...",1,Viber.txt
...,...,...,...
142,the failure of onavo to enforce any right or p...,0,Onavo.txt
143,the section headings in the agreement are incl...,0,Onavo.txt
144,"`` including '' , whether capitalized or not ,...",0,Onavo.txt
145,this agreement may not be assigned by you with...,0,Onavo.txt


In [9]:
cleaned_data.document.unique()

array(['Viber.txt', 'Nintendo.txt', 'Tinder.txt', 'Dropbox.txt',
       'Microsoft.txt', 'Betterpoints_UK.txt', 'Airbnb.txt',
       'musically.txt', 'Crowdtangle.txt', 'TripAdvisor.txt',
       'Deliveroo.txt', 'Moves-app.txt', 'Spotify.txt', 'Supercell.txt',
       '9gag.txt', 'Booking.txt', 'Headspace.txt', 'Fitbit.txt',
       'Syncme.txt', 'Vimeo.txt', 'Oculus.txt', 'Endomondo.txt',
       'Instagram.txt', 'LindenLab.txt', 'WorldOfWarcraft.txt',
       'YouTube.txt', 'Academia.txt', 'Yahoo.txt', 'WhatsApp.txt',
       'Google.txt', 'Zynga.txt', 'Facebook.txt', 'Amazon.txt',
       'Vivino.txt', 'Netflix.txt', 'PokemonGo.txt', 'Skype.txt',
       'Snap.txt', 'eBay.txt', 'Masquerade.txt', 'Twitter.txt',
       'LinkedIn.txt', 'Skyscanner.txt', 'Duolingo.txt', 'TrueCaller.txt',
       'Uber.txt', 'Rovio.txt', 'Atlas.txt', 'Evernote.txt', 'Onavo.txt'],
      dtype=object)

In [10]:
logo = LeaveOneGroupOut()
X = cleaned_data['sentences']
y = cleaned_data['label']
group = cleaned_data['document']
logo.get_n_splits(X, y, group)

50

In [11]:
train_val_test = []
for train_val_index, test_index in logo.split(X, y, group):
    train_val, test = cleaned_data.iloc[train_val_index], cleaned_data.iloc[test_index]
    train_val_test.append((train_val, test))

In [12]:
ngram_ranges = [(1,1), (1,2), (2,2), (1,3), (2,3), (3,3)]
scores_compare = {}
for nrange in ngram_ranges:
    scores = []
    for batch in train_val_test:
        X_train = batch[0]["sentences"]
        y_train = batch[0]["label"]
        train_groups = batch[0]["document"]
        X_test = batch[1]["sentences"]
        y_test = batch[1]["label"]
    
        test_document = batch[1].document.unique()[0]
        vectorizer = TfidfVectorizer(lowercase = True, ngram_range = nrange)
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        y_train = y_train.astype('int')
        y_test = y_test.astype('int')
    
        svm = LinearSVC(random_state=0, max_iter = 5000)
        Cs = [0.001, 0.01, 0.1, 1, 10]
        clf = GridSearchCV(estimator=svm, param_grid=dict(C=Cs),n_jobs=-1, scoring = 'f1', refit = True)
        clf.fit(X_train, y_train, groups = train_groups)
        score = clf.score(X_test, y_test)
        scores.append(score)
    average_test_f1_score = sum(scores)/len(scores)
    scores_compare[nrange] = average_test_f1_score

scores_compare

{(1, 1): 0.724900985175306,
 (1, 2): 0.7724462950124539,
 (2, 2): 0.7559621739777096,
 (1, 3): 0.7896037445430559,
 (2, 3): 0.7722617420803138,
 (3, 3): 0.7424302970541219}

In [13]:
print("Average F1 score from Leave one out document : " + str(max(scores_compare.values())))

Average F1 score from Leave one out document : 0.7896037445430559


In [14]:
print("N grams that gave maximum F1 score: " + str(max(scores_compare, key=scores_compare.get)))

N grams that gave maximum F1 score: (1, 3)
