### Basic SVM with Train and Test split (80% - 20%)

In [1]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('omw-1.4', quiet=True)
import pandas as pd
import numpy as np
import re
import string
import os
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GridSearchCV, cross_val_score
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

In [2]:
files = os.listdir("./data/Sentences/")
prefix_sentence = "./data/Sentences/"
prefix_label = "./data/Labels/"

# -1 --> 0 and 1 --> 1
clauses = []
for file in files:
    sentence_file_path = prefix_sentence + file 
    label_file_path = prefix_label + file
    sentences_df = pd.read_csv(sentence_file_path, sep="dummy_separator", header=None)
    sentences_df.columns = ["sentences"]
    label_df = pd.read_csv(label_file_path, sep=" ", header=None)
    label_df.columns = ["label"]
    label_df["label_converted"] = np.where(label_df["label"] == -1, 0, 1)
    sentences_df["document"] = file
    df_concat = pd.concat([label_df["label_converted"], sentences_df], axis=1)
    clauses.append(df_concat)

In [3]:
colnames = ["sentences", "label_converted", "document"]
clauses_df = pd.DataFrame(columns = colnames)
for df in clauses:
    clauses_df = clauses_df.append(df)

In [4]:
clauses_df.rename(columns={'label_converted': 'label', 'sentences': 'sentences', 'document' : 'document'}, inplace=True)

In [5]:
clauses_df

Unnamed: 0,sentences,label,document
0,thanks for sending us good vibes by using the ...,0,Viber.txt
1,"you may be surprised , but we will refer to al...",0,Viber.txt
2,"the terms of use -lrb- or , the `` terms '' -r...",0,Viber.txt
3,the language of the terms will seem legal -lrb...,0,Viber.txt
4,"when you use our services , in addition to enj...",1,Viber.txt
...,...,...,...
142,the failure of onavo to enforce any right or p...,0,Onavo.txt
143,the section headings in the agreement are incl...,0,Onavo.txt
144,"`` including '' , whether capitalized or not ,...",0,Onavo.txt
145,this agreement may not be assigned by you with...,0,Onavo.txt


In [6]:
assert (clauses_df.isnull().sum().all() == 0)

In [7]:
clauses_df.document.unique()

array(['Viber.txt', 'Nintendo.txt', 'Tinder.txt', 'Dropbox.txt',
       'Microsoft.txt', 'Betterpoints_UK.txt', 'Airbnb.txt',
       'musically.txt', 'Crowdtangle.txt', 'TripAdvisor.txt',
       'Deliveroo.txt', 'Moves-app.txt', 'Spotify.txt', 'Supercell.txt',
       '9gag.txt', 'Booking.txt', 'Headspace.txt', 'Fitbit.txt',
       'Syncme.txt', 'Vimeo.txt', 'Oculus.txt', 'Endomondo.txt',
       'Instagram.txt', 'LindenLab.txt', 'WorldOfWarcraft.txt',
       'YouTube.txt', 'Academia.txt', 'Yahoo.txt', 'WhatsApp.txt',
       'Google.txt', 'Zynga.txt', 'Facebook.txt', 'Amazon.txt',
       'Vivino.txt', 'Netflix.txt', 'PokemonGo.txt', 'Skype.txt',
       'Snap.txt', 'eBay.txt', 'Masquerade.txt', 'Twitter.txt',
       'LinkedIn.txt', 'Skyscanner.txt', 'Duolingo.txt', 'TrueCaller.txt',
       'Uber.txt', 'Rovio.txt', 'Atlas.txt', 'Evernote.txt', 'Onavo.txt'],
      dtype=object)

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(
    clauses_df['sentences'],
    clauses_df['label'],
    train_size=0.8, 
    test_size=0.2, 
    random_state=0, 
    shuffle=True,
    stratify=clauses_df['label'])
print("Train: ",X_train.shape, Y_train.shape,
      "Test: ",(X_test.shape, Y_test.shape))

# TF - IDF extraction
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

Train:  (7531,) (7531,) Test:  ((1883,), (1883,))


In [9]:
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [11]:
svm = LinearSVC(C=0.5, random_state=0).fit(X_train, Y_train)
Y_test_pred = svm.predict(X_test)
report = classification_report(Y_test, Y_test_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1677
           1       0.88      0.62      0.72       206

    accuracy                           0.95      1883
   macro avg       0.92      0.80      0.85      1883
weighted avg       0.95      0.95      0.94      1883

