#### Training 8 SVMs with Words and Pos Tags 

In [1]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('omw-1.4', quiet=True)
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import re
import string
import os
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GridSearchCV, cross_val_score
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

##### Use following to merge words in sentences with its respective pos tags

In [2]:
files = os.listdir("./data/Postags/")
prefix_sentences = "./data/Sentences/"
prefix_pos = "./data/Postags/"
prefix_label_A = "./data/Labels_A/"
prefix_label_CH = "./data/Labels_CH/"
prefix_label_CR = "./data/Labels_CR/"
prefix_label_J = "./data/Labels_J/"
prefix_label_LAW = "./data/Labels_LAW/"
prefix_label_LTD = "./data/Labels_LTD/"
prefix_label_TER = "./data/Labels_TER/"
prefix_label_USE = "./data/Labels_USE/"
prefix_label = "./data/Labels/"

def merge(sentence1, sentence2):
    sentence = ""
    s1splits = sentence1.split()
    s2splits = sentence2.split()
    for w1,w2 in zip(s1splits,s2splits):
        sentence = sentence + w1 + "/" + w2 + " "
    sentence.strip()
    return sentence
    
word_pos = []
for file in files:
    label_file_path_A = prefix_label_A + file
    label_file_path_CH = prefix_label_CH + file
    label_file_path_CR = prefix_label_CR + file
    label_file_path_J = prefix_label_J + file
    label_file_path_LAW = prefix_label_LAW + file
    label_file_path_LTD = prefix_label_LTD + file
    label_file_path_TER = prefix_label_TER + file
    label_file_path_USE = prefix_label_USE + file
    label_file_path = prefix_label + file
    sentences_file_path = prefix_sentences + file
    postag_file_path = prefix_pos + file
    pos_df = pd.read_csv(postag_file_path, sep="dummy_separator", header=None)
    pos_df.columns = ["postag"]
    sentences_df = pd.read_csv(sentences_file_path, sep="dummy_separator", header=None)
    sentences_df.columns = ["sentence"]
    sentences_df["postag"] = pos_df["postag"]
    sentences_df["merged"] = "dummy text"
    for index, row in sentences_df.iterrows():
        row["merged"] = merge(row['sentence'], row['postag'])
        
    label_A_df = pd.read_csv(label_file_path_A, sep=" ", header=None, names=["label_A"])
    label_A_df["label_A_converted"] = np.where(label_A_df["label_A"] == -1, 0, 1)

    label_CH_df = pd.read_csv(label_file_path_CH, sep=" ", header=None, names=["label_CH"])
    label_CH_df["label_CH_converted"] = np.where(label_CH_df["label_CH"] == -1, 0, 1)

    label_CR_df = pd.read_csv(label_file_path_CR, sep=" ", header=None, names=["label_CR"])
    label_CR_df["label_CR_converted"] = np.where(label_CR_df["label_CR"] == -1, 0, 1)

    label_J_df = pd.read_csv(label_file_path_J, sep=" ", header=None, names=["label_J"])
    label_J_df["label_J_converted"] = np.where(label_J_df["label_J"] == -1, 0, 1)

    label_LAW_df = pd.read_csv(label_file_path_LAW, sep=" ", header=None, names=["label_LAW"])
    label_LAW_df["label_LAW_converted"] = np.where(label_LAW_df["label_LAW"] == -1, 0, 1)

    label_LTD_df = pd.read_csv(label_file_path_LTD, sep=" ", header=None, names=["label_LTD"])
    label_LTD_df["label_LTD_converted"] = np.where(label_LTD_df["label_LTD"] == -1, 0, 1)

    label_TER_df = pd.read_csv(label_file_path_TER, sep=" ", header=None, names=["label_TER"])
    label_TER_df["label_TER_converted"] = np.where(label_TER_df["label_TER"] == -1, 0, 1)

    label_USE_df = pd.read_csv(label_file_path_USE, sep=" ", header=None, names=["label_USE"])
    label_USE_df["label_USE_converted"] = np.where(label_USE_df["label_USE"] == -1, 0, 1)

    label_df = pd.read_csv(label_file_path, sep=" ", header=None, names=["label"])
    label_df["label_converted"] = np.where(label_df["label"] == -1, 0, 1)

    sentences_df["document"] = file
    df_concat = pd.concat([label_df["label_converted"], label_A_df["label_A_converted"], label_CH_df["label_CH_converted"], label_CR_df["label_CR_converted"],
                           label_J_df["label_J_converted"], label_LAW_df["label_LAW_converted"], label_LTD_df["label_LTD_converted"],
                           label_TER_df["label_TER_converted"], label_USE_df["label_USE_converted"], sentences_df[["merged", "document"]]], axis=1)
    word_pos.append(df_concat)

In [3]:
colnames = ["label_converted", "label_A_converted", "label_CH_converted", "label_CR_converted", "label_J_converted", "label_LAW_converted", "label_LTD_converted", "label_TER_converted", "label_USE_converted", "merged", "document"]
clauses_df = pd.DataFrame(columns = colnames)
for df in word_pos:
    clauses_df = clauses_df.append(df)

In [4]:
clauses_df.rename(columns={'label_converted': 'label', 'label_A_converted': 'label_A', 'label_CH_converted': 'label_CH', 'label_CR_converted': 'label_CR', 
                           'label_J_converted': 'label_J', 'label_LAW_converted': 'label_LAW', 'label_LTD_converted': 'label_LTD', 
                           'label_TER_converted': 'label_TER', 'label_USE_converted': 'label_USE',
                           'merged': 'sentences', 'document' : 'document'}, inplace=True)
clauses_df.to_csv("data/svm8_word_pos_merged.csv", index = False)

In [5]:
clauses_df.head()

Unnamed: 0,label,label_A,label_CH,label_CR,label_J,label_LAW,label_LTD,label_TER,label_USE,sentences,document
0,0,0,0,0,0,0,0,0,0,thanks/NNS for/IN sending/VBG us/PRP good/JJ v...,Viber.txt
1,0,0,0,0,0,0,0,0,0,"you/PRP may/MD be/VB surprised/VBN ,/, but/CC ...",Viber.txt
2,0,0,0,0,0,0,0,0,0,the/DT terms/NNS of/IN use/NN -lrb-/-LRB- or/C...,Viber.txt
3,0,0,0,0,0,0,0,0,0,the/DT language/NN of/IN the/DT terms/NNS will...,Viber.txt
4,1,0,0,0,0,0,0,0,1,when/WRB you/PRP use/VBP our/PRP$ services/NNP...,Viber.txt


In [6]:
assert (clauses_df.isnull().sum().all() == 0)

In [7]:
clauses_df.document.unique()

array(['Viber.txt', 'Nintendo.txt', 'Tinder.txt', 'Dropbox.txt',
       'Microsoft.txt', 'Betterpoints_UK.txt', 'Airbnb.txt',
       'musically.txt', 'Crowdtangle.txt', 'TripAdvisor.txt',
       'Deliveroo.txt', 'Moves-app.txt', 'Spotify.txt', 'Supercell.txt',
       '9gag.txt', 'Booking.txt', 'Headspace.txt', 'Fitbit.txt',
       'Syncme.txt', 'Vimeo.txt', 'Oculus.txt', 'Endomondo.txt',
       'Instagram.txt', 'LindenLab.txt', 'WorldOfWarcraft.txt',
       'YouTube.txt', 'Academia.txt', 'Yahoo.txt', 'WhatsApp.txt',
       'Google.txt', 'Zynga.txt', 'Facebook.txt', 'Amazon.txt',
       'Vivino.txt', 'Netflix.txt', 'PokemonGo.txt', 'Skype.txt',
       'Snap.txt', 'eBay.txt', 'Masquerade.txt', 'Twitter.txt',
       'LinkedIn.txt', 'Skyscanner.txt', 'Duolingo.txt', 'TrueCaller.txt',
       'Uber.txt', 'Rovio.txt', 'Atlas.txt', 'Evernote.txt', 'Onavo.txt'],
      dtype=object)

In [8]:
clauses_df[['label', 'label_A', 'label_CH', 'label_CR', 'label_J', 'label_LAW', 'label_LTD', 'label_TER', 'label_USE']]

Unnamed: 0,label,label_A,label_CH,label_CR,label_J,label_LAW,label_LTD,label_TER,label_USE
0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
142,0,0,0,0,0,0,0,0,0
143,0,0,0,0,0,0,0,0,0
144,0,0,0,0,0,0,0,0,0
145,0,0,0,0,0,0,0,0,0


In [9]:
logo = LeaveOneGroupOut()
X = clauses_df['sentences']
y = clauses_df[['label', 'label_A', 'label_CH', 'label_CR', 'label_J', 'label_LAW', 'label_LTD', 'label_TER', 'label_USE']]
group = clauses_df['document']
logo.get_n_splits(X, y, group)

50

In [10]:
train_val_test = []
for train_val_index, test_index in logo.split(X, y, group):
    train_val, test = clauses_df.iloc[train_val_index], clauses_df.iloc[test_index]
    train_val_test.append((train_val, test))

### Train on Labels_A.txt

In [11]:
# TF - IDF extraction
scores = []
y_test_pred_A = []
for batch in train_val_test:
    X_train = batch[0]["sentences"]
    y_train = batch[0]["label_A"]
    train_groups = batch[0]["document"]
    X_test = batch[1]["sentences"]
    y_test = batch[1]["label_A"]
    test_document = batch[1].document.unique()[0]
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')
    
    svm = LinearSVC(random_state=0, max_iter = 5000)
    Cs = [0.001, 0.01, 0.1, 1, 10]
    clf = GridSearchCV(estimator=svm, param_grid=dict(C=Cs),n_jobs=-1, scoring = 'f1', refit = True)
    clf_fit = clf.fit(X_train, y_train, groups = train_groups)
    clf_best = clf_fit.best_estimator_
    y_test_pred_A.extend(clf_best.predict(X_test))
    score = clf.score(X_test, y_test)
    scores.append(score)

In [12]:
print(sum(scores)/len(scores))

0.29999999999999993


### Train on Labels_CH.txt

In [13]:
# TF - IDF extraction
scores = []
y_test_pred_CH = []
for batch in train_val_test:
    X_train = batch[0]["sentences"]
    y_train = batch[0]["label_CH"]
    train_groups = batch[0]["document"]
    X_test = batch[1]["sentences"]
    y_test = batch[1]["label_CH"]
    
    test_document = batch[1].document.unique()[0]
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')
    
    svm = LinearSVC(random_state=0, max_iter = 5000)
    Cs = [0.001, 0.01, 0.1, 1, 10]
    clf = GridSearchCV(estimator=svm, param_grid=dict(C=Cs),n_jobs=-1, scoring = 'f1', refit = True)
    clf_fit = clf.fit(X_train, y_train, groups = train_groups)
    clf_best = clf_fit.best_estimator_
    y_test_pred_CH.extend(clf_best.predict(X_test))
    score = clf.score(X_test, y_test)
    scores.append(score)

In [14]:
print(sum(scores)/len(scores))

0.6321932129173511


### Train on Labels_CR.txt

In [15]:
# TF - IDF extraction
scores = []
y_test_pred_CR = []
for batch in train_val_test:
    X_train = batch[0]["sentences"]
    y_train = batch[0]["label_CR"]
    train_groups = batch[0]["document"]
    X_test = batch[1]["sentences"]
    y_test = batch[1]["label_CR"]
    
    test_document = batch[1].document.unique()[0]
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')
    
    svm = LinearSVC(random_state=0, max_iter = 5000)
    Cs = [0.001, 0.01, 0.1, 1, 10]
    clf = GridSearchCV(estimator=svm, param_grid=dict(C=Cs),n_jobs=-1, scoring = 'f1', refit = True)
    clf_fit = clf.fit(X_train, y_train, groups = train_groups)
    clf_best = clf_fit.best_estimator_
    y_test_pred_CR.extend(clf_best.predict(X_test))
    score = clf.score(X_test, y_test)
    scores.append(score)

In [16]:
print(sum(scores)/len(scores))

0.5299740259740261


### Train on Labels_J.txt

In [17]:
# TF - IDF extraction
scores = []
y_test_pred_J = []
for batch in train_val_test:
    X_train = batch[0]["sentences"]
    y_train = batch[0]["label_J"]
    train_groups = batch[0]["document"]
    X_test = batch[1]["sentences"]
    y_test = batch[1]["label_J"]
    
    test_document = batch[1].document.unique()[0]
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')
    
    svm = LinearSVC(random_state=0, max_iter = 5000)
    Cs = [0.001, 0.01, 0.1, 1, 10]
    clf = GridSearchCV(estimator=svm, param_grid=dict(C=Cs),n_jobs=-1, scoring = 'f1', refit = True)
    clf_fit = clf.fit(X_train, y_train, groups = train_groups)
    clf_best = clf_fit.best_estimator_
    y_test_pred_J.extend(clf_best.predict(X_test))
    score = clf.score(X_test, y_test)
    scores.append(score)

In [18]:
print(sum(scores)/len(scores))

0.6178484848484849


### Train on Labels_LAW.txt

In [19]:
# TF - IDF extraction
scores = []
y_test_pred_LAW = []
for batch in train_val_test:
    X_train = batch[0]["sentences"]
    y_train = batch[0]["label_LAW"]
    train_groups = batch[0]["document"]
    X_test = batch[1]["sentences"]
    y_test = batch[1]["label_LAW"]
    
    test_document = batch[1].document.unique()[0]
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')
    
    svm = LinearSVC(random_state=0, max_iter = 5000)
    Cs = [0.001, 0.01, 0.1, 1, 10]
    clf = GridSearchCV(estimator=svm, param_grid=dict(C=Cs),n_jobs=-1, scoring = 'f1', refit = True)
    clf_fit = clf.fit(X_train, y_train, groups = train_groups)
    clf_best = clf_fit.best_estimator_
    y_test_pred_LAW.extend(clf_best.predict(X_test))
    score = clf.score(X_test, y_test)
    scores.append(score)

In [20]:
print(sum(scores)/len(scores))

0.7746666666666667


### Train on Labels_LTD.txt

In [21]:
# TF - IDF extraction
scores = []
y_test_pred_LTD = []

for batch in train_val_test:
    X_train = batch[0]["sentences"]
    y_train = batch[0]["label_LTD"]
    train_groups = batch[0]["document"]
    X_test = batch[1]["sentences"]
    y_test = batch[1]["label_LTD"]
    
    test_document = batch[1].document.unique()[0]
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')
    
    svm = LinearSVC(random_state=0, max_iter = 5000)
    Cs = [0.001, 0.01, 0.1, 1, 10]
    clf = GridSearchCV(estimator=svm, param_grid=dict(C=Cs),n_jobs=-1, scoring = 'f1', refit = True)
    clf_fit = clf.fit(X_train, y_train, groups = train_groups)
    clf_best = clf_fit.best_estimator_
    y_test_pred_LTD.extend(clf_best.predict(X_test))
    score = clf.score(X_test, y_test)
    scores.append(score)

In [22]:
print(sum(scores)/len(scores))

0.66821146139722


### Train on Labels_TER.txt

In [23]:
# TF - IDF extraction
scores = []
y_test_pred_TER = []
for batch in train_val_test:
    X_train = batch[0]["sentences"]
    y_train = batch[0]["label_TER"]
    train_groups = batch[0]["document"]
    X_test = batch[1]["sentences"]
    y_test = batch[1]["label_TER"]
    
    test_document = batch[1].document.unique()[0]
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')
    
    svm = LinearSVC(random_state=0, max_iter = 5000)
    Cs = [0.001, 0.01, 0.1, 1, 10]
    clf = GridSearchCV(estimator=svm, param_grid=dict(C=Cs),n_jobs=-1, scoring = 'f1', refit = True)
    clf_fit = clf.fit(X_train, y_train, groups = train_groups)
    clf_best = clf_fit.best_estimator_
    y_test_pred_TER.extend(clf_best.predict(X_test))
    score = clf.score(X_test, y_test)
    scores.append(score)

In [24]:
print(sum(scores)/len(scores))

0.6574006810183282


### Train on Labels_USE.txt

In [25]:
# TF - IDF extraction
scores = []
y_test_pred_USE = []
y_test_all = []
for batch in train_val_test:
    X_train = batch[0]["sentences"]
    y_train = batch[0]["label_USE"]
    train_groups = batch[0]["document"]
    X_test = batch[1]["sentences"]
    y_test = batch[1]["label_USE"]
    test_document = batch[1].document.unique()[0]
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')
    y_test_all.extend(batch[1]["label"])
    svm = LinearSVC(random_state=0, max_iter = 5000)
    Cs = [0.001, 0.01, 0.1, 1, 10]
    clf = GridSearchCV(estimator=svm, param_grid=dict(C=Cs),n_jobs=-1, scoring = 'f1', refit = True)
    clf_fit = clf.fit(X_train, y_train, groups = train_groups)
    clf_best = clf_fit.best_estimator_
    y_test_pred_USE.extend(clf_best.predict(X_test))
    score = clf.score(X_test, y_test)
    scores.append(score)

In [26]:
print(sum(scores)/len(scores))

0.7015714285714288


### Combine predictions

In [27]:
y_pred = []
for idx in range(len(y_test_all)):
    if y_test_pred_A[idx] == 1 or y_test_pred_CH[idx] == 1 or y_test_pred_CR[idx] == 1 or y_test_pred_J[idx] == 1 or y_test_pred_LTD[idx] == 1 or y_test_pred_TER[idx] == 1 or y_test_pred_USE[idx] == 1:
        y_pred.append(1)
    else:
        y_pred.append(0)

In [28]:
report = classification_report(y_test_all, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.98      0.97      8382
           1       0.83      0.61      0.70      1032

    accuracy                           0.94      9414
   macro avg       0.89      0.80      0.83      9414
weighted avg       0.94      0.94      0.94      9414

