# Title

In [412]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('words')
import os
import re
import sqlalchemy

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\brand\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## Dataset

In [413]:
CWD = os.getcwd()
dataset_engine = sqlalchemy.create_engine(f"sqlite:///{CWD}/modeling_dataset.db")
dataset_engine.connect()

<sqlalchemy.engine.base.Connection at 0x1bfa7bcd510>

In [414]:
tables = pd.read_sql("SELECT * FROM sqlite_master WHERE type = 'table'", dataset_engine)
tables

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,by,by,2,"CREATE TABLE ""by"" (\n\t""index"" BIGINT, \n\tlic..."
1,table,by-sa,by-sa,28,"CREATE TABLE ""by-sa"" (\n\t""index"" BIGINT, \n\t..."
2,table,by-nc,by-nc,410,"CREATE TABLE ""by-nc"" (\n\t""index"" BIGINT, \n\t..."
3,table,by-nc-sa,by-nc-sa,436,"CREATE TABLE ""by-nc-sa"" (\n\t""index"" BIGINT, \..."
4,table,by-nd,by-nd,1223,"CREATE TABLE ""by-nd"" (\n\t""index"" BIGINT, \n\t..."
5,table,by-nc-nd,by-nc-nd,1590,"CREATE TABLE ""by-nc-nd"" (\n\t""index"" BIGINT, \..."
6,table,publicdomain,publicdomain,803,"CREATE TABLE publicdomain (\n\t""index"" BIGINT,..."


In [415]:
webpages_dataset = pd.concat(
    [
        pd.read_sql(f"SELECT * FROM '{table_name}'", dataset_engine)
        for table_name in tables["name"]
    ]
)
webpages_dataset = webpages_dataset\
    .loc[webpages_dataset["contents"] != "", :]\
    .reset_index()\
    .drop(["index", "title", "level_0"], axis=1)
webpages_dataset.sample(5)

Unnamed: 0,license,url,contents
4835,licenses/by-nc-nd/2.5,https://pitt.libguides.com/copyright/licenses,"Creative Commons, Copyleft, and Other Licenses..."
5590,publicdomain/mark/1.0,https://rightsstatements.org/vocab/1.0/,Rights Statements Menu Statements About Docume...
4349,licenses/by-nd/2.5,https://podcasts.apple.com/us/podcast/entrepre...,‎Entrepreneurial Thought Leaders on Apple Podc...
5159,licenses/by-nc-nd/2.0,https://www.livescience.com/youngest-age-give-...,What's the youngest age that a person can get ...
4646,licenses/by-nc-nd/2.0,https://www.americanbar.org/groups/human_right...,Request unsuccessful. Incapsula incident ID: 2...


In [416]:
webpages_dataset_deduplicate = webpages_dataset.groupby("url").first()\
    .reset_index()
webpages_dataset_deduplicate.describe()

Unnamed: 0,url,license,contents
count,1844,1844,1844
unique,1844,39,1788
top,http://agroportal.lirmm.fr/ontologies/PCO,licenses/by/2.1,403 Forbidden 403 Forbidden nginx
freq,1,130,25


In [417]:
import dataset_sampling
license_map = dataset_sampling.get_license_map()
license_ser = pd.concat([v for v in license_map.values()])
license_ser_splits_df = license_ser.str.split("/", expand=True)
license_ser_splits_df = license_ser_splits_df.rename(
    columns = {
        0: "Tool Typing",
        1: "General Typing",
        2: "Version",
        3: "Jurisdiction"
    }
)
license_ser_splits_df["General Typing"] = license_ser_splits_df["General Typing"].str.replace("mark|zero", "publicdomain", regex=True)
license_ser_splits_df["General Typing"] = license_ser_splits_df["General Typing"].str.replace("by-nd-nc", "by-nc-nd", regex=True)
license_ser_splits_df["Version"] = license_ser_splits_df["Version"].astype(float)
license_one_hot_encoding = pd.DataFrame()
license_one_hot_encoding["by"] = license_ser_splits_df["General Typing"].str.contains("by")
license_one_hot_encoding["sa"] = license_ser_splits_df["General Typing"].str.contains("sa")
license_one_hot_encoding["nc"] = license_ser_splits_df["General Typing"].str.contains("nc")
license_one_hot_encoding["nd"] = license_ser_splits_df["General Typing"].str.contains("nd")
license_not_six_type = license_ser_splits_df["General Typing"].str.contains("by|sa|nc|nd")
license_one_hot_encoding["neither"] = ~(license_not_six_type.fillna(False))
license_df = pd.concat([license_ser, license_ser_splits_df, license_one_hot_encoding], axis = 1)\
    .rename(columns = {0: "license"})
license_df.head(6)

Unnamed: 0,license,Tool Typing,General Typing,Version,by,sa,nc,nd,neither
0,licenses/by/1.0,licenses,by,1.0,True,False,False,False,False
14,licenses/by/2.0,licenses,by,2.0,True,False,False,False,False
27,licenses/by/2.1,licenses,by,2.1,True,False,False,False,False
33,licenses/by/2.5,licenses,by,2.5,True,False,False,False,False
39,licenses/by/3.0,licenses,by,3.0,True,False,False,False,False
45,licenses/by/4.0,licenses,by,4.0,True,False,False,False,False


In [418]:
webpages_dataset_deduplicate = webpages_dataset_deduplicate.merge(license_df, on = "license")
webpages_dataset_deduplicate.sample(5)

Unnamed: 0,url,license,contents,Tool Typing,General Typing,Version,by,sa,nc,nd,neither
1220,https://www.archdaily.com/892597/ad-classics-f...,licenses/by/2.0,Gallery of AD Classics: French Communist Party...,licenses,by,2.0,True,False,False,False,False
278,https://journals.tulane.edu/SL/about/submissions,licenses/by/3.0,Submissions\n\t\t\t\t\t\t\t| Second Line - An ...,licenses,by,3.0,True,False,False,False,False
1425,https://www.google.com/help/legalnotices_maps/,licenses/by/1.0,Legal Notices for Google Maps/Google Earth and...,licenses,by,1.0,True,False,False,False,False
407,http://kolibri.teacherinabox.org.au/modules/en...,licenses/by-sa/2.1,Creative Commons — Attribution-ShareAlike 2.1 ...,licenses,by-sa,2.1,True,True,False,False,False
92,https://www.repository.cam.ac.uk/handle/1810/2...,publicdomain/zero/1.0,Reduced monocyte and macrophage TNFSF15/TL1A e...,publicdomain,publicdomain,1.0,False,False,False,False,True


In [419]:
webpages_dataset_deduplicate.groupby("General Typing").count()

Unnamed: 0_level_0,url,license,contents,Tool Typing,Version,by,sa,nc,nd,neither
General Typing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
by,720,720,720,720,720,720,720,720,720,720
by-nc,225,225,225,225,225,225,225,225,225,225
by-nc-nd,107,107,107,107,107,107,107,107,107,107
by-nc-sa,188,188,188,188,188,188,188,188,188,188
by-nd,117,117,117,117,117,117,117,117,117,117
by-sa,263,263,263,263,263,263,263,263,263,263
publicdomain,224,224,224,224,224,224,224,224,224,224


In [420]:
#webpages_dataset_deduplicate["contents"] = webpages_dataset_deduplicate["contents"].apply(lambda x: x[:4000])

In [421]:
import string
def remove_unicodes(ser):
    return ser.map(lambda x: " ".join([c for c in x if c in string.printable]))

def has_unicodes(s, tolerance = 25):
    return np.sum([c not in string.printable for c in s]) <= tolerance

def not_well_decrypted(s, tolerance = 25):
    words = re.split(r"\s+", s)
    return np.sum([len(c) == 1 for c in words]) <= tolerance

def remove_unicodes_aggressive(df, field_name = "contents"):
    df_remove_unicode = df.loc[df[field_name].apply(has_unicodes), :]
    return df_remove_unicode


In [422]:
# Less Aggressive pruning
#webpages_dataset_deduplicate["parsed_contents"] = remove_unicodes(webpages_dataset_deduplicate["contents"])
#webpages_dataset_deduplicate.loc[1149, ["contents", "parsed_contents"]]

# More Aggressive pruning
webpages_dataset_deduplicate = remove_unicodes_aggressive(webpages_dataset_deduplicate)

In [423]:
def remove_less_than_c_chars(s, tolerance = 2):
    words = re.split(r"\s+", s)
    return " ".join([c for c in words if len(c) > tolerance and "obj" not in c])

def remove_more_than_c_chars(s, tolerance = 15):
    words = re.split(r"\s+", s)
    return " ".join([c for c in words if len(c) <= tolerance])

def remove_non_english(s):
    words = set(nltk.corpus.words.words())
    return " ".join(w for w in re.split(r"\s+", s) if w in words)

def remove_web_urls(s):
    return re.sub(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", " ", s)

def clear_stopwords(s):
    return " ".join([word for word in re.split(r"\s+", s) if word.lower() not in nltk.corpus.stopwords.words('english')])

def clear_stopwords_series(ser):
    return ser.apply(clear_stopwords)

def overall_cleaning(ser):
    cleaned_ser = ser.str.lower()
    cleaned_ser = cleaned_ser.apply(remove_web_urls)
    cleaned_ser = cleaned_ser.str.replace(r'[^A-Za-z\s]', ' ', regex = True)
    cleaned_ser = cleaned_ser.str.replace(r"\s+", " ", regex = True)
    cleaned_ser = cleaned_ser.apply(clear_stopwords)
    cleaned_ser = cleaned_ser.apply(remove_less_than_c_chars)
    cleaned_ser = cleaned_ser.apply(remove_non_english)
    #cleaned_ser = cleaned_ser.apply(remove_more_than_c_chars)
    return cleaned_ser

webpages_dataset_deduplicate["cleaned_contents"] = overall_cleaning(webpages_dataset_deduplicate["contents"])
webpages_dataset_deduplicate = webpages_dataset_deduplicate.loc[
    webpages_dataset_deduplicate["cleaned_contents"].str.len() >= 500, :
]
#webpages_dataset_deduplicate["parsed_cleaned_contents"] = clear_stopwords_series(webpages_dataset_deduplicate["cleaned_contents"])

In [424]:
for row in webpages_dataset_deduplicate["cleaned_contents"].sample(5):
    print(
        f"Entry content:\n{row[:300]}\n"
    )

Entry content:
least open creative commons university skip main content like explorer older works best modern latest chrome safari edge continue browser may see unexpected creative commons least open search guide search creative commons least open guide designed walk different comes license work make educated choi

Entry content:
history state agricultural college history state agricultural college rex repository search rex collection rex digital faculty works student works submit rex home state digital morse department special state university history view item disabled browser site may work without history state agricultur

Entry content:
fine toggle navigation learn meet staff wisdom pocket video collection consulting mineral photography symposium shop browse search new vault thumbnail corner bases order mineral sale sale connect contact login advanced search advanced search login collection shop selection large historic collection 

Entry content:
roll omega regular language learni

## Preprocessing

In [425]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import top_k_accuracy_score
from sklearn.decomposition import PCA
import re

In [426]:
def tokenize_url(url):
    return " ".join(re.split(r"[_/\.-]", re.sub(r"\d", "", url)))
tokenize_url("creativecommons.org/licenses/by-sa/4.0")

'creativecommons org licenses by sa  '

In [427]:
dataset = webpages_dataset_deduplicate.copy()
dataset['token_url'] = dataset["url"].apply(tokenize_url)
dataset["train_text"] = dataset["token_url"] + " " + dataset["cleaned_contents"]

In [428]:
def extract_text_features_tfidf(train, test, text_field = "train_text"):
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=0.9, stop_words="english")
    tfidf_vectorizer.fit_transform(train[text_field].values)
    train_vectorized = tfidf_vectorizer.transform(train[text_field].values)
    test_vectorized = tfidf_vectorizer.transform(test[text_field].values)
    return train_vectorized, test_vectorized, tfidf_vectorizer

## Modeling

In [429]:
from imblearn.over_sampling import SMOTE

In [430]:
dataset.groupby("General Typing")[["url"]].count().reset_index()["url"]

0    523
1    163
2     72
3    134
4     73
5    168
6    167
Name: url, dtype: int64

In [438]:
model_dataset = pd.DataFrame()
dataset_counts = dataset.groupby("General Typing")[["url"]].count().reset_index()["url"]
license_dict = {
        "by": 0,
        "by-nc": 1,
        "by-nc-nd": 2,
        "by-nc-sa": 3,
        "by-nd": 4,
        "by-sa": 5,
        "publicdomain": 6
}

model_dataset["train_text"], model_dataset["General Typing"] = dataset["cleaned_contents"], dataset["General Typing"]
model_dataset["General Typing"].replace(
    license_dict,
    inplace = True
)

training_set, test_set = train_test_split(model_dataset, test_size = 0.2)
Y_train = training_set["General Typing"].values
Y_test = test_set["General Typing"].values
X_train, X_test, model_vecter = extract_text_features_tfidf(
    training_set, test_set
)

smote = SMOTE(
    sampling_strategy={
        k: int(0.85 * max(round(np.mean(dataset_counts.values)), dataset_counts.iloc[k]))
        for k in range(1, 7)} #
)
X_train, Y_train = smote.fit_resample(X_train, Y_train)


### log reg

In [445]:
model_logreg = LogisticRegression(
    verbose = 0,
    max_iter = 500,
    penalty = 'l2',
    solver = "liblinear",
    C = 0.5,
    class_weight = "balanced"
).fit(X_train, Y_train)

In [446]:
print(
    top_k_accuracy_score(Y_train, model_logreg.predict_proba(X_train), k = 1),
    "\n",
    top_k_accuracy_score(Y_train, model_logreg.predict_proba(X_train), k = 2),
    "\n",
    top_k_accuracy_score(Y_train, model_logreg.predict_proba(X_train), k = 3),
    "\n==============\n",
    top_k_accuracy_score(Y_test, model_logreg.predict_proba(X_test), k = 1),
    "\n",
    top_k_accuracy_score(Y_test, model_logreg.predict_proba(X_test), k = 2),
    "\n",
    top_k_accuracy_score(Y_test, model_logreg.predict_proba(X_test), k = 3)
)

0.8688644688644689 
 0.9567765567765568 
 0.9816849816849816 
 0.49230769230769234 
 0.6653846153846154 
 0.7884615384615384


### SVM

In [451]:
model_svc = SVC(
    C = 0.5,
    verbose = 1,
    probability = True,
    kernel = "poly",
    degree = 1
).fit(X_train, Y_train)

[LibSVM]

In [452]:
print(
    top_k_accuracy_score(Y_train, model_svc.predict_proba(X_train), k = 1),
    "\n",
    top_k_accuracy_score(Y_train, model_svc.predict_proba(X_train), k = 2),
    "\n",
    top_k_accuracy_score(Y_train, model_svc.predict_proba(X_train), k = 3),
    "\n==============\n",
    top_k_accuracy_score(Y_test, model_svc.predict_proba(X_test), k = 1),
    "\n",
    top_k_accuracy_score(Y_test, model_svc.predict_proba(X_test), k = 2),
    "\n",
    top_k_accuracy_score(Y_test, model_svc.predict_proba(X_test), k = 3)
)

0.9091575091575091 
 0.9772893772893773 
 0.9882783882783883 
 0.4653846153846154 
 0.6615384615384615 
 0.8038461538461539


### Tensorflow

In [453]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [454]:
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2'
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [455]:
AUTOTUNE = tf.data.AUTOTUNE
BATCH_SIZE = 32
TRAINING_RATIO = 0.7

model_dataset_copy = dataset.loc[:, ["cleaned_contents", "General Typing"]]
model_dataset_copy["General Typing"].replace(
    license_dict,
    inplace = True
)
target = model_dataset_copy.pop("General Typing")
# dataset_tensor = tf.convert_to_tensor(model_dataset, dtype=np.string_)
tf_dataset = (
    tf.data.Dataset
    .from_tensor_slices((model_dataset_copy, target))
    .shuffle(50)
)
train_dataset = tf_dataset.take(int(TRAINING_RATIO * len(tf_dataset)))
test_dataset = tf_dataset.skip(int(TRAINING_RATIO * len(tf_dataset)))
val_dataset = test_dataset.skip(
    int((1 - TRAINING_RATIO) * 0.5 * len(tf_dataset))
)
test_dataset = test_dataset.take(
    int((1 - TRAINING_RATIO) * 0.5 * len(tf_dataset))
)

train_dataset = train_dataset.batch(BATCH_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

In [456]:
def build_classifier_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='parsed_cleaned_contents')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.5)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [457]:
from tensorflow_addons.metrics import F1Score

In [458]:
epochs = 10
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 1e-7
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [459]:
classifier_model = build_classifier_model()
classifier_model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=["accuracy"]
)
history = classifier_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
 1/29 [>.............................] - ETA: 1:31 - loss: 7.1526e-07 - accuracy: 0.0000e+00

KeyboardInterrupt: 