# Classification of Privacy and Opt-Out documents

- This notebook trains several classification models and compares their accuracy, precision, recall, and F1 scores to determine the best way to classify privacy and opt-out documents.

In [41]:
# Import dependencies
import warnings
import nltk
import pandas as pd
import numpy as np
import scipy as sp
from time import time, sleep
import re
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Import package to make cm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold,cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from pickle import load,dump

In [5]:
# Function to compute model accuracy, recall, precision, f1 scores from confusion matrices
def compute_accuracy(cm):
    """Return the overall accuracy encoded in a confusion matrix.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Square confusion matrix in the layout produced by
        ``sklearn.metrics.confusion_matrix``: entry ``[i, j]`` counts the
        samples whose true class is ``i`` and whose predicted class is ``j``.
        Any number of classes is accepted, not only the binary 2x2 case.

    Returns
    -------
    float
        Fraction of samples on the main diagonal (correct predictions).
        ``nan`` when the matrix contains no samples at all.
    """
    cm = np.asarray(cm)
    total = cm.sum()
    if total == 0:
        # Accuracy is undefined without samples; avoid a 0/0 warning.
        return float('nan')
    # Correct predictions sit on the diagonal regardless of class count.
    return float(np.trace(cm) / total)

def compute_precision(cm):
    """Return the precision of the positive class (label 1).

    Parameters
    ----------
    cm : ndarray of shape (2, 2)
        Binary confusion matrix in the layout produced by
        ``sklearn.metrics.confusion_matrix``: rows are true labels, columns
        are predicted labels, so ``cm[1, 1]`` holds the true positives and
        ``cm[0, 1]`` the false positives.

    Returns
    -------
    float
        ``TP / (TP + FP)``; ``0.0`` when nothing was predicted positive.
    """
    TP = cm[1, 1]
    FP = cm[0, 1]
    predicted_positive = TP + FP
    if predicted_positive == 0:
        # No positive predictions: precision is undefined, report 0.0.
        return 0.0
    return float(TP / predicted_positive)

def compute_recall(cm):
    """Return the recall (sensitivity) of the positive class (label 1).

    Parameters
    ----------
    cm : ndarray of shape (2, 2)
        Binary confusion matrix in the layout produced by
        ``sklearn.metrics.confusion_matrix``: rows are true labels, columns
        are predicted labels, so ``cm[1, 1]`` holds the true positives and
        ``cm[1, 0]`` the false negatives.

    Returns
    -------
    float
        ``TP / (TP + FN)``; ``0.0`` when there are no positive samples.
    """
    TP = cm[1, 1]
    FN = cm[1, 0]
    actual_positive = TP + FN
    if actual_positive == 0:
        # No positive samples: recall is undefined, report 0.0.
        return 0.0
    return float(TP / actual_positive)

def compute_f1_score(cm):
    """Return the F1 score of the positive class (label 1).

    The score is computed straight from the confusion-matrix counts as
    ``2*TP / (2*TP + FP + FN)``, which equals the harmonic mean of precision
    and recall whenever both are defined and stays well-defined when either
    of them is zero.

    Parameters
    ----------
    cm : ndarray of shape (2, 2)
        Binary confusion matrix in the layout produced by
        ``sklearn.metrics.confusion_matrix``: rows are true labels, columns
        are predicted labels (``cm[1, 1]`` = TP, ``cm[0, 1]`` = FP,
        ``cm[1, 0]`` = FN).

    Returns
    -------
    float
        F1 score in ``[0, 1]``; ``0.0`` when TP, FP and FN are all zero.
    """
    TP = cm[1, 1]
    FP = cm[0, 1]
    FN = cm[1, 0]
    denominator = 2 * TP + FP + FN
    if denominator == 0:
        # Neither positive samples nor positive predictions: report 0.0.
        return 0.0
    return float(2 * TP / denominator)

In [9]:
# Load data
# The data directory is named once so it can be repointed in a single place.
DATA_DIR = "/Users/dbm/Documents/Insight S19/data"

# NOTE(security): unpickling can execute arbitrary code; only load pickle
# files that come from a trusted source.
# Context managers make sure both file handles are closed after loading.
with open(f"{DATA_DIR}/privacy_optout_policy.pkl", "rb") as df_file:
    df = load(df_file)
with open(f"{DATA_DIR}/privacy_optout_policy_cleaned.pkl", "rb") as words_file:
    data_words = load(words_file)
df.head(1)

Unnamed: 0,doc_type,file,text,year
0,pp,/Users/dbm/Downloads/OPP-115/sanitized_policie...,Privacy Policy Last Modified March 25 2013 Thi...,2013


In [10]:
# Encode the document type as a binary label: 1 for 'pp' rows, 0 otherwise
df['doc_type_bool'] = df['doc_type'].eq('pp').astype('int64')

In [11]:
# Bag-of-words design matrix.
# fit_transform yields a (documents x features) matrix in which each cell
# holds how many times that feature (word) occurs in that document.
# Token case is preserved (lowercase=False).
vectorizer = CountVectorizer(lowercase=False, analyzer='word')
x_counts = vectorizer.fit_transform(data_words)
x_counts_arr = x_counts.toarray()
x_counts_arr.shape

(229, 7963)

In [12]:
# TF-IDF weighting: learn the idf weights from the counts, then re-weight them
transformer = TfidfTransformer(smooth_idf=False).fit(x_counts)
x_tfidf = transformer.transform(x_counts)
x_tfidf_arr = x_tfidf.toarray()

In [13]:
# L1-normalize each row of the TF-IDF matrix (norm='l1', axis=1)
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)
# Reuse the normalized matrix instead of running normalize() a second time
xtfidf_norm_arr = xtfidf_norm.toarray()

In [14]:
# Features are the raw term counts; labels are the binary document type
X, y = x_counts_arr, df['doc_type_bool']

In [15]:
# Split the dataset into training and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.20)

## Logistic regression classifier

In [17]:
# Baseline: logistic regression with all-default hyperparameters
logr = LogisticRegression()
logr.fit(X=X_train, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
# Class predictions for the held-out test documents
logr_y_pred = logr.predict(X=X_test)

In [22]:
# Build the confusion matrix, then report accuracy and per-class metrics
logr_cm = confusion_matrix(y_true=y_test, y_pred=logr_y_pred)
print(f'Logistic regression accuracy: {compute_accuracy(cm=logr_cm)}')

print(classification_report(y_true=y_test, y_pred=logr_y_pred))

Logistic regression claaccuracy: 0.5
              precision    recall  f1-score   support

           0       0.57      0.46      0.51        26
           1       0.44      0.55      0.49        20

   micro avg       0.50      0.50      0.50        46
   macro avg       0.51      0.51      0.50        46
weighted avg       0.51      0.50      0.50        46



## Naive Bayes classifier

In [24]:
# Train a Gaussian Naive Bayes classifier on the training split
nb = GaussianNB()
nb.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [25]:
# Class predictions for the held-out test documents
nb_y_pred = nb.predict(X=X_test)

In [26]:
# Build the confusion matrix, then report accuracy and per-class metrics
nb_cm = confusion_matrix(y_true=y_test, y_pred=nb_y_pred)
print(f'Naive Bayes regression accuracy: {compute_accuracy(cm=nb_cm)}')

print(classification_report(y_true=y_test, y_pred=nb_y_pred))

Naive Bayes regression accuracy: 0.2391304347826087
              precision    recall  f1-score   support

           0       0.32      0.31      0.31        26
           1       0.14      0.15      0.15        20

   micro avg       0.24      0.24      0.24        46
   macro avg       0.23      0.23      0.23        46
weighted avg       0.24      0.24      0.24        46



## Random forest

In [42]:
# Train a random forest classifier (seeded so the fit is repeatable)
rfr = RandomForestClassifier(random_state=0)
rfr.fit(X=X_train, y=y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [44]:
# Class predictions for the held-out test documents
rfr_y_pred = rfr.predict(X=X_test)

In [46]:
# Build the confusion matrix, then report accuracy and per-class metrics
rfr_cm = confusion_matrix(y_true=y_test, y_pred=rfr_y_pred)
print(f'Random Forest regression accuracy: {compute_accuracy(cm=rfr_cm)}')

print(classification_report(y_true=y_test, y_pred=rfr_y_pred))

Naive Bayes regression accuracy: 0.32608695652173914
              precision    recall  f1-score   support

           0       0.37      0.27      0.31        26
           1       0.30      0.40      0.34        20

   micro avg       0.33      0.33      0.33        46
   macro avg       0.33      0.33      0.33        46
weighted avg       0.34      0.33      0.32        46

