# PDF

In [1]:
import re
import psutil
import zipfile
import chardet
import numpy as np
import subprocess
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score

## 0 Current RAM usage monitoring

In [2]:
def get_ram_usage():
    memory_info = psutil.Process().memory_info()
    return memory_info.rss / (1024 * 1024 * 1024)
print(f"Current RAM usage: {get_ram_usage():.2f} / 24 GB ({get_ram_usage()/24*100:.1f} %)")

Current RAM usage: 0.13 / 24 GB (0.6 %)


## 1 Process data

In [3]:
##### 1.1 Get files and extract features #####
    
# PDFID
def read_pdf_file_pdfid(file_path):
    output = subprocess.check_output(['python3', 'pdfid/pdfid.py', file_path])
    return output.decode('utf-8')

def extract_stats_pdfid(pdf):
    """
    Creates a dictionary with different possible elements for a pdf and their count in our pdf
    """
    pdf1 = pdf[pdf.find('\n')+1:]
    pdf2 = pdf1[pdf1.find('\n')+1:]
    values = list(map(int, re.findall(r'\d+', pdf2)[:-10] + re.findall(r'\d+', pdf2)[-9:-3] + re.findall(r'\d+', pdf2)[-1:]))
    keys = ['obj', 'endobj', 'stream', 'endstream', 'xref', 'trailer', 'startxref', 'Page', 'Encrypt', 'ObjStm', 'JS', 'JavaScript', 
            'AA', 'OpenAction', 'AcroForm', 'JBIG2Decode', 'RichMedia', 'Launch', 'EmbeddedFile', 'XFA', 'URI', 'Colors']
    return dict(zip(keys, values))

# PDF-PARSER
def read_pdf_file_pdfparser(file_path):
    output = subprocess.check_output(['python3', 'pdf-parser.py', file_path, '--stats'])
    return output.decode('utf-8')

def extract_stats_pdfparser_header(pdf):
    pdf_header = pdf[:pdf.find('/')].split('\n')[2:]
    l = []
    for line in pdf_header[:-1]:
        number = list(map(int, re.findall(r'\d+', line)))
        l += [number]
    if len(l) < 7:
        l.append([])
    l[-2] = len(l[-2])
    try:
        l[-1] = l[-1][0] if l[-1] != [] else 0
    except IndexError:
        print('list l:', l)
    stats_header = list(map(lambda x: x[0] if type(x) == list else x, l))
    return dict(zip(['Comment', 'XREF', 'Trailer', 'StartXref', 'Indirect object', 'Indirect objects with a stream', 'number:'], stats_header))

def extract_one_feature_pdfparser(pdf, feature):
    index = pdf.find(feature)
    found_digit = re.search(r'\d+', pdf[index:])
    if found_digit:
        return found_digit.group()
    else:
        return None

def extract_stats_pdfparser_features(pdf):
    interesting_features = [ '/JS ', '/JavaScript ', '/AA ', '/OpenAction ', '/URI ', '/Launch ', '/GoTo ', '/GoToR ', '/GoToE ', 
                            '/SubmitForm ', '/ImportData ', '/Sound ', '/Movie ', '/Rendition ', '/3D ', '/RichMedia ', '/EmbeddedFile ', 
                            '/FileAttachment ', '/Metadata ', '/XObject ', '/Page ', '/Pages ']
    values = []
    for feature in interesting_features:
        value = extract_one_feature_pdfparser(pdf, feature)
        values += [value] if value else [0]
    return dict(zip(interesting_features, values))

def extract_stats_pdfparser(pdf):
    return extract_stats_pdfparser_header(pdf) | extract_stats_pdfparser_features(pdf)    

def read_pdf_file(file_path):
    output = subprocess.check_output(['python3', 'pdf-parser.py', file_path])
    return output.decode('utf-8')

# def read_pdf_file(file_path):
#     with open(file_path, 'rb') as file:
#         pdf_content = file.read()
#     # Decode the file
#     encoding = chardet.detect(pdf_content)["encoding"]
#     pdf_content = pdf_content.decode(encoding if encoding != None else 'ascii', errors='replace')
#     return pdf_content

# Get features with PDFID, PDFPARSER (and BoW?)
def get_pdf_generator(pdf_filenames_train, labels_train, decoded_indices_train, train=True):
    errors_train = 0
    for i, pdfname in enumerate(pdf_filenames_train):
        try:
            labels_train += [int(pdfname[-1])] if train else []   
            if i % 100 == 0:
                print(f'get_rtf_generator() iter {i} done')
            # # PDFID
            # pdf_stats_pdfid = read_pdf_file_pdfid(pdfname)
            # pdf_stats_pdfid = extract_stats_pdfid(pdf_stats_pdfid)
            # stats_pdfid_train[i] = np.array(list(pdf_stats_pdfid.values())) 
            # # PDF-PARSER
            # pdf_stats_pdfparser = read_pdf_file_pdfparser(pdfname)
            # pdf_stats_pdfparser = extract_stats_pdfparser(pdf_stats_pdfparser)
            # try:
            #     stats_pdfparser_train[i] = np.array(list(pdf_stats_pdfparser.values())) 
            # except ValueError:
            #     stats_pdfparser_train[i] = np.zeros(29) 
            yield read_pdf_file(pdfname)
        except subprocess.CalledProcessError:
            errors_train += 1
            decoded_indices_train[i] = 0
            print(f'CalledProcessError {pdfname[-18:]}')

### VARIABLES ###
pdf_filenames_train = zipfile.ZipFile('pdf-train.zip').namelist()[:1000]
# pdf_filenames_train = zipfile.ZipFile('pdf-train.zip').namelist()[:-1]
print(f'Number of PDFs: {len(pdf_filenames_train)}')
decode = False
n = 3 # Value for n-grams

labels_train, decoded_indices_train = [], np.ones(len(pdf_filenames_train))
generator_pdf_train = get_pdf_generator(pdf_filenames_train, labels_train, decoded_indices_train, train=True)

vectorizer = CountVectorizer(strip_accents='unicode', ngram_range=(n,n), analyzer='word', min_df=5) #(100, 22673) RF 0.914 | 0.04 ### (100, 7111) RF 0.902, 0.0679
# vectorizer = CountVectorizer(strip_accents='unicode', ngram_range=(n,n), analyzer='word', min_df=15) #(100, 22673) RF 0.914 | 0.04 ### (100, 7111) RF 0.902, 0.0679
# vectorizer = CountVectorizer(strip_accents='unicode', ngram_range=(n,n), analyzer='char', min_df=5) #(100, 98652) RF 0.874 | 0.050 ### (100, 3723) RF 0.872, 0.0933
features_train = vectorizer.fit_transform(generator_pdf_train).toarray()
labels_train = np.array(labels_train)[decoded_indices_train.astype(bool)]

Number of PDFs: 1000
get_rtf_generator() iter 0 done
get_rtf_generator() iter 100 done
get_rtf_generator() iter 200 done
get_rtf_generator() iter 300 done
get_rtf_generator() iter 400 done
get_rtf_generator() iter 500 done
get_rtf_generator() iter 600 done
get_rtf_generator() iter 700 done
get_rtf_generator() iter 800 done
get_rtf_generator() iter 900 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError hilqurvvrbpinjkf.1


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError npvjmiyevbdkzxex.1


In [41]:
##### 1.2 Save/load data #####

filename_features, filename_labels = 'features_pdfid_pdfparser_train.npy', 'labels_train.npy'
# np.save(filename_features, features_train)
# np.save(filename_labels, labels_train)
# features_train = np.load(filename_features)
# labels_train = np.load(filename_labels)

features_train.shape

(997, 18422)

## 2 Train model

### 2.0 K-fold cross-validation

In [4]:
def train_evaluate(model, features, labels, model_name='base-model', n_splits=5, fit_whole_dataset=True):
    # K-Fold CV
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    accuracy_list, var_list = [], []
    split = 0
    
    # Training + metrics
    for idx_train, idx_eval in kf.split(features):
        X_train, X_eval, y_train, y_eval = features[idx_train], features[idx_eval], labels[idx_train], labels[idx_eval]
        # Fit the model
        model.fit(X_train, y_train)
        # Predict
        y_pred = model.predict(X_eval)
        # Evaluate 
        acc = balanced_accuracy_score(y_eval, y_pred)
        accuracy_list += [acc*len(y_eval)]
        var_list += [acc]
        print(f'Split {split} done')
        split += 1
        
    # Compute CV_score
    cvscore = sum(accuracy_list)/len(labels)
    variance = np.std(var_list)
    print(f'{model_name}: CV-score = {cvscore:.3f}, Variance = {variance:.4f}\n')

    # Train the model on the whole Train dataset
    if fit_whole_dataset:
        model.fit(features, labels)
    
    return cvscore, variance

### 2.1 Train different models

In [5]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=0, warm_start=True, n_estimators=1000)
%time _ = train_evaluate(rf_model, features_train, labels_train, model_name="RFC()", n_splits=5, fit_whole_dataset=True)

Split 0 done
Split 1 done


  warn(
  warn(
  warn(


Split 2 done
Split 3 done
Split 4 done
RFC(): CV-score = 0.994, Variance = 0.0110

CPU times: user 7.21 s, sys: 143 ms, total: 7.35 s
Wall time: 7.35 s


  warn(
  warn(


In [None]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(warm_start=True)
%time _ = train_evaluate(gb_model, features_train, labels_train, model_name="GBC()", n_splits=5, fit_whole_dataset=False)

In [37]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
%time _ = train_evaluate(dt_model, features_train, labels_train, model_name="DTC()", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
DTC(): CV-score = 0.930, Variance = 0.0099

CPU times: user 2.51 s, sys: 129 ms, total: 2.64 s
Wall time: 2.63 s


In [38]:
# SVM

from sklearn.svm import SVC

svc_model = SVC()
%time _ = train_evaluate(svc_model, features_train, labels_train, model_name="SVC()", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
SVC(): CV-score = 0.727, Variance = 0.0657

CPU times: user 2min, sys: 3min 1s, total: 5min 1s
Wall time: 30.3 s


## 3 Predict on test dataset

In [7]:
### VARIABLES ###
# pdf_filenames_test = zipfile.ZipFile('pdf-test.zip').namelist()[:100]
pdf_filenames_test = zipfile.ZipFile('pdf-test.zip').namelist()
print(f'Number of PDFs: {len(pdf_filenames_test)}')
decode = False
n = 3 # Value for n-grams

labels_test, decoded_indices_test = [], np.ones(len(pdf_filenames_test))
generator_pdf_test = get_pdf_generator(pdf_filenames_test, labels_test, decoded_indices_test, train=False)

features_test = vectorizer.transform(generator_pdf_test).toarray()
labels_test = np.array(labels_test)[decoded_indices_test.astype(bool)]

##### 1.2 Save/load data #####

filename_features, filename_labels = 'features_pdfid_pdfparser_test_3.npy', 'labels_test_3.npy'
np.save(filename_features, features_test)
np.save(filename_labels, labels_test)
features_test.shape

Number of PDFs: 2895
get_rtf_generator() iter 0 done
get_rtf_generator() iter 100 done
get_rtf_generator() iter 200 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 385, in GetObject
    if self.token2[0] == CHAR_REGULAR:
       ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError chudaylojkynllpz.x
get_rtf_generator() iter 300 done
get_rtf_generator() iter 400 done
get_rtf_generator() iter 500 done
get_rtf_generator() iter 600 done
get_rtf_generator() iter 700 done
get_rtf_generator() iter 800 done
get_rtf_generator() iter 900 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError soncozobmukqkolb.x
get_rtf_generator() iter 1000 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError llpehvnmkaajawjw.x


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError eoecdodiwgupffus.x
get_rtf_generator() iter 1100 done
get_rtf_generator() iter 1200 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError dotpeuwoghyosgby.x
get_rtf_generator() iter 1300 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError qcizfoskpxmbjxbd.x
get_rtf_generator() iter 1400 done
get_rtf_generator() iter 1500 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError hrnmndpusbrvbhxo.x
get_rtf_generator() iter 1600 done
get_rtf_generator() iter 1700 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 437, in GetObject
    if self.token3[1] == 'obj':
       ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError gnzxkrctonffymgh.x
get_rtf_generator() iter 1800 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError uyuatzshsjpabtrw.x
get_rtf_generator() iter 1900 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError mgacvtehynhhxvbb.x
get_rtf_generator() iter 2000 done
get_rtf_generator() iter 2100 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError nckeiqfnjbbjnwwt.x
get_rtf_generator() iter 2200 done
get_rtf_generator() iter 2300 done
get_rtf_generator() iter 2400 done
get_rtf_generator() iter 2500 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError oyxqiqoatvpebmac.x
get_rtf_generator() iter 2600 done
get_rtf_generator() iter 2700 done


Traceback (most recent call last):
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1810, in <module>
    Main()
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 1588, in Main
    object = oPDFParser.GetObject()
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/stud09/env/python/unit2/pdf/pdf-parser.py", line 435, in GetObject
    if IsNumeric(self.token2[1]):
                 ~~~~~~~~~~~^^^
TypeError: 'NoneType' object is not subscriptable


CalledProcessError hkbefgisefkdawiz.x
get_rtf_generator() iter 2800 done


IndexError: boolean index did not match indexed array along dimension 0; dimension is 0 but corresponding boolean dimension is 2895

In [None]:
##### Predict of test dataset  #####

X_test = features_test
y_pred = rf_model.predict(X_test)


##### Save submission #####

submission = []
i_corr = 0
# Write the prediction as expected output
for i, filename in enumerate(pdf_filenames_test):
    if decoded_indices_test[i]:
        submission += [filename + ';' + y_pred[i-i_corr].astype(str)]
    else: # if email hasn't been decoded and thus predicted, we randomly choose its class / assign it to class 1
        # submission += [filename + ';' + str(np.random.randint(2))]
        submission += [filename + ';' + str(1)]
        i_corr += 1
print(f'Length of our submission: {len(submission)} | Length of zip file: {len(pdf_filenames_test)}')
# Save the output as a text file
np.savetxt('output_rf_n3_pdfparser_3.csv', np.array(submission), fmt='%s', delimiter=',')