# RTF

In [1]:
import psutil
import zipfile
import chardet
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
# from striprtf.striprtf import rtf_to_text

## 0 Current RAM usage monitoring

In [5]:
def get_ram_usage():
    memory_info = psutil.Process().memory_info()
    return memory_info.rss / (1024 * 1024 * 1024)  # Resident Set Size (RSS) in bytes

print(f"Current RAM usage: {get_ram_usage():.2f} / 24 GB ({get_ram_usage()/24*100:.1f} %)")

Current RAM usage: 1.61 / 24 GB (6.7 %)


## 1 Preprocess data

In [12]:
##### Get data -> Try decoding it with the second read_rtf_file commented function #####

# List of the .rtf filenames of our train or test dataset
rtf_filenames_train = zipfile.ZipFile('rtf-train.zip').namelist()[:-1]
decode = True
n = 2 # Value for n-grams

# Given a path, returns an .rtf as string
def read_rtf_file(file_path):
    # Read file in bianary
    with open(file_path, 'r') as file:
        rtf_content = file.read()
    # text = rtf_to_text(rtf_content)
    return rtf_content

# Given a path, returns an .rtf as string after having decoded it
def read_rtf_file_decoded(file_path):
    # Read file in bianary
    with open(file_path, 'rb') as file:
        rtf_content = file.read()
    # Decode the file
    encoding = chardet.detect(rtf_content)["encoding"]
    rtf_content = rtf_content.decode(encoding if encoding != None else 'utf-8')
    return rtf_content

def get_rtf_generator(rtf_filenames_train, labels_train, decoded_indices_train, decode, train=True):
    errors_train = 0
    for i, rtfname in enumerate(np.array(rtf_filenames_train)[decoded_indices_train.astype(bool)]): # Only the decoded filenames
        if not train and i in [147, 150]:
            print(i, rtfname, 'Refuse to decode: too long')
            errors_train += 1
            decoded_indices_train[i] = 0
            continue
        try:
            rtf = read_rtf_file_decoded(rtfname) if decode else read_rtf_file(rtfname)
            labels_train += [int(rtfname[-1])] if train else []   
            # if i % 100 == 0:
            #     print(f'get_rtf_generator() iter {i} done')
            yield rtf
        except UnicodeDecodeError:
            # Deal with decoding error
            print(i, rtfname, 'UnicodeDecodeError')
            errors_train += 1
            decoded_indices_train[i] = 0
            continue
        except TypeError:
            print(i, rtfname, 'TypeError')
            errors_train += 1
            decoded_indices_train[i] = 0
            continue
        except KeyError:
            print(i, rtfname, 'KeyError')
            errors_train += 1
            decoded_indices_train[i] = 0
            continue
    print('Number of non-decoded sequences:', errors_train)
    print('Percentage of non-decoded sequences:', round(errors_train/len(rtf_filenames_train)*100, 2), '%')


# # labels_train, decoded_indices_train = get_decoded_indices_labels(rtf_filenames_train, decode=decode)
# labels_train, decoded_indices_train = [], np.ones(len(rtf_filenames_train))
# generator_rtf_train = get_rtf_generator(rtf_filenames_train, labels_train, decoded_indices_train, decode=decode, train=True)


# ##### Extract features #####

# vectorizer = CountVectorizer(strip_accents='unicode', lowercase=True, ngram_range=(n,n), analyzer='char', min_df=1)
# features_train = vectorizer.fit_transform(generator_rtf_train)
# features_train = features_train.toarray()

In [4]:
##### Save/load data #####

# np.save('features_train_n2_df1.npy', features_train)
# np.save('labels_train.npy', labels_train)
features_train = np.load('features_train_n2_df1.npy')
labels_train = np.load('labels_train.npy')
print(f'{features_train.shape = }')
print(f'{len(labels_train) = }')

features_train.shape = (3809, 47298)
len(labels_train) = 3809


## 2 Train model

### 2.0 K-fold cross-validation

In [9]:
def train_evaluate(model, features, labels, model_name='base-model', n_splits=5, fit_whole_dataset=True):
    # K-Fold CV
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    accuracy_list, var_list = [], []
    split = 0
    
    # Training + metrics
    for idx_train, idx_eval in kf.split(features):
        X_train, X_eval, y_train, y_eval = features[idx_train], features[idx_eval], labels[idx_train], labels[idx_eval]
        # Fit the model
        model.fit(X_train, y_train)
        # Predict
        y_pred = model.predict(X_eval)
        # Evaluate 
        acc = balanced_accuracy_score(y_eval, y_pred)
        accuracy_list += [acc*len(y_eval)]
        var_list += [acc]
        print(f'Split {split} done')
        split += 1
        
    # Compute CV_score
    cvscore = sum(accuracy_list)/len(labels)
    variance = np.std(var_list)
    print(f'{model_name}: CV-score = {cvscore:.3f}, Variance = {variance:.4f}\n')

    # Train the model on the whole Train dataset
    if fit_whole_dataset:
        model.fit(features, labels)
    
    return cvscore, variance

### 2.1 `n-gram==1` (500 training ex)

In [30]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier()
%time _ = train_evaluate(gb_model, features_train, np.array(labels_train), model_name="GBC() n-gram=1", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
GBC() n-gram=1: CV-score = 0.951, Variance = 0.0402

CPU times: user 2.02 s, sys: 0 ns, total: 2.02 s
Wall time: 2.01 s


In [31]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
%time _ = train_evaluate(rf_model, features_train, np.array(labels_train), model_name="RFC() n-gram=1", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
RFC() n-gram=1: CV-score = 0.927, Variance = 0.0388

CPU times: user 720 ms, sys: 339 µs, total: 720 ms
Wall time: 717 ms


In [32]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
%time _ = train_evaluate(dt_model, features_train, np.array(labels_train), model_name="DTC() n-gram=1", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
DTC() n-gram=1: CV-score = 0.917, Variance = 0.0431

CPU times: user 45.3 ms, sys: 774 µs, total: 46 ms
Wall time: 44.5 ms


In [33]:
# SVM

from sklearn.svm import SVC

svc_model = SVC()
%time _ = train_evaluate(svc_model, features_train, np.array(labels_train), model_name="SVC() n-gram=1", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
SVC() n-gram=1: CV-score = 0.759, Variance = 0.0389

CPU times: user 22.8 ms, sys: 0 ns, total: 22.8 ms
Wall time: 22 ms


### 2.2 `n-gram==2` (500 training ex): BEST

In [67]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier()
%time _ = train_evaluate(gb_model, features_train, np.array(labels_train), model_name="GBC() n-gram=2", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
GBC() n-gram=2: CV-score = 0.958, Variance = 0.0292

CPU times: user 22 s, sys: 7.69 ms, total: 22 s
Wall time: 22 s


In [24]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
%time _ = train_evaluate(rf_model, features_train, np.array(labels_train), model_name="RFC() n-gram=2", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
RFC() n-gram=2: CV-score = 0.921, Variance = 0.0423

CPU times: user 1.05 s, sys: 3.18 ms, total: 1.05 s
Wall time: 1.05 s


In [25]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
%time _ = train_evaluate(dt_model, features_train, np.array(labels_train), model_name="DTC() n-gram=2", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
DTC() n-gram=2: CV-score = 0.938, Variance = 0.0416

CPU times: user 296 ms, sys: 438 µs, total: 297 ms
Wall time: 294 ms


In [26]:
# SVM

from sklearn.svm import SVC

svc_model = SVC()
%time _ = train_evaluate(svc_model, features_train, np.array(labels_train), model_name="SVC() n-gram=2", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
SVC() n-gram=2: CV-score = 0.777, Variance = 0.0486

CPU times: user 125 ms, sys: 59 µs, total: 125 ms
Wall time: 122 ms


### 2.3 `n-gram==3` (500 training ex)

In [14]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier()
%time _ = train_evaluate(gb_model, features_train, np.array(labels_train), model_name="GBC() n-gram=3", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
GBC() n-gram=3: CV-score = 0.952, Variance = 0.0374

CPU times: user 1min 27s, sys: 18.5 ms, total: 1min 27s
Wall time: 1min 27s


In [15]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
%time _ = train_evaluate(rf_model, features_train, np.array(labels_train), model_name="RFC() n-gram=3", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
RFC() n-gram=3: CV-score = 0.927, Variance = 0.0351

CPU times: user 1.55 s, sys: 15.8 ms, total: 1.57 s
Wall time: 1.56 s


In [16]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
%time _ = train_evaluate(dt_model, features_train, np.array(labels_train), model_name="DTC() n-gram=3", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
DTC() n-gram=3: CV-score = 0.937, Variance = 0.0524

CPU times: user 1.35 s, sys: 15.9 ms, total: 1.36 s
Wall time: 1.36 s


In [17]:
# SVM

from sklearn.svm import SVC

svc_model = SVC()
%time _ = train_evaluate(svc_model, features_train, np.array(labels_train), model_name="SVC() n-gram=3", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
SVC() n-gram=3: CV-score = 0.713, Variance = 0.0639

CPU times: user 9.56 s, sys: 16.9 s, total: 26.5 s
Wall time: 2.67 s


### 2.4 `n-gram==4` (500 training ex)

In [37]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier()
%time _ = train_evaluate(gb_model, features_train, np.array(labels_train), model_name="GBC() n-gram=4", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
GBC() n-gram=4: CV-score = 0.943, Variance = 0.0453

CPU times: user 9min 6s, sys: 221 ms, total: 9min 6s
Wall time: 9min 6s


In [38]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
%time _ = train_evaluate(rf_model, features_train, np.array(labels_train), model_name="RFC() n-gram=4", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
RFC() n-gram=4: CV-score = 0.934, Variance = 0.0287

CPU times: user 3.26 s, sys: 144 ms, total: 3.4 s
Wall time: 3.4 s


In [39]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
%time _ = train_evaluate(dt_model, features_train, np.array(labels_train), model_name="DTC() n-gram=4", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
DTC() n-gram=4: CV-score = 0.944, Variance = 0.0297

CPU times: user 6.73 s, sys: 160 ms, total: 6.89 s
Wall time: 6.89 s


In [40]:
# SVM

from sklearn.svm import SVC

svc_model = SVC()
%time _ = train_evaluate(svc_model, features_train, np.array(labels_train), model_name="SVC() n-gram=4", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
SVC() n-gram=4: CV-score = 0.691, Variance = 0.0509

CPU times: user 56.3 s, sys: 1min 16s, total: 2min 12s
Wall time: 13.6 s


### 2.5 `analyzer='word'` and `n-gram==1` (500 training ex)¶

In [49]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier()
%time _ = train_evaluate(gb_model, features_train, np.array(labels_train), model_name=f"GBC() n-gram={n}", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
GBC() n-gram=1: CV-score = 0.931, Variance = 0.0567

CPU times: user 7.33 s, sys: 108 µs, total: 7.33 s
Wall time: 7.32 s


In [50]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
%time _ = train_evaluate(rf_model, features_train, np.array(labels_train), model_name=f"RFC() n-gram={n}", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
RFC() n-gram=1: CV-score = 0.936, Variance = 0.0377

CPU times: user 571 ms, sys: 0 ns, total: 571 ms
Wall time: 566 ms


In [51]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
%time _ = train_evaluate(dt_model, features_train, np.array(labels_train), model_name=f"DTC() n-gram={n}", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
DTC() n-gram=1: CV-score = 0.935, Variance = 0.0551

CPU times: user 98.9 ms, sys: 0 ns, total: 98.9 ms
Wall time: 95.8 ms


In [52]:
# SVM

from sklearn.svm import SVC

svc_model = SVC()
%time _ = train_evaluate(svc_model, features_train, np.array(labels_train), model_name=f"SVC() n-gram={n}", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
SVC() n-gram=1: CV-score = 0.569, Variance = 0.0398

CPU times: user 132 ms, sys: 3.59 ms, total: 136 ms
Wall time: 133 ms


### 2.6 `analyzer='word'` and `n-gram==2` (500 training ex)¶

In [57]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier()
%time _ = train_evaluate(gb_model, features_train, np.array(labels_train), model_name=f"GBC() n-gram={n}", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
GBC() n-gram=2: CV-score = 0.934, Variance = 0.0401

CPU times: user 6.98 s, sys: 0 ns, total: 6.98 s
Wall time: 6.97 s


In [58]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
%time _ = train_evaluate(rf_model, features_train, np.array(labels_train), model_name=f"RFC() n-gram={n}", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
RFC() n-gram=2: CV-score = 0.930, Variance = 0.0416

CPU times: user 557 ms, sys: 294 µs, total: 557 ms
Wall time: 552 ms


In [59]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
%time _ = train_evaluate(dt_model, features_train, np.array(labels_train), model_name=f"DTC() n-gram={n}", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
DTC() n-gram=2: CV-score = 0.910, Variance = 0.0353

CPU times: user 174 ms, sys: 3.34 ms, total: 178 ms
Wall time: 175 ms


In [60]:
# SVM

from sklearn.svm import SVC

svc_model = SVC()
%time _ = train_evaluate(svc_model, features_train, np.array(labels_train), model_name=f"SVC() n-gram={n}", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
SVC() n-gram=2: CV-score = 0.500, Variance = 0.0000

CPU times: user 173 ms, sys: 2.77 ms, total: 176 ms
Wall time: 173 ms


### 2.7  `analyzer='char'` and `n-gram==2` and `min_leaf==2` (`loss`, `lr`, `n_est`, `crit`, `max_depth` don't improve perf) (500 training ex)

In [98]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(min_samples_leaf=2, random_state=0, warm_start=True)
%time _ = train_evaluate(gb_model, features_train, np.array(labels_train), model_name=f"GBC(min_samples_leaf=2, random_state=0, warm_start=True) n-gram={n}", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
GBC(min_samples_leaf=2, random_state=0, warm_start=True) n-gram=2: CV-score = 1.000, Variance = 0.0000

CPU times: user 4.26 s, sys: 190 µs, total: 4.26 s
Wall time: 4.26 s


### 2.8  `analyzer='char'` and `n-gram==2` and `min_leaf==2` (`loss`, `lr`, `n_est`, `crit`, `max_depth` don't improve perf)

In [10]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(min_samples_leaf=2, random_state=0, warm_start=True)
%time _ = train_evaluate(gb_model, features_train, np.array(labels_train), model_name=f"GBC(min_s_leaf=2, rs=0, warm_start=True) n-gram={n}", n_splits=5, fit_whole_dataset=False)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
GBC(min_s_leaf=2, rs=0, warm_start=True) n-gram=2: CV-score = 0.995, Variance = 0.0085

CPU times: user 5min 14s, sys: 1.18 s, total: 5min 15s
Wall time: 5min 15s


## 3 Prediction on test dataset

In [18]:
##### Get data -> Try decoding it with the second read_rtf_file commented function #####

# List of the .rtf filenames of our train or test dataset
rtf_filenames_test = zipfile.ZipFile('rtf-test.zip').namelist()
# rtf_filenames_test = rtf_filenames_test[:147] + rtf_filenames_test[148:150] + rtf_filenames_test[151:] # files 147 and 150 are pbtq
decode = True

# labels_test, decoded_indices_test = get_decoded_indices_labels(rtf_filenames_test, decode=decode, train=False)
labels_test, decoded_indices_test = [], np.ones(len(rtf_filenames_test))
generator_rtf_test = get_rtf_generator(rtf_filenames_test, labels_test, decoded_indices_test, decode=decode, train=False)


##### Extract features #####

features_test = vectorizer.transform(generator_rtf_test)
features_test = features_test.toarray()

48 data/rtf-2017-09/apzvmnmcobbgxqcj.x UnicodeDecodeError
74 data/rtf-2017-09/zltktsghvhtowhrs.x UnicodeDecodeError
78 data/rtf-2017-09/aagxsnlsqujyoonf.x UnicodeDecodeError
122 data/rtf-2017-09/duxajsiflswpxfmw.x UnicodeDecodeError
129 data/rtf-2017-09/gwhsyfgsjmhalbvr.x UnicodeDecodeError
147 data/rtf-2017-09/rgkcktzjeljqywek.x Refuse to decode: too long
150 data/rtf-2017-09/qaiufqizlqdyflor.x Refuse to decode: too long
171 data/rtf-2017-09/qqzxuqhsnrxgxqws.x UnicodeDecodeError
241 data/rtf-2017-09/agwzmflnvhofyjql.x UnicodeDecodeError
252 data/rtf-2017-09/fsptojetbksyxttn.x UnicodeDecodeError
286 data/rtf-2017-09/kyhtbqxglbjoogbw.x UnicodeDecodeError
319 data/rtf-2017-09/wouvgrffdzymgihn.x UnicodeDecodeError
328 data/rtf-2017-09/ywewuzlgvtwcyhag.x UnicodeDecodeError
337 data/rtf-2017-09/uvnalkzosteedygr.x UnicodeDecodeError
341 data/rtf-2017-09/gkxvvkydpepdfege.x UnicodeDecodeError
365 data/rtf-2017-09/ajqyvrgusehfnzjl.x UnicodeDecodeError
384 data/rtf-2017-09/ylrpshvilzximekz.x Uni

In [19]:
##### Save/load data #####

# np.save('features_test_n2_df1.npy', features_test)
features_test = np.load('features_test_n2_df1.npy')
print(f'{features_test.shape = }')

features_test.shape = (1767, 47298)


In [20]:
##### Predict of test dataset with SVC #####

X_test = features_test
y_pred = gb_model.predict(X_test)

In [21]:
submission = []
i_corr = 0
# Write the prediction as expected output
for i, filename in enumerate(rtf_filenames_test):
    if decoded_indices_test[i]:
        submission += [filename + ';' + y_pred[i-i_corr].astype(str)]
    else: # if email hasn't been decoded and thus predicted, we randomly choose its class / assign it to class 1
        # submission += [filename + ';' + str(np.random.randint(2))]
        submission += [filename + ';' + str(1)]
        i_corr += 1
print(f'Length of our submission: {len(submission)} | Length of zip file: {len(rtf_filenames_test)}')
# Save the output as a text file
np.savetxt('output_gb_n2_df1.csv', np.array(submission), fmt='%s', delimiter=',')

Length of our submission: 1847 | Length of zip file: 1847
