# Introdução

Este notebook contém o código para gerar o dataset e para treinar um classificador de detecção de seções.
Os dados foram extraídos da base [GROTOAP2](https://www.dlib.org/dlib/november14/tkaczyk/11tkaczyk.html).

# Setup

## Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# NOTE(review): later cells use relative './gdrive/...' paths, which presumably
# rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Imports

In [None]:
# Folder on the mounted Drive that holds the shard CSVs and receives every
# artifact written by this notebook (datasets, splits, scaler, model).
BASE_GROTOAP2_DIR = './gdrive/MyDrive/Colab Notebooks/doutorado/dataset/grotoa_section_detection/'

In [None]:
import pandas as pd
import numpy as np

# Concatenação dos dataframes

In [None]:
# Extract the shard CSVs into the current working directory, then move them
# into the dataset folder on Drive (same folder as the archive).
!unzip './gdrive/MyDrive/Colab Notebooks/doutorado/dataset/grotoa_section_detection/section_detection.zip'
!mv *.csv './gdrive/MyDrive/Colab Notebooks/doutorado/dataset/grotoa_section_detection'

Archive:  ./gdrive/MyDrive/Colab Notebooks/doutorado/dataset/grotoa_section_detection/section_detection.zip
  inflating: output0.csv             
  inflating: output1.csv             
  inflating: output10.csv            
  inflating: output11.csv            
  inflating: output12.csv            
  inflating: output2.csv             
  inflating: output3.csv             
  inflating: output4.csv             
  inflating: output5.csv             
  inflating: output6.csv             
  inflating: output7.csv             
  inflating: output8.csv             
  inflating: output9.csv             


In [None]:
# Load the first shard on its own; its inferred dtypes are reused below when
# reading the remaining shards.
first_shard_path = BASE_GROTOAP2_DIR + 'output0.csv'
df_output0 = pd.read_csv(first_shard_path, sep="\t", encoding='utf-8')

In [None]:
# Inspect the column names of the first shard.
df_output0.columns

Index(['has_most_frequent_font', 'font_magnitude', 'font_variety',
       'is_roman_number_prefixed', 'above_font_threshold', 'word_count',
       'char_count', 'first_capitalized', 'all_words_capitalized',
       'all_italic', 'all_bold', 'number_prefixed', 'number_count',
       'font_size', 'abs_left', 'has_punctuation', 'abs_top', 'text', 'pmc_id',
       'file_name', 'heading', 'line_idx', 'original_title'],
      dtype='object')

In [None]:
# Map column name -> dtype name (as a string), taken from the first shard, so
# every shard can be read with the same explicit dtypes.
columns_dtype = dict(zip(df_output0.dtypes.index, map(str, df_output0.dtypes)))

In [None]:
# File names of the 13 shards: output0.csv ... output12.csv.
list_files = [f'output{i}.csv' for i in range(13)]

In [None]:
# Read every shard with the dtypes inferred from the first one, so all frames
# share the same schema before concatenation.
frames = [
    pd.read_csv(BASE_GROTOAP2_DIR + shard_name, sep="\t", encoding='utf-8', dtype=columns_dtype)
    for shard_name in list_files
]

In [None]:
# Stack all shards row-wise into a single frame.
result = pd.concat(frames)

In [None]:
# Persist the concatenated data as a tab-separated file, without the index.
result.to_csv(BASE_GROTOAP2_DIR + 'dataset_original.tsv', encoding='utf-8', index=False, sep="\t")

In [None]:
# Check that the concatenated frame kept the expected columns.
result.columns

Index(['has_most_frequent_font', 'font_magnitude', 'font_variety',
       'is_roman_number_prefixed', 'above_font_threshold', 'word_count',
       'char_count', 'first_capitalized', 'all_words_capitalized',
       'all_italic', 'all_bold', 'number_prefixed', 'number_count',
       'font_size', 'abs_left', 'has_punctuation', 'abs_top', 'text', 'pmc_id',
       'file_name', 'heading', 'line_idx', 'original_title'],
      dtype='object')

In [None]:
# Drop the in-memory shards and the concatenated frame; the data is reloaded
# from dataset_original.tsv below.
del frames, result

# Verificação do Dataset

In [None]:
# Reload the concatenated dataset from disk, reusing the dtypes inferred from
# the first shard.
df_original = pd.read_csv(BASE_GROTOAP2_DIR + 'dataset_original.tsv', sep="\t", encoding='utf-8', dtype=columns_dtype)

In [None]:
# Inspect the columns of the reloaded dataset.
df_original.columns

Index(['has_most_frequent_font', 'font_magnitude', 'font_variety',
       'is_roman_number_prefixed', 'above_font_threshold', 'word_count',
       'char_count', 'first_capitalized', 'all_words_capitalized',
       'all_italic', 'all_bold', 'number_prefixed', 'number_count',
       'font_size', 'abs_left', 'has_punctuation', 'abs_top', 'text', 'pmc_id',
       'file_name', 'heading', 'line_idx', 'original_title'],
      dtype='object')

Total de artigos

In [None]:
# Number of distinct articles (pmc_id values) in the extracted dataset.
df_original['pmc_id'].nunique()

12934

In [None]:
# Load the table of section titles; the row count is reported below as the
# number of titles found in the nxml files.
df_grotoa = pd.read_csv(BASE_GROTOAP2_DIR + 'sections_grotoa_2021_06_16_16_56_18.tsv', sep="\t", encoding='utf-8')

In [None]:
# Number of distinct articles (pmc_id values) in the titles table.
df_grotoa['pmc_id'].nunique()

12934

In [None]:
# pmc_ids present in the titles table but absent from the extracted dataset;
# an empty set means every titled article is covered.
A = set(df_grotoa['pmc_id'].unique())
B = set(df_original['pmc_id'].unique())
A.difference(B)

set()

## Títulos nxml

In [None]:
# Report the number of rows in the titles table, with thousands separators.
print(f'Total de Títulos encontrados nos nxml: {len(df_grotoa.index):,.0f}')

Total de Títulos encontrados nos nxml: 176,077


## Dataset Original

In [None]:
# Report the number of rows in the extracted dataset, with thousands separators.
print(f'Total de Amostras nos dados originais: {len(df_original.index):,.0f}')

Total de Amostras nos dados originais: 11,955,640


# Tratamento dos dados

### Remoção de Duplicatas

In [None]:
# Remove fully duplicated rows (the first occurrence is kept, which is the
# pandas default) and report how many rows remain.
df_original = df_original.drop_duplicates()
print(f'{len(df_original.index):,.0f}')

11,749,690


### Manter apenas textos válidos

In [None]:
# Keep only rows whose 'text' is present, then report the remaining row count.
df_original = df_original[df_original['text'].notnull()]
print(f'{len(df_original.index):,.0f}')

11,596,996


In [None]:
# Drop rows whose 'text' is the empty string, then report the remaining row count.
df_original = df_original[df_original['text'] != '']
print(f'{len(df_original.index):,.0f}')

11,596,996


# Geração Dataset Final

In [None]:
# Number of title rows per article in the titles table (columns: pmc_id, counts).
df_grotoa_count_sections = df_grotoa.groupby(['pmc_id']).size().reset_index(name='counts')

In [None]:
# Rows flagged as heading with line_idx == 1.
# NOTE(review): presumably line_idx == 1 marks the first line of a heading, so
# each heading is counted once — confirm against the extraction code.
is_heading = df_original['heading'] == 1
is_first_line = df_original['line_idx'] == 1
df_original_is_section = df_original[is_heading & is_first_line]

In [None]:
# Number of detected section rows per article in the extracted dataset
# (columns: pmc_id, counts).
df_original_count_sections = df_original_is_section.groupby(['pmc_id']).size().reset_index(name='counts')

In [None]:
# Inner-join both per-article counts on pmc_id; the clashing 'counts' columns
# become counts_x (titles table) and counts_y (extracted dataset).
df_merge = df_grotoa_count_sections.merge(df_original_count_sections, on='pmc_id')

In [None]:
# Articles whose number of titles matches in both sources.
counts_match = df_merge['counts_x'] == df_merge['counts_y']
file_list = df_merge.loc[counts_match, 'pmc_id'].values

In [None]:
# Keep only the rows of articles whose title counts agree, then report the
# remaining row count.
in_matching_article = df_original['pmc_id'].isin(file_list)
df_original = df_original[in_matching_article]
print(f'{len(df_original.index):,.0f}')

5,284,792


In [None]:
# Release the intermediate frames used only to select the matching articles.
del df_grotoa_count_sections, df_original_is_section, df_original_count_sections, df_merge

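## SQLite
Remoção dos itens com conflitos: grupos de linhas com exatamente os mesmos atributos, mas com valores diferentes de `heading`. Nesses grupos, as linhas com `heading = 0` são removidas.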

In [None]:
# Column order matters here: rows are inserted positionally into SQLite below.
df_original.columns

Index(['has_most_frequent_font', 'font_magnitude', 'font_variety',
       'is_roman_number_prefixed', 'above_font_threshold', 'word_count',
       'char_count', 'first_capitalized', 'all_words_capitalized',
       'all_italic', 'all_bold', 'number_prefixed', 'number_count',
       'font_size', 'abs_left', 'has_punctuation', 'abs_top', 'text', 'pmc_id',
       'file_name', 'heading', 'line_idx', 'original_title'],
      dtype='object')

In [None]:
# Check the dtypes before mirroring them in the SQLite table definition.
df_original.dtypes

has_most_frequent_font        int64
font_magnitude                int64
font_variety                  int64
is_roman_number_prefixed      int64
above_font_threshold          int64
word_count                    int64
char_count                    int64
first_capitalized             int64
all_words_capitalized         int64
all_italic                    int64
all_bold                      int64
number_prefixed               int64
number_count                  int64
font_size                   float64
abs_left                    float64
has_punctuation               int64
abs_top                     float64
text                         object
pmc_id                       object
file_name                    object
heading                       int64
line_idx                      int64
original_title               object
dtype: object

In [None]:
import sqlite3

# Create the staging table used to find and remove conflicting rows.
# The columns follow the DataFrame column order (rows are inserted
# positionally) and the declared types mirror the DataFrame dtypes:
# int64 -> int, float64 -> real, object -> text. In particular
# has_most_frequent_font is an integer flag, so it is declared `int`
# to avoid it coming back from SQLite as a float column.
conn = sqlite3.connect('test.db')
try:
    conn.execute('''
CREATE TABLE IF NOT EXISTS dataset(
has_most_frequent_font        int,
font_magnitude                int,
font_variety                  int,
is_roman_number_prefixed      int,
above_font_threshold          int,
word_count                    int,
char_count                    int,
first_capitalized             int,
all_words_capitalized         int,
all_italic                    int,
all_bold                      int,
number_prefixed               int,
number_count                  int,
font_size                   real,
abs_left                    real,
has_punctuation               int,
abs_top                     real,
text                         text,
pmc_id                       text,
file_name                    text,
heading                       int,
line_idx                      int,
original_title               text);''')
    conn.commit()
finally:
    # Always release the database handle, even if the DDL fails.
    conn.close()

In [None]:
# Materialize every row as a tuple (index excluded) so it can be passed to
# executemany below.
# NOTE(review): this keeps a second full copy of the dataset in memory.
output = df_original.itertuples(index=False)
data = tuple(output)

In [None]:
# One '?' placeholder per column of the 23-column 'dataset' table.
wildcards = ','.join('?' * 23)

In [None]:
# Positional INSERT statement; values are bound through the '?' placeholders.
insert_sql = 'INSERT INTO dataset VALUES ({})'.format(wildcards)

In [None]:
# Bulk-insert all rows into the 'dataset' table and commit before closing so
# the rows are persisted.
conn = sqlite3.connect('test.db')
conn.executemany(insert_sql, data)
conn.commit()
conn.close()

In [None]:
# Free the in-memory frame; the rows are now stored in the SQLite 'dataset' table.
del df_original

In [None]:
# Build the 'problema' table: one row per combination of the 17 feature
# values that occurs with more than one distinct 'heading' label, i.e. feature
# vectors whose labels conflict.
create_problema_sql = '''CREATE TABLE problema AS SELECT above_font_threshold,
         abs_left,
         abs_top,
         all_bold,
         all_italic,
         all_words_capitalized,
         char_count,
         first_capitalized,
         font_magnitude,
         font_size,
         font_variety,
         has_most_frequent_font,
         has_punctuation,
         is_roman_number_prefixed,
         number_count,
         number_prefixed,
         word_count,
         COUNT(DISTINCT heading)
FROM dataset
GROUP BY above_font_threshold,
         abs_left,
         abs_top,
         all_bold,
         all_italic,
         all_words_capitalized,
         char_count,
         first_capitalized,
         font_magnitude,
         font_size,
         font_variety,
         has_most_frequent_font,
         has_punctuation,
         is_roman_number_prefixed,
         number_count,
         number_prefixed,
         word_count
HAVING COUNT(DISTINCT heading) > 1;'''

conn = sqlite3.connect('test.db')
conn.execute(create_problema_sql)
conn.commit()
conn.close()

In [None]:
# Remove the non-heading rows (heading = 0) whose 17 feature values match a
# conflicting feature vector listed in 'problema'.
conn = sqlite3.connect('test.db')
try:
    conn.execute('''
DELETE FROM dataset
WHERE (above_font_threshold,
         abs_left,
         abs_top,
         all_bold,
         all_italic,
         all_words_capitalized,
         char_count,
         first_capitalized,
         font_magnitude,
         font_size,
         font_variety,
         has_most_frequent_font,
         has_punctuation,
         is_roman_number_prefixed,
         number_count,
         number_prefixed,
         word_count) IN (SELECT above_font_threshold,
         abs_left,
         abs_top,
         all_bold,
         all_italic,
         all_words_capitalized,
         char_count,
         first_capitalized,
         font_magnitude,
         font_size,
         font_variety,
         has_most_frequent_font,
         has_punctuation,
         is_roman_number_prefixed,
         number_count,
         number_prefixed,
         word_count FROM problema)
         AND heading = 0
''')
    # The DELETE runs inside an implicit transaction; without an explicit
    # commit it is rolled back when the connection is closed and no row is
    # actually removed.
    conn.commit()
finally:
    conn.close()

In [None]:
# Read the cleaned table back into pandas and release the connection.
conn = sqlite3.connect('test.db')
db_df = pd.read_sql_query("SELECT * FROM dataset", conn)
conn.close()

# Persist the final dataset as a tab-separated file, without the index.
final_dataset_path = BASE_GROTOAP2_DIR + 'dataset_final.tsv'
db_df.to_csv(final_dataset_path, encoding='utf-8', index=False, sep="\t")

In [None]:
# Confirm the column order read back from SQLite; the feature selection below is positional.
db_df.columns

Index(['has_most_frequent_font', 'font_magnitude', 'font_variety',
       'is_roman_number_prefixed', 'above_font_threshold', 'word_count',
       'char_count', 'first_capitalized', 'all_words_capitalized',
       'all_italic', 'all_bold', 'number_prefixed', 'number_count',
       'font_size', 'abs_left', 'has_punctuation', 'abs_top', 'text', 'pmc_id',
       'file_name', 'heading', 'line_idx', 'original_title'],
      dtype='object')

In [None]:
# Inspect the dtypes produced by the SQLite round trip.
db_df.dtypes

has_most_frequent_font      float64
font_magnitude                int64
font_variety                  int64
is_roman_number_prefixed      int64
above_font_threshold          int64
word_count                    int64
char_count                    int64
first_capitalized             int64
all_words_capitalized         int64
all_italic                    int64
all_bold                      int64
number_prefixed               int64
number_count                  int64
font_size                   float64
abs_left                    float64
has_punctuation               int64
abs_top                     float64
text                         object
pmc_id                       object
file_name                    object
heading                       int64
line_idx                      int64
original_title               object
dtype: object

# Treinamento

## Criação do conjunto de treinamento e do conjunto de teste

In [None]:
# Features are the first 17 columns (everything before 'text'); the target is
# the 'heading' flag.
feature_columns = db_df.columns[0:17]
X = db_df[feature_columns].astype('float32')
y = db_df["heading"].astype('int')

# Persist the full feature matrix and target vector.
X.to_csv(BASE_GROTOAP2_DIR + 'X.tsv', encoding='utf-8', index=False, sep="\t")
y.to_csv(BASE_GROTOAP2_DIR + 'y.tsv', encoding='utf-8', index=False, sep="\t")

In [None]:
# db_df is no longer needed once X and y have been extracted.
del db_df

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Persist each split as '<name>.tsv' in the dataset folder.
splits = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
for split_name, split_data in splits.items():
    split_data.to_csv(BASE_GROTOAP2_DIR + split_name + '.tsv', encoding='utf-8', index=False, sep="\t")

In [None]:
from sklearn.preprocessing import StandardScaler
from pickle import dump

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames, keeping the original index and
# column names.
X_train_scaled = pd.DataFrame(X_train_s, index=X_train.index, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_s, index=X_test.index, columns=X_test.columns)

# Persist the fitted scaler; the context manager guarantees the file is
# flushed and closed.
with open(BASE_GROTOAP2_DIR + 'scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

In [None]:
# Persist the scaled splits as '<name>.tsv' in the dataset folder.
for scaled_name, scaled_frame in (('X_train_scaled', X_train_scaled), ('X_test_scaled', X_test_scaled)):
    scaled_frame.to_csv(BASE_GROTOAP2_DIR + scaled_name + '.tsv', encoding='utf-8', index=False, sep="\t")

## Treinamento

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# (name, estimator) pairs to train. The forest is seeded so that training and
# the reported metrics are reproducible, matching the seed already used for
# the train/test split and the cross-validation folds.
models = [('RandomForestClassifier', RandomForestClassifier(random_state=42)),]

In [None]:
# Train the first configured model on the scaled training features.
model = models[0]
print('Model ' + model[0])
clf = model[1]
clf = clf.fit(X_train_scaled.values, y_train.values)

# Persist the fitted classifier; the context manager guarantees the file is
# flushed and closed.
with open(BASE_GROTOAP2_DIR + model[0] + '_im_scaled.pkl', 'wb') as model_file:
    dump(clf, model_file)

Model RandomForestClassifier


In [None]:
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# --- Hold-out evaluation on the scaled test split ---
y_pred = clf.predict(X_test_scaled.values)
report = classification_report(y_test.values, y_pred, output_dict=True)
recall = report['1']['recall']
specificity = report['0']['recall']  # recall of the negative class
precision = report['1']['precision']
f1_score = report['1']['f1-score']
accuracy = accuracy_score(y_test, y_pred)

# Pin the label order so the 2x2 unpacking below is always (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

print('**** Resultados ****')
print("Recall: {:.3f}".format(recall))
print("Specificity: {:.3f}".format(specificity))
print("Precision: {:.3f}".format(precision))
print("f1-score: {:.3f}".format(f1_score))
print("Accuracy {:.3f}".format(accuracy))
print('**** Matriz Confusão ****')
print('TP: ' + str(tp))
print('FP: ' + str(fp))
print('FN: ' + str(fn))
print('TN: ' + str(tn))

print('**** Relatório ****')
print(report)

# --- 10-fold stratified cross-validation on the training split ---
# Uses the same scaled feature array the persisted model was fitted on, so the
# cross-validated F1 describes the same pipeline as the hold-out metrics.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(clf, X_train_scaled.values, y_train.values.ravel(), cv=kfold, scoring='f1')

print('**** Cross Validation ****')
print(scores)
print("F1 mean: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

**** Resultados ****
Recall: 0.920
Specificity: 0.999
Precision: 0.959
f1-score: 0.939
Accuracy 0.998
**** Matriz Confusão ****
TP: 25900
FP: 1120
FN: 2247
TN: 1556171
**** Relatório ****
{'0': {'precision': 0.9985581532040826, 'recall': 0.9992808023677013, 'f1-score': 0.9989193470892178, 'support': 1557291}, '1': {'precision': 0.9585492227979274, 'recall': 0.9201691121611539, 'f1-score': 0.9389671361502347, 'support': 28147}, 'accuracy': 0.9978762966448388, 'macro avg': {'precision': 0.9785536880010051, 'recall': 0.9597249572644276, 'f1-score': 0.9689432416197263, 'support': 1585438}, 'weighted avg': {'precision': 0.9978478565137409, 'recall': 0.9978762966448388, 'f1-score': 0.9978549882929106, 'support': 1585438}}
**** Cross Validation ****
[0.93983031 0.94382899 0.93883936 0.9326036  0.93705204 0.93952906
 0.94164394 0.93917123 0.93898252 0.94163607]
F1 mean: 0.939 (+/- 0.006)
