In [1]:
from functools import reduce

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN

In [2]:
def char_concatenate(series):
    return reduce(lambda x, y: x + y, series) # concatenates chars in the same cluster

In [3]:
def word_concatenate(series):
    return reduce(lambda x, y: x + ' ' + y, series)

In [4]:
df = pd.read_csv('./leaflets/pdfs_chars_output/RO43-1000_chars.csv', encoding='utf-8 sig')

In [16]:
# remove char duplicates
df.drop_duplicates(inplace=True)

In [17]:
# Create empty resulting dataframe to add page groups into it
group_col_names = ['page_number', 'chars', 'x0', 'x1', 'y0', 'y1']
groups_df = pd.DataFrame(columns = group_col_names)

In [18]:
Y_SCALER = 5 # used to size up the y axis to separate text rows from each other (helps cluster words better, as they are close on the x axis)
HP_EPS_WORD = 0.055 # best hyperparameter for DBSCAN that clusters chars into words
HP_EPS_GROUP = 0.035 # best hyperparameter for DBSCAN that clusters words into semantic groups

features = ['x_center', 'y_center', 'height']

page_numbers = df['page_number'].unique()
for page in page_numbers:
    page_df = df[df['page_number'] == page]
    
    page_df['x_center'] = (page_df['x0'] + page_df['x1']) / 2
    page_df['y_center'] = (page_df['y0'] + page_df['y1']) / 2
    
    char_scaler = MinMaxScaler()
    X_char_scaled = char_scaler.fit_transform(page_df[features])
    X_char_scaled[:,1] *= Y_SCALER
    
    char_clustering = DBSCAN(eps=HP_EPS_WORD, min_samples=1)
    char_clustering.fit(X_char_scaled)
    
    page_df['labels'] = char_clustering.labels_
    page_words = page_df.groupby('labels', as_index=False).agg({'page_number': pd.Series.mode,
                                                'chars': char_concatenate,
                                                'x0': min,
                                                'x1': max,
                                                'y0': min,
                                                'y1': max})
    
    page_words['height'] = page_words['y1'] - page_words['y0']
    page_words['x_center'] = (page_words['x0'] + page_words['x1']) / 2
    page_words['y_center'] = (page_words['y0'] + page_words['y1']) / 2
    
    word_scaler = MinMaxScaler()
    X_word_scaled = word_scaler.fit_transform(page_words[features])
    
    group_clustering = DBSCAN(eps=HP_EPS_GROUP, min_samples=1)
    group_clustering.fit(X_word_scaled)
    
    page_words['labels'] = group_clustering.labels_
    
    page_groups = page_words.groupby('labels', as_index=False).agg({'page_number': pd.Series.mode,
                                                                      'chars': word_concatenate,
                                                                      'x0': min,
                                                                      'x1': max,
                                                                      'y0': min,
                                                                      'y1': max})
    
    groups_df = pd.concat([groups_df, page_groups])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  page_df['x_center'] = (page_df['x0'] + page_df['x1']) / 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  page_df['y_center'] = (page_df['y0'] + page_df['y1']) / 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  page_df['labels'] = char_clustering.labels_
A value is trying to be set on a copy of a sl

In [19]:
groups_df['labels'] = groups_df['labels'].astype(int)

In [21]:
groups_df.to_csv('leaflets/pdfs_word_groups_output/RO43-1000_groups.csv', encoding='utf-8 sig', index=False)