In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# Load the dataset
# The tagged-cell frame is read from a pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
file_path = '../models/cell-classification/input/ready_to_train/tagged_cells_df.pkl'  # Update this to your file path
data = pd.read_pickle(file_path)

# Print (rows, columns) as a quick check that the load produced a frame.
print(data.shape)

(10426, 1032)


In [2]:
# Preview the first rows of the freshly loaded frame (head() defaults to five rows).
data.head()

Unnamed: 0,cell_id,cell_type,entity_type,cell_words,cell_content,cell_width,cell_height,cell_left,cell_top,row_index,...,cell_content_tfidf_991,cell_content_tfidf_992,cell_content_tfidf_993,cell_content_tfidf_994,cell_content_tfidf_995,cell_content_tfidf_996,cell_content_tfidf_997,cell_content_tfidf_998,cell_content_tfidf_999,cell_role
0,76a754a3-6890-48c5-b479-f548860cfe50,TABLE_TITLE,,"[Nameplate, Information]",Nameplate Information,0.846682,0.021104,0.086272,0.181804,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,title
1,76366028-7abd-4e82-b3fb-c323eb486bbd,CHILD,TABLE_TITLE,"[Nameplate, Information]",Nameplate Information,0.236526,0.021361,0.086304,0.181983,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,title
2,7579fcb6-034c-457d-a186-391a167320ff,MERGED_CELL,TABLE_TITLE,[],,0.846682,0.021298,0.086272,0.182046,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,empty
3,7aeb3142-37da-4408-8cd1-47fa050b86de,CHILD,,[Manufacturer:],Manufacturer:,0.236559,0.019694,0.086271,0.20332,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,key
4,befba715-9e9a-48df-9688-4562d9b7f211,CHILD,,[Model:],Model:,0.236561,0.019694,0.08627,0.22299,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,key


In [3]:
# Replace every missing value in the frame with an empty string so that the
# text columns can later be label-encoded and concatenated as plain strings.
data = data.fillna('')

# Filling values does not add or drop rows/columns; print the shape to confirm.
print(data.shape)

(10426, 1032)


In [4]:
# Encode categorical data
# Each column gets its own LabelEncoder. fit_transform() refits the encoder it
# is called on, so sharing one instance across both columns would leave it
# holding only the entity_type classes and make it impossible to map
# table_class_encoded back to the original labels.
table_class_encoder = LabelEncoder()
entity_type_encoder = LabelEncoder()

data['table_class_encoded'] = table_class_encoder.fit_transform(data['table_class'])
data['entity_type_encoded'] = entity_type_encoder.fit_transform(data['entity_type'])

# Backward-compatible alias: code that still refers to `label_encoder` sees the
# encoder fitted on entity_type, as it did when a single instance was reused.
label_encoder = entity_type_encoder

# Two integer-coded columns were added; print the shape to confirm.
print(data.shape)

(10426, 1034)


In [5]:
# One-hot encode the table_type
# get_dummies replaces the 'table_type' column with one indicator column per
# category, named 'table_type_<value>'.
# NOTE(review): because the source column is removed, running this cell a
# second time on the same frame is expected to raise a KeyError.
data = pd.get_dummies(data, columns=['table_type'])
print(data.shape)

(10426, 1035)


In [6]:
# Preview the frame again to inspect the newly added encoded / indicator columns.
data.head()

Unnamed: 0,cell_id,cell_type,entity_type,cell_words,cell_content,cell_width,cell_height,cell_left,cell_top,row_index,...,cell_content_tfidf_995,cell_content_tfidf_996,cell_content_tfidf_997,cell_content_tfidf_998,cell_content_tfidf_999,cell_role,table_class_encoded,entity_type_encoded,table_type_SEMI_STRUCTURED_TABLE,table_type_STRUCTURED_TABLE
0,76a754a3-6890-48c5-b479-f548860cfe50,TABLE_TITLE,,"[Nameplate, Information]",Nameplate Information,0.846682,0.021104,0.086272,0.181804,0,...,0.0,0.0,0.0,0.0,0.0,title,2,0,1,0
1,76366028-7abd-4e82-b3fb-c323eb486bbd,CHILD,TABLE_TITLE,"[Nameplate, Information]",Nameplate Information,0.236526,0.021361,0.086304,0.181983,1,...,0.0,0.0,0.0,0.0,0.0,title,2,3,1,0
2,7579fcb6-034c-457d-a186-391a167320ff,MERGED_CELL,TABLE_TITLE,[],,0.846682,0.021298,0.086272,0.182046,1,...,0.0,0.0,0.0,0.0,0.0,empty,2,3,1,0
3,7aeb3142-37da-4408-8cd1-47fa050b86de,CHILD,,[Manufacturer:],Manufacturer:,0.236559,0.019694,0.086271,0.20332,2,...,0.0,0.0,0.0,0.0,0.0,key,2,0,1,0
4,befba715-9e9a-48df-9688-4562d9b7f211,CHILD,,[Model:],Model:,0.236561,0.019694,0.08627,0.22299,3,...,0.0,0.0,0.0,0.0,0.0,key,2,0,1,0


In [None]:
# Select the model inputs (X) and the label column (y) from the encoded frame.
# NOTE(review): these inputs do not include a 'combined_text' column, which the
# ColumnTransformer in the following modelling cell selects by name — confirm
# which feature set that pipeline is meant to receive.
X = data.loc[:, ['table_class_encoded', 'row_index', 'column_index']]
y = data.loc[:, 'cell_role']

# Categorize Cells, considering Context + Cell Content

In [30]:
# Preprocess data
# Fill any remaining missing values with '' so the string concatenation below
# never meets NaN.
data = data.fillna('')

# Join the cell's own text with the words_above / words_left / words_below /
# words_right columns (presumably the text of neighbouring cells — confirm)
# into one space-separated string for TF-IDF.
data['combined_text'] = data['cell_content'] + " " + data['words_above'] + " " + data['words_left'] + " " + data['words_below'] + " " + data['words_right']

# Prepare features and target
# X has to be built here, after 'combined_text' exists. The ColumnTransformer
# below selects 'combined_text' and 'table_class_encoded' by name, so a feature
# frame created before the text column was added cannot be used to fit it.
X = data[['combined_text', 'table_class_encoded']]
y = data['cell_role']

# Split the dataset (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define TF-IDF Vectorization and preprocessing pipeline
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
preprocessor = ColumnTransformer(
    transformers=[
        # The text column is given as a plain string (not a list) so the
        # vectorizer receives a 1-D sequence of documents.
        ('text', tfidf_vectorizer, 'combined_text'),
        # NOTE(review): table_class_encoded is an arbitrary integer code from
        # LabelEncoder; scaling it treats the classes as ordered — consider
        # one-hot encoding instead.
        ('num', StandardScaler(), ['table_class_encoded'])
    ]
)

# Define the model
svm_model = LinearSVC(random_state=42)

# Create a pipeline that includes preprocessing and model, so the TF-IDF
# vocabulary and scaler statistics are learned from the training split only.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', svm_model)
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the testing set
y_pred = pipeline.predict(X_test)

# Evaluate the model's performance (per-class precision / recall / F1)
print(classification_report(y_test, y_pred))



               precision    recall  f1-score   support

      comment       1.00      1.00      1.00        52
        empty       0.98      0.96      0.97      1984
          key       0.99      0.98      0.98      2885
      key_1_x       0.94      1.00      0.97      1551
      key_1_y       0.97      0.97      0.97      1071
      key_2_x       1.00      1.00      1.00       273
      key_2_y       1.00      0.99      0.99       883
 key_diagonal       0.79      0.99      0.88       220
merged_helper       1.00      1.00      1.00      1489
        title       1.00      0.94      0.97      1950
        value       0.99      0.99      0.99      6869

     accuracy                           0.98     19227
    macro avg       0.97      0.98      0.97     19227
 weighted avg       0.98      0.98      0.98     19227



# Categorize Cells using Context Only

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# Load the dataset
# Read the tagged-cell CSV and blank out missing values in one step so the
# text columns below can be joined as plain strings.
file_path = '../T_cells_tagged.csv'  # Update this to your file path
data = pd.read_csv(file_path).fillna('')

# Context-only text feature: the four words_* columns joined with single
# spaces; the cell's own content is deliberately left out.
data['contextual_text'] = data['words_above'].str.cat(
    [data['words_left'], data['words_below'], data['words_right']],
    sep=" ",
)

# Integer-code the table class.
label_encoder = LabelEncoder().fit(data['table_class'])
data['table_class_encoded'] = label_encoder.transform(data['table_class'])

# Model inputs and labels.
X = data.loc[:, ['contextual_text', 'table_class_encoded']]
y = data.loc[:, 'cell_role']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF on the context text (vocabulary capped at 1000 terms) plus a
# standard-scaled table-class code, feeding a linear SVM.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
preprocessor = ColumnTransformer(
    transformers=[
        ('text', tfidf_vectorizer, 'contextual_text'),
        ('num', StandardScaler(), ['table_class_encoded']),
    ],
)
svm_model = LinearSVC(random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', svm_model)])

# Fit on the training split, then score the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))




               precision    recall  f1-score   support

      comment       0.00      0.00      0.00        52
        empty       0.96      0.92      0.94      1984
          key       0.92      0.98      0.95      2885
      key_1_x       0.95      0.98      0.96      1551
      key_1_y       0.99      0.98      0.98      1071
      key_2_x       1.00      1.00      1.00       273
      key_2_y       0.94      1.00      0.97       883
 key_diagonal       1.00      1.00      1.00       220
merged_helper       0.77      1.00      0.87      1489
        title       0.94      0.62      0.75      1950
        value       0.97      0.98      0.97      6869

     accuracy                           0.94     19227
    macro avg       0.86      0.86      0.85     19227
 weighted avg       0.94      0.94      0.93     19227

