In [1]:
# Standard Python libraries
import os
import random

# Data handling libraries
import pandas as pd
import numpy as np
import sqlite3

# Machine learning libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import resample

# PyTorch and related libraries
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Transformers library for NLP tasks
from transformers import BertTokenizer, BertForSequenceClassification

# Visualization and utility libraries
from tabulate import tabulate
from tqdm import trange

# Deep learning libraries
import torchvision
import torchaudio


In [2]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
db_path = 'bwwt.db'  # Replace with the actual path to your SQLite database file
conn = sqlite3.connect(db_path)

# Define the SQL query to retrieve data from the table
query = 'SELECT * FROM FIELD_NOTES'

# Use pandas to read the query result into a DataFrame
df = pd.read_sql_query(query, conn)

# Close the database connection
conn.close()

# Display the DataFrame
df.head()


Unnamed: 0,FIELD_NOTES_ID,WELL_TEST_ID,COMMENT_CATEGORY,ODOUR_CAT,FIELD_NOTE_COMMENTS,CREATE_TIMESTAMP,CREATE_USERID,H2S_TESTED
0,1058147,3569,Landowner,1.0,Landowner indicated good water quality.,38:47.7,CONVERSION,
1,1058149,9782,Landowner,1.0,Landowner indicated good water quality.,38:47.7,CONVERSION,0.0
2,1058151,3570,Landowner,3.0,Water has sulfur taste and odour and is someti...,38:47.7,CONVERSION,
3,1058153,4649,Landowner,1.0,Landowner indicates poor water quality. Methan...,38:47.7,CONVERSION,1.0
4,1058155,840,Landowner,1.0,Landowner indicated good quality water with si...,38:47.7,CONVERSION,


# Cleanse 'FIELD_NOTE_COMMENTS' of form entries

Form entries in the format shown below and since they are in a difference language format with ü serving as a checkmark, should not be included.



'Is the water supply currently enough for your present requirements?   Yes  ü     No _____ 

'Is the water supply currently enough for your present requirements?   Yes  ü     No _____ 


TASTE:         Good   ü   Fair ___ Poor ___ Comments: __________________________________________

ODOUR:       Good   ü  Fair ___ Poor ___ Comments: __________________________________________

COLOUR:     Good   ü   Fair ___ Poor ___ Comments: ___________________________________________

SEDIMENT:  Good  ü    Fair ___ Poor ___ Comments: __________________________________________

GAS OBSERVED IN WATER:   Yes  __  No  ü'

In [3]:
# Create a DataFrame where 'FIELD_NOTE_COMMENTS' contains 'ü'
contains_u_form_df = df[df['FIELD_NOTE_COMMENTS'].str.contains('ü', case=False, na=False)]

# Create a DataFrame where 'FIELD_NOTE_COMMENTS' does not contain 'ü'
no_u_form_df = df[~df['FIELD_NOTE_COMMENTS'].str.contains('ü', case=False, na=False)]


# Drop nulls from Field Notes

In [4]:
df_field_notes_cleaned = no_u_form_df.dropna(subset=['FIELD_NOTE_COMMENTS'])

# Import Manually Labelled Field Notes 

In [5]:
# grab rows where field notes have been manually labelled 
df_labelled = no_u_form_df[pd.notna(no_u_form_df['ODOUR_CAT'])]

# Drop rows with 'ODOUR_CAT' values 0 and 1 (values where odour is not mentioned: 1, or there is a checklist: 0)

# 3 means there was an odour, and 2 means there was no odour, or the odour was decribed as good

df_labelled = df_labelled[~df_labelled['ODOUR_CAT'].isin([0, 1])]

# Replace values in 'ODOUR_CAT' column
df_labelled['ODOUR_CAT'] = df_labelled['ODOUR_CAT'].replace({3: 1, 2: 0})

df_labelled.shape

(1698, 8)

# Find comments that mention odour by searching for comments that contain a list of words related to smell

In [6]:
smell_words = ['odour', 'odor', 'smell', 'smells', 'sulphur', 'rotten egg', 'scent']

smell_words_df = df_field_notes_cleaned[df_field_notes_cleaned['FIELD_NOTE_COMMENTS'].str.contains('|'.join(smell_words), case=False)]

# Create a balanced df of labelled data

In [7]:

from sklearn.utils import resample

def random_undersample(df, target_column, minority_class_value, majority_class_value, undersample_ratio=1.0, random_seed=None):
    """
    Randomly undersample a binary categorical feature in a DataFrame.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - target_column (str): The column to undersample.
    - minority_class_value: The value representing the minority class.
    - majority_class_value: The value representing the majority class.
    - undersample_ratio (float): The ratio of the number of minority class samples to keep.
    - random_seed (int or None): Seed for reproducibility.

    Returns:
    - pd.DataFrame: The undersampled DataFrame.
    """

    # Separate majority and minority classes
    majority_class = df[df[target_column] == majority_class_value]
    minority_class = df[df[target_column] == minority_class_value]

    # Undersample majority class
    undersampled_majority = resample(
        majority_class,
        replace=False,
        n_samples=int(len(minority_class) * undersample_ratio),
        random_state=random_seed
    )

    # Concatenate minority class and undersampled majority class
    undersampled_df = pd.concat([minority_class, undersampled_majority])

    return undersampled_df

In [8]:
undersampled_labelled=random_undersample(df_labelled, 'ODOUR_CAT', 1, 0, undersample_ratio=1.0, random_seed=9898)

In [9]:
text = undersampled_labelled.FIELD_NOTE_COMMENTS.values
labels = undersampled_labelled['ODOUR_CAT'].values

In [10]:
from transformers import BertTokenizer

# Download the tokenizer for 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False)

# Save the tokenizer to a local directory (optional)
#tokenizer.save_pretrained('bert-base-uncased-tokenizer')

# You can now use the 'tokenizer' object to tokenize and encode text


# Print Random Sentence

In [11]:
import numpy as np
from numpy import random
from tabulate import tabulate
def print_rand_sentence():
  '''Displays the tokens and respective IDs of a random text sample'''
  index = random.randint(0, len(text)-1)
  table = np.array([tokenizer.tokenize(text[index]), 
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[index]))]).T
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

╒═══════════╤═════════════╕
│ Tokens    │   Token IDs │
╞═══════════╪═════════════╡
│ [UNK]     │         100 │
├───────────┼─────────────┤
│ indicated │        5393 │
├───────────┼─────────────┤
│ hard      │        2524 │
├───────────┼─────────────┤
│ water     │        2300 │
├───────────┼─────────────┤
│ with      │        2007 │
├───────────┼─────────────┤
│ iron      │        3707 │
├───────────┼─────────────┤
│ o         │        1051 │
├───────────┼─────────────┤
│ ##dou     │       26797 │
├───────────┼─────────────┤
│ ##r       │        2099 │
├───────────┼─────────────┤
│ .         │        1012 │
├───────────┼─────────────┤
│ [UNK]     │         100 │
├───────────┼─────────────┤
│ stain     │       21101 │
├───────────┼─────────────┤
│ ##ing     │        2075 │
├───────────┼─────────────┤
│ caused    │        3303 │
├───────────┼─────────────┤
│ by        │        2011 │
├───────────┼─────────────┤
│ water     │        2300 │
├───────────┼─────────────┤
│ .         │       

In [12]:
import torch

token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids']) 
    attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)

labels = torch.tensor(labels)
labels = labels.long()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
def print_rand_sentence_encoding():
    '''Displays tokens, token IDs and attention mask of a random text sample'''
    
    index = random.randint(0, len(text) - 1)
    tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
    token_ids = [i.numpy() for i in token_id[index]]
    attention = [i.numpy() for i in attention_masks[index]]

    table = np.array([tokens, token_ids, attention]).T
    print(tabulate(table, 
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

╒═══════════╤═════════════╤══════════════════╕
│ Tokens    │   Token IDs │   Attention Mask │
╞═══════════╪═════════════╪══════════════════╡
│ [CLS]     │         101 │                1 │
├───────────┼─────────────┼──────────────────┤
│ [UNK]     │         100 │                1 │
├───────────┼─────────────┼──────────────────┤
│ water     │        2300 │                1 │
├───────────┼─────────────┼──────────────────┤
│ is        │        2003 │                1 │
├───────────┼─────────────┼──────────────────┤
│ not       │        2025 │                1 │
├───────────┼─────────────┼──────────────────┤
│ pumped    │       16486 │                1 │
├───────────┼─────────────┼──────────────────┤
│ for       │        2005 │                1 │
├───────────┼─────────────┼──────────────────┤
│ a         │        1037 │                1 │
├───────────┼─────────────┼──────────────────┤
│ long      │        2146 │                1 │
├───────────┼─────────────┼──────────────────┤
│ time      │

# Train and Validate Model

In [14]:
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler



val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [15]:
def b_tp(preds, labels):
    '''Returns True Positives (TP): count of correct predictions of actual class 1'''
    return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
    '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
    return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
    '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
    return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
    '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
    return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])


def b_metrics(preds, labels):
    '''Returns the following metrics:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)'''
    preds = np.argmax(preds, axis = 1).flatten()
    labels = labels.flatten()
    tp = b_tp(preds, labels)
    tn = b_tn(preds, labels)
    fp = b_fp(preds, labels)
    fn = b_fn(preds, labels)
    b_accuracy = (tp + tn) / len(labels)
    b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
    b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
    b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
    return b_accuracy, b_precision, b_recall, b_specificity

In [16]:
import torch

if torch.cuda.is_available():
    print("CUDA is available. GPU will be used.")
else:
    print("CUDA is not available. CPU will be used.")

CUDA is available. GPU will be used.


In [17]:

# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

# Run on GPU
model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables 
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')


Epoch:  50%|██████████████████████████████████████▌                                      | 1/2 [00:22<00:22, 22.13s/it]


	 - Train loss: 0.5502
	 - Validation Accuracy: 0.8646
	 - Validation Precision: 0.9209
	 - Validation Recall: 0.8062
	 - Validation Specificity: 0.9171



Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 2/2 [00:43<00:00, 21.76s/it]


	 - Train loss: 0.3420
	 - Validation Accuracy: 0.8542
	 - Validation Precision: 0.9727
	 - Validation Recall: 0.7356
	 - Validation Specificity: 0.9674






In [26]:
model.eval()  # Set the model to evaluation mode


# New sentence
new_sentence = "there was a funny odour in the sample"



# We need Token IDs and Attention Mask for inference on the new sentence
test_ids = []
test_attention_mask = []

# Apply the tokenizer
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim=0)
test_attention_mask = torch.cat(test_attention_mask, dim=0)

# Forward pass, calculate logit predictions
with torch.no_grad():
    output = model(test_ids.to(device), token_type_ids=None, attention_mask=test_attention_mask.to(device))

# Assuming you have a LabelEncoder for converting model predictions to class labels
from sklearn.preprocessing import LabelEncoder

# Define the classes
class_labels = ["do_not_test", "test_for_H2S"]

# Create a LabelEncoder
label_encoder = LabelEncoder()

# Set the classes for the LabelEncoder
label_encoder.classes_ = class_labels


prediction = label_encoder.classes_[np.argmax(output.logits.cpu().numpy())]

print('Input Sentence:', new_sentence)
print('Predicted Class:', prediction)

Input Sentence: there was a funny odour in the sample
Predicted Class: test_for_H2S


# Make predictions using a pre-trained natural language processing (NLP) model. 

In [27]:
def predict_smell(text):
    model.eval()  # Set the model to evaluation mode


    # New sentence
    new_sentence = text

    # We need Token IDs and Attention Mask for inference on the new sentence
    test_ids = []
    test_attention_mask = []

    # Apply the tokenizer
    encoding = preprocessing(new_sentence, tokenizer)

    # Extract IDs and Attention Mask
    test_ids.append(encoding['input_ids'])
    test_attention_mask.append(encoding['attention_mask'])
    test_ids = torch.cat(test_ids, dim=0)
    test_attention_mask = torch.cat(test_attention_mask, dim=0)

    # Forward pass, calculate logit predictions
    with torch.no_grad():
        output = model(test_ids.to(device), token_type_ids=None, attention_mask=test_attention_mask.to(device))

    # Assuming you have a LabelEncoder for converting model predictions to class labels
    from sklearn.preprocessing import LabelEncoder

    # Define the classes
    class_labels = [0, 1]

    # Create a LabelEncoder
    label_encoder = LabelEncoder()

    # Set the classes for the LabelEncoder
    label_encoder.classes_ = class_labels


    prediction = label_encoder.classes_[np.argmax(output.logits.cpu().numpy())]


    return prediction

# Create a new column 'MODEL_PREDICTION' with the model predictions
smell_words_df['MODEL_PREDICTION'] = smell_words_df['FIELD_NOTE_COMMENTS'].apply(predict_smell)

# Display the updated DataFrame
smell_words_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smell_words_df['MODEL_PREDICTION'] = smell_words_df['FIELD_NOTE_COMMENTS'].apply(predict_smell)


Unnamed: 0,FIELD_NOTES_ID,WELL_TEST_ID,COMMENT_CATEGORY,ODOUR_CAT,FIELD_NOTE_COMMENTS,CREATE_TIMESTAMP,CREATE_USERID,H2S_TESTED,MODEL_PREDICTION
2,1058151,3570,Landowner,3.0,Water has sulfur taste and odour and is someti...,38:47.7,CONVERSION,,1
7,1058161,6812,Landowner,3.0,"Well used for domestic purposes, well complete...",38:47.7,CONVERSION,,1
8,1058163,3059,Landowner,2.0,Rust rings in house. Landowner indicated fair ...,38:47.7,CONVERSION,,0
10,1058167,6536,Landowner,3.0,Landowner indicated good water quality but exp...,38:47.7,CONVERSION,,1
11,1058169,3378,Landowner,3.0,Landowner indicated good taste until 2-3 weeks...,38:47.7,CONVERSION,,1


In [33]:
print(smell_words_df['FIELD_NOTE_COMMENTS'].iloc[2])
smell_words_df.iloc[2]

Rust rings in house. Landowner indicated fair water taste and odour, some sediment and good color.


FIELD_NOTES_ID                                                   1058163
WELL_TEST_ID                                                        3059
COMMENT_CATEGORY                                               Landowner
ODOUR_CAT                                                            2.0
FIELD_NOTE_COMMENTS    Rust rings in house. Landowner indicated fair ...
CREATE_TIMESTAMP                                                 38:47.7
CREATE_USERID                                                 CONVERSION
H2S_TESTED                                                           NaN
MODEL_PREDICTION                                                       0
Name: 8, dtype: object

In [37]:
mask = pd.isna(smell_words_df['ODOUR_CAT'])

# Use the mask to filter the DataFrame
unlabelled_rows = smell_words_df[mask]

print(unlabelled_rows['FIELD_NOTE_COMMENTS'].iloc[2])
unlabelled_rows.iloc[2]

Well used for domestic/stock purposes, well completed with a pitless adapter, 152mm well diameter, submersible pump /pressure tank system cannot be bypassed, no water treatment system, well chlorinated by owner in November 2005, slight sulphur smell, no gas present, water quality and pumping test completed when drilled, well has been in consistent use and pumped daily, no fuel stored on site, septic field located 75m East-Southeast of well.


FIELD_NOTES_ID                                                   1058261
WELL_TEST_ID                                                        7554
COMMENT_CATEGORY                                               Landowner
ODOUR_CAT                                                            NaN
FIELD_NOTE_COMMENTS    Well used for domestic/stock purposes, well co...
CREATE_TIMESTAMP                                                 38:47.7
CREATE_USERID                                                 CONVERSION
H2S_TESTED                                                           NaN
MODEL_PREDICTION                                                       1
Name: 57, dtype: object