In [None]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Read the raw review data; later cells use its 'Comment' and 'Category' columns.
reviews_csv_path = 'uber_review.csv'
df = pd.read_csv(reviews_csv_path)

In [None]:
# Truncate overly long reviews
max_input_length = 512  # Maximum input length for BERT
# NOTE: this slice counts characters, not tokens; the token-level limit is
# enforced separately by the tokenizer call (truncation=True, max_length=...).
# Missing comments are replaced by an empty string first, because slicing a
# NaN (float) value raises TypeError.
df['Comment'] = df['Comment'].fillna('').astype(str).str[:max_input_length]

# Convert the category labels into integer class ids
label_encoder = LabelEncoder()
df['CategoryLabel'] = label_encoder.fit_transform(df['Category'])

# One-hot encode the integer class ids as a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the new name first and fall back for
# older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    one_hot_encoder = OneHotEncoder(sparse=False)
one_hot_labels = one_hot_encoder.fit_transform(df['CategoryLabel'].values.reshape(-1, 1))

# Hold out 30% of the reviews for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(
    df['Comment'],
    one_hot_labels,
    test_size=0.3,
    random_state=2018,
)
train_text, test_text, train_labels, test_labels = split_parts

# Load the pretrained tokenizer and a sequence-classification model that has
# one output per category known to the label encoder
pretrained_name = 'bert-base-uncased'
num_categories = len(label_encoder.classes_)
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = TFBertForSequenceClassification.from_pretrained(pretrained_name, num_labels=num_categories)

# Tokenize both splits with identical settings
tokenize_options = dict(truncation=True, padding=True, max_length=max_input_length)
train_encodings = tokenizer(train_text.tolist(), **tokenize_options)
test_encodings = tokenizer(test_text.tolist(), **tokenize_options)


def _as_tensors(encodings, labels):
    """Pack the token ids, attention mask and labels of one split into a dict of tensors."""
    tensors = {
        field: tf.convert_to_tensor(encodings[field])
        for field in ('input_ids', 'attention_mask')
    }
    tensors['labels'] = tf.convert_to_tensor(labels)
    return tensors


# Prepare the data inputs
train_inputs = _as_tensors(train_encodings, train_labels)
test_inputs = _as_tensors(test_encodings, test_labels)

In [None]:
# Compile and train the model.
# The targets are one-hot vectors and the model output is read as logits,
# hence CategoricalCrossentropy(from_logits=True).
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
# Inputs are passed by name so they cannot be bound to the wrong model argument,
# and the held-out split is scored after every epoch so that the quality of the
# classifier is actually reported instead of being left unmeasured.
model.fit(
    {
        'input_ids': train_inputs['input_ids'],
        'attention_mask': train_inputs['attention_mask'],
    },
    train_inputs['labels'],
    validation_data=(
        {
            'input_ids': test_inputs['input_ids'],
            'attention_mask': test_inputs['attention_mask'],
        },
        test_inputs['labels'],
    ),
    epochs=2,
    batch_size=16,
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f3dc2c45d50>

In [None]:
# Define the prediction and response generation functions
def predict_category(complaint):
    """Return the category name the fine-tuned model assigns to a complaint.

    The complaint is tokenized with the notebook's tokenizer, passed through
    the model, and the index of the largest logit of the first (only) item is
    looked up in ``label_encoder.classes_``.
    """
    tokens = tokenizer(
        complaint,
        truncation=True,
        padding=True,
        max_length=max_input_length,
        return_tensors='tf',
    )
    logits = model(tokens).logits
    best_index = tf.argmax(logits, axis=-1).numpy()[0]
    return label_encoder.classes_[best_index]

def automated_response(complaint):
    """Return the canned reply that matches the predicted category of a complaint.

    Categories without a dedicated reply receive the generic acknowledgement.
    """
    replies = {
        'driver_issues': "We're sorry to hear about your experience. We take your safety seriously and will review your complaint as soon as possible.",
        'account_issues': "We're sorry to hear about your account issues. Please change your password immediately and contact our security team.",
    }
    fallback = "Thank you for your feedback. We will review your complaint and get back to you soon."
    return replies.get(predict_category(complaint), fallback)

# Smoke-test the full pipeline: classify one sample complaint and print the
# canned reply chosen for its predicted category.
complaint = "I had a terrible experience with the driver..."
response = automated_response(complaint)
print(response)

We're sorry to hear about your experience. We take your safety seriously and will review your complaint as soon as possible.


In [None]:
# Display the category predicted for the sample complaint defined above.
predict_category(complaint)

'driver_issues'