In [None]:
!pip install transformers
!pip install tensorflow
!pip install scikit-learn


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf

import keras
from keras import layers

from transformers import DistilBertTokenizer, TFDistilBertModel
import matplotlib.pyplot as plt


In [None]:
from google.colab import drive

# Make Google Drive visible to the Colab runtime under /content/drive.
drive.mount('/content/drive')

# Location of the review dataset inside the mounted Drive.
# NOTE(review): the file name starts with a space (right after "MyDrive/");
# verify that this matches the real file name and is not a typo.
csv_path = '/content/drive/MyDrive/ input_data.csv'
df = pd.read_csv(csv_path)



In [None]:
# --- Text features -----------------------------------------------------------
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
MAX_LEN = 400  # every review is padded or truncated to this many tokens

# The tokenizer only accepts str inputs. A missing review is NaN (a float) and
# would abort tokenization of the whole list, so blanks are substituted first.
review_texts = df['Review'].fillna('').astype(str).tolist()

# Ask for NumPy output directly instead of building TensorFlow tensors for the
# entire corpus and immediately converting them back with .numpy().
tokens = tokenizer(
    review_texts,
    padding='max_length',
    truncation=True,
    max_length=MAX_LEN,
    return_tensors='np'
)
# int32 matches the dtype declared on the model's text Input layers.
input_ids = tokens['input_ids'].astype(np.int32)
attention_mask = tokens['attention_mask'].astype(np.int32)

# --- Categorical features ----------------------------------------------------
# Each categorical column gets its own LabelEncoder; the integer codes are
# stacked column-wise, in the order given by categorical_cols.
categorical_cols = ['Reviewer_Nationality', 'Hotel_Name']
encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}
X_categorical = np.stack([encoders[col].transform(df[col]) for col in categorical_cols], axis=1)

# --- Numerical features ------------------------------------------------------
# NOTE(review): the scaler is fitted on every row, including rows that the
# train/validation split further down assigns to validation, so validation
# statistics leak into the scaling. Consider fitting on the training rows only.
numerical_cols = ['Hotel_number_reviews', 'Reviewer_number_reviews']
scaler = StandardScaler()
X_numerical = scaler.fit_transform(df[numerical_cols])

# --- Targets -----------------------------------------------------------------
# Binary target: 1 where Review_Type equals 'Good_review', otherwise 0.
y_class = (df['Review_Type'] == 'Good_review').astype(int)
# Regression target: the review score column, used as-is.
y_reg = df['Review_Score']


In [None]:
# Split every feature and target array in one train_test_split call so that
# the same rows end up in the train (80%) and validation (20%) partitions of
# each array. random_state fixes the shuffle for reproducibility.
arrays_to_split = (
    input_ids, attention_mask, X_categorical, X_numerical, y_class, y_reg
)
split_parts = train_test_split(*arrays_to_split, test_size=0.2, random_state=42)

# train_test_split returns a flat list: (train, validation) for each input
# array, in the order the arrays were passed.
train_input_ids, val_input_ids = split_parts[0], split_parts[1]
train_attention_mask, val_attention_mask = split_parts[2], split_parts[3]
train_categorical, val_categorical = split_parts[4], split_parts[5]
train_numerical, val_numerical = split_parts[6], split_parts[7]
y_class_train, y_class_val = split_parts[8], split_parts[9]
y_reg_train, y_reg_val = split_parts[10], split_parts[11]


In [None]:

# --- Model inputs --------------------------------------------------------------
text_input_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
text_attention_mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')
numerical_input = layers.Input(shape=(X_numerical.shape[1],), dtype='float32', name='numerical_input')

# --- Tabular branch ------------------------------------------------------------
# One integer input plus an 8-dimensional embedding per categorical column.
# The loop walks the fitted encoders themselves rather than a second
# hard-coded list of column names, so the inputs always match what was encoded.
categorical_inputs = []
categorical_embeddings = []
for col, encoder in encoders.items():
    num_classes = len(encoder.classes_)
    input_cat = layers.Input(shape=(1,), dtype='int32', name=f'{col}_input')
    # input_dim is one larger than the number of fitted classes (one spare index).
    embedding = layers.Embedding(input_dim=num_classes + 1, output_dim=8)(input_cat)
    embedding = layers.Flatten()(embedding)
    categorical_inputs.append(input_cat)
    categorical_embeddings.append(embedding)

tabular_concat = layers.Concatenate()(categorical_embeddings + [numerical_input])
tabular_dense = layers.Dense(32, activation='relu')(tabular_concat)

# --- Text branch ---------------------------------------------------------------
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')


def _distilbert_first_token(inputs):
    """Run DistilBERT on (input_ids, attention_mask) and keep the hidden state at position 0."""
    ids, mask = inputs[0], inputs[1]
    return bert_model(ids, attention_mask=mask).last_hidden_state[:, 0, :]


# The Hugging Face model is invoked from inside a Lambda layer, with the output
# shape declared explicitly as 768 features.
# NOTE(review): Keras documents that Lambda layers do not track variables of
# objects used inside them, so DistilBERT presumably behaves as a fixed feature
# extractor here -- confirm that this is the intent.
bert_output = layers.Lambda(
    _distilbert_first_token,
    output_shape=(768,), dtype=tf.float32
)([text_input_ids, text_attention_mask])

# --- Shared trunk and the two heads --------------------------------------------
merged = layers.Concatenate()([bert_output, tabular_dense])
x = layers.Dense(64, activation='relu')(merged)
x = layers.Dropout(0.4)(x)
x = layers.Dense(32, activation='relu')(x)

output_class = layers.Dense(1, activation='sigmoid', name='classification_output')(x)
output_reg = layers.Dense(1, activation='linear', name='regression_output')(x)

model = keras.Model(
    inputs=[text_input_ids, text_attention_mask] + categorical_inputs + [numerical_input],
    outputs=[output_class, output_reg]
)

# Both heads are trained jointly; their losses are weighted equally (0.5 each).
model.compile(
    optimizer="adam",
    loss={
        "classification_output": "binary_crossentropy",
        "regression_output": "mse"
    },
    loss_weights={
        "classification_output": 0.5,
        "regression_output": 0.5
    },
    metrics={
        "classification_output": "accuracy",
        "regression_output": "mse"
    }
)
model.summary()


In [None]:
def _model_inputs(ids, mask, categorical, numerical):
    """Map one partition's feature arrays onto the model's named inputs.

    Column 0 of ``categorical`` feeds the nationality input and column 1 the
    hotel-name input.
    """
    return {
        "input_ids": ids,
        "attention_mask": mask,
        "Reviewer_Nationality_input": categorical[:, 0],
        "Hotel_Name_input": categorical[:, 1],
        "numerical_input": numerical,
    }


def _model_targets(class_target, score_target):
    """Map one partition's targets onto the model's named outputs."""
    return {
        "classification_output": class_target,
        "regression_output": score_target,
    }


# Train for 5 epochs with batches of 16, evaluating on the validation
# partition after every epoch. `history` holds the per-epoch metrics.
history = model.fit(
    x=_model_inputs(
        train_input_ids, train_attention_mask, train_categorical, train_numerical
    ),
    y=_model_targets(y_class_train, y_reg_train),
    validation_data=(
        _model_inputs(
            val_input_ids, val_attention_mask, val_categorical, val_numerical
        ),
        _model_targets(y_class_val, y_reg_val),
    ),
    batch_size=16,
    epochs=5,
)


In [None]:
def plot_history_metric(history, metric_key, title, ylabel, short_label):
    """Plot one training metric against its validation counterpart.

    Args:
        history: the object returned by ``model.fit``; its ``history`` dict is read.
        metric_key: key of the training series; the validation series is read
            from the same key prefixed with ``val_``.
        title: figure title.
        ylabel: label for the y axis.
        short_label: suffix used in the legend ("train <short_label>" /
            "val <short_label>").
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(history.history[metric_key], label=f'train {short_label}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'val {short_label}')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()
    plt.show()


# One figure per tracked quantity: classification accuracy, regression MSE,
# and the combined loss.
plot_history_metric(
    history, 'classification_output_accuracy', 'Classification Accuracy', 'Accuracy', 'acc'
)
plot_history_metric(
    history, 'regression_output_mse', 'Regression MSE', 'MSE', 'mse'
)
plot_history_metric(
    history, 'loss', 'Total Loss', 'Loss', 'loss'
)
