# Imports

In [162]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from datetime import datetime





# Read and preprocess data

In [163]:

# Load the labeled reviews and print a quick overview of the raw table.
df = pd.read_csv('data/reviews_labeled.csv')
print(df.columns)
print(df.dtypes)
print(df.head())

# Convert labels to numbers
convert_dir = {'ADVERTISEMENT': 0, 'GOOD': 1, 'IRRELEVANT': 2, 'RANTS': 3}
# Fail loudly on labels that are not in the mapping; a silent NaN from `.map`
# would only surface later as a confusing dtype or stratification error.
unknown_labels = set(df['label'].unique()) - set(convert_dir)
if unknown_labels:
    raise KeyError(f"Unmapped label values: {sorted(unknown_labels, key=str)}")
df['num_label'] = df['label'].map(convert_dir)
print(df.head())

x = df[['text', 'rating']]
y = df['num_label']

# Split into train (80%) and test (20%)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y  # stratify keeps class balance
)

# normalize tabular features
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no test-set statistics leak into the features the model trains on.
# Note: `df['rating']` itself is left unscaled; only x_train / x_test are scaled.
scaler = StandardScaler()
x_train = x_train.assign(rating=scaler.fit_transform(x_train[['rating']]).ravel())
x_test = x_test.assign(rating=scaler.transform(x_test[['rating']]).ravel())
print(x_train["rating"].head())


Index(['business_name', 'author_name', 'text', 'photo', 'rating',
       'rating_category', 'label'],
      dtype='object')
business_name      object
author_name        object
text               object
photo              object
rating              int64
rating_category    object
label              object
dtype: object
                     business_name    author_name  \
0  Haci'nin Yeri - Yigit Lokantasi    Gulsum Akar   
1  Haci'nin Yeri - Yigit Lokantasi  Oguzhan Cetin   
2  Haci'nin Yeri - Yigit Lokantasi     Yasin Kuyu   
3  Haci'nin Yeri - Yigit Lokantasi     Orhan Kapu   
4  Haci'nin Yeri - Yigit Lokantasi     Ozgur Sati   

                                                text  \
0  We went to Marmaris with my wife for a holiday...   
1  During my holiday in Marmaris we ate here to f...   
2  Prices are very affordable. The menu in the ph...   
3  Turkey's cheapest artisan restaurant and its f...   
4  I don't know what you will look for in terms o...   

                        

# Feature Extraction

In [164]:
# Load the pretrained DistilBERT tokenizer and encoder from the same checkpoint.
# NOTE: the name `model` is rebound to the HybridClassifier in a later cell, so
# the embedding extraction must run before that cell does.
PRETRAINED_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_NAME)
model = DistilBertModel.from_pretrained(PRETRAINED_NAME)


In [165]:
# function to extract features
def feature_extraction(text):
    """Return the DistilBERT [CLS] embedding for `text`.

    Uses the module-level `tokenizer` and `model` (the DistilBERT encoder at
    the time this is called). Input longer than the tokenizer's limit is
    truncated.

    Args:
        text: The review text to embed. A single string yields one row;
            presumably a list of strings yields one row per string
            (padding is enabled) -- confirm before relying on it.

    Returns:
        A tensor holding the last-layer hidden state of the first token,
        shape (1, 768) for a single string. It carries no autograd graph.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Pure inference: disable autograd here so the function is safe to call
    # even when the caller forgets to wrap it in torch.no_grad().
    with torch.no_grad():
        outputs = model(**encoded)
    # Position 0 along the sequence axis is the [CLS] token.
    return outputs.last_hidden_state[:, 0, :]
# Embed every review once, one text at a time, and stack the per-review rows.
# No gradients are needed: the encoder output is only used as input features.
with torch.no_grad():
    embeddings_train = torch.cat(
        list(map(feature_extraction, x_train['text'].values))
    )
    embeddings_test = torch.cat(
        list(map(feature_extraction, x_test['text'].values))
    )

In [166]:
# Hyperparameters and feature dimensions
num_samples = 1100  # not referenced by the cells visible here
text_dim = 768      # width of one text embedding row
tabular_dim = 1     # only the `rating` column is used
hidden_dim = 128
num_classes = 4
report_interval = 10  # print the running training loss every N batches
num_epochs = 5
batch_size = 4


# create datatensors

#train
tabular_features_train = torch.tensor(x_train[['rating']].values, dtype=torch.float32)
labels_train = torch.tensor(y_train.values, dtype=torch.int64)
#test
tabular_features_test = torch.tensor(x_test[['rating']].values, dtype=torch.float32)
labels_test = torch.tensor(y_test.values, dtype=torch.int64)

# Catch a mismatch between the configured dimensions and the actual features
# here rather than as a shape error inside the model.
assert embeddings_train.shape == (len(labels_train), text_dim)
assert embeddings_test.shape == (len(labels_test), text_dim)
assert tabular_features_train.shape[1] == tabular_dim
assert tabular_features_test.shape[1] == tabular_dim


# Dataset + Dataloader
dataset_train = TensorDataset(embeddings_train, tabular_features_train, labels_train)
loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)

dataset_test = TensorDataset(embeddings_test, tabular_features_test, labels_test)
# Evaluation data is not shuffled, so validation batches are identical on every
# pass and the per-batch average loss is reproducible.
loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)


In [167]:


# Pick the compute device name. `torch.accelerator` is missing on older PyTorch
# releases, so fall back to the CUDA check there instead of raising
# AttributeError.
# NOTE: in the cells shown here `device` is only printed; neither the model nor
# the tensors are moved to it.
if hasattr(torch, "accelerator"):
    device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
else:
    device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

class HybridClassifier(nn.Module):
    """Two-layer feed-forward classifier over concatenated text and tabular features."""

    def __init__(self, text_dim, tabular_dim, hidden_dim, num_classes):
        super().__init__()
        fused_dim = text_dim + tabular_dim
        # Dense layers: fused features -> hidden -> class logits
        self.fc1 = nn.Linear(fused_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, text_vec, tabular_vec):
        """
        Compute class logits for a batch.

        text_vec: [batch_size, text_dim]   (e.g., DistilBERT CLS embeddings)
        tabular_vec: [batch_size, tabular_dim]
        returns: [batch_size, num_classes] unnormalised logits
        """
        # Join both feature groups along the feature axis.
        fused = torch.cat((text_vec, tabular_vec), dim=1)
        hidden = torch.relu(self.fc1(fused))
        return self.fc2(hidden)  # logits





# Build the classifier together with its optimizer and loss.
# NOTE: this rebinds `model`, which referred to the DistilBERT encoder before.
model = HybridClassifier(
    text_dim=text_dim,
    tabular_dim=tabular_dim,
    hidden_dim=hidden_dim,
    num_classes=num_classes,
)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
num_epochs = 5

# Smoke-test the forward pass on the first test example.
logits = model(embeddings_test[:1], tabular_features_test[:1])
pred_probab = F.softmax(logits, dim=1)
pred_class = pred_probab.argmax(dim=1)
print(logits)
print(pred_probab)
print(pred_class)

# Smoke-test the loss function on the same example.
print(loss_fn(logits, labels_test[:1]))






Using cpu device
HybridClassifier(
  (fc1): Linear(in_features=769, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=4, bias=True)
)
tensor([[ 0.1006, -0.0747,  0.1625, -0.2261]], grad_fn=<AddmmBackward0>)
tensor([[0.2759, 0.2316, 0.2935, 0.1990]], grad_fn=<SoftmaxBackward0>)
tensor([2])
tensor(1.4630, grad_fn=<NllLossBackward0>)


In [168]:
def train_one_epoch(epoch_index, report_interval):
    """Run one optimisation pass over `loader_train`.

    Uses the module-level `loader_train`, `model`, `optimizer` and `loss_fn`.

    Args:
        epoch_index: Index of the current epoch. Currently unused; it was only
            referenced by the removed TensorBoard logging.
        report_interval: Print the mean batch loss every this many batches.

    Returns:
        The mean batch loss of the last completed reporting window. If no
        window completed (fewer batches than `report_interval`), the mean over
        all batches seen is returned instead of a misleading 0.0. Returns 0.0
        only when the loader yields no batches.
    """
    running_loss = 0.
    last_loss = 0.
    batches_seen = 0
    reported = False

    for i, (text_vec, tab_vec, labels) in enumerate(loader_train):
        # Gradients accumulate by default, so clear them for every batch.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        outputs = model(text_vec, tab_vec)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        batches_seen += 1
        if i % report_interval == report_interval - 1:
            last_loss = running_loss / report_interval  # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.
            reported = True

    # Without a completed window `running_loss` still holds the sum over every
    # batch of the epoch, so report that mean rather than the initial 0.0.
    if not reported and batches_seen:
        last_loss = running_loss / batches_seen

    return last_loss





In [169]:

# Run bookkeeping: the timestamp names the checkpoints written below.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
epoch_number = 0

# Lowest validation loss seen so far; starts high so the first epoch is saved.
best_vloss = 1_000_000.

for epoch in range(num_epochs):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, report_interval)

    # Set the model to evaluation mode for the validation pass.
    model.eval()
    running_loss_test = 0.
    num_test_batches = 0

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        # The batch labels get their own name so the global `labels_test`
        # tensor (all test labels) is not overwritten by the last mini-batch.
        for text_vec_test, tab_vec_test, batch_labels_test in loader_test:
            outputs_test = model(text_vec_test, tab_vec_test)
            # Accumulate plain floats, not tensors, so avg_vloss and
            # best_vloss stay Python numbers.
            running_loss_test += loss_fn(outputs_test, batch_labels_test).item()
            num_test_batches += 1

    # Average per batch; count batches explicitly instead of relying on a
    # leaked loop index, and tolerate an empty loader.
    avg_vloss = running_loss_test / num_test_batches if num_test_batches else float('nan')
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:
  batch 10 loss: 0.8231233574450016
  batch 20 loss: 0.9956197910010814
  batch 30 loss: 0.9979626268148423
  batch 40 loss: 0.8393666684627533
  batch 50 loss: 0.449052993953228
  batch 60 loss: 0.6304006032645703
  batch 70 loss: 0.876692845672369
  batch 80 loss: 0.7952616006135941
  batch 90 loss: 0.9537592858076096
  batch 100 loss: 0.6762700498104095
  batch 110 loss: 0.6033520549535751
  batch 120 loss: 0.8241837285459042
  batch 130 loss: 0.5989247411489487
  batch 140 loss: 0.789803110063076
  batch 150 loss: 0.8693769961595536
  batch 160 loss: 0.702276173233986
  batch 170 loss: 0.8381043821573257
  batch 180 loss: 0.5716994315385818
  batch 190 loss: 0.5903483480215073
  batch 200 loss: 0.6022703230381012
  batch 210 loss: 0.7763756096363068
  batch 220 loss: 0.827682389318943
LOSS train 0.827682389318943 valid 0.6271569728851318
EPOCH 2:
  batch 10 loss: 0.5029956787824631
  batch 20 loss: 0.7142424449324608
  batch 30 loss: 0.6112801462411881
  batch 40 loss: 0.5