# Imports

In [147]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score
from torch.utils.data import WeightedRandomSampler
from torch.utils.data import RandomSampler








# Config

# Defs and setup

In [148]:
# Feature specification for the model (one entry per column of the model input frame):
#   author_name:             string
#   number_authors_comments: integer, -1 if unknown
#   rating:                  integer (1-5), -1 if unknown
#   text:                    string
#   general_location_type:   string, one of general_location_type_categories
#   specific_location_type:  string, "Unknown" if unknown (TODO: marked as unused)
#   time:                    timestamp, -1 if unknown
#   photo_attached:          string, one of photo_attached_categories
#   responses:               string (TODO: marked as unused)
#   number_of_responses:     integer, -1 if unknown
# The normalizer/encoder helpers below compare a frame's column set against this list.
model_features = ['author_name' ,'number_authors_comments', 'rating', 'text','general_location_type', 'specific_location_type', 'time', 'photo_attached', 'responses','number_of_responses']
# Allowed values for the 'general_location_type' feature ("Unknown" is the fallback).
general_location_type_categories= [
    "Automotive",
    "Business",
    "Culture",
    "Education",
    "Entertainment and Recreation",
    "Facilities",
    "Finance",
    "Food and Drink",
    "Geographical Areas",
    "Government",
    "Health and Wellness",
    "Housing",
    "Lodging",
    "Natural Features",
    "Places of Worship",
    "Services",
    "Shopping",
    "Sports",
    "Transportation",
    "Unknown"
]
# Allowed values for the 'photo_attached' feature.
photo_attached_categories = [
    "No",
    "Yes", 
    "Unknown"
]

def normalizer_model_features(df_in):
    """Return a copy of ``df_in`` whose numeric feature columns are standardised.

    The columns ``number_authors_comments``, ``rating``, ``time`` and
    ``number_of_responses`` are each passed through
    ``StandardScaler.fit_transform`` on their own; all other columns are
    copied unchanged. ``df_in`` itself is not modified.

    Raises:
        ValueError: if the column set of ``df_in`` differs from ``model_features``.
    """
    if set(df_in.columns) != set(model_features):
        raise ValueError("Input dataframe does not have the correct columns")

    numeric_columns = (
        "number_authors_comments",
        "rating",
        "time",
        "number_of_responses",
    )
    standardiser = StandardScaler()
    df_out = df_in.copy()
    for column in numeric_columns:
        # Each column is fitted separately, always from the untouched input frame.
        df_out[column] = standardiser.fit_transform(df_in[[column]])
    return df_out

def encoder_model_features(df_in):
    """One-hot encode the categorical feature columns.

    ``general_location_type``, ``specific_location_type`` and ``photo_attached``
    are replaced by integer dummy columns produced by ``pd.get_dummies``; the
    remaining columns are passed through unchanged.

    The input frame is left untouched. Previously the categorical columns were
    dropped from ``df_in`` in place, so the caller's frame silently lost
    columns and a second call on the same frame failed the column check.

    Args:
        df_in: frame whose column set equals ``model_features``.

    Returns:
        A new frame with the non-categorical columns followed by the dummies.

    Raises:
        ValueError: if the column set of ``df_in`` differs from ``model_features``.
    """
    if set(model_features) != set(df_in.columns):
        raise ValueError("Input dataframe does not have the correct columns")

    categorical_columns = ['general_location_type', 'specific_location_type', 'photo_attached']
    # NOTE(review): get_dummies only emits columns for values present in df_in,
    # so the output width depends on the data - confirm this matches what the
    # downstream classifier expects at inference time.
    dummies = pd.get_dummies(df_in[categorical_columns], dtype=int)
    # drop() without inplace returns a new frame and keeps the caller's data intact.
    df_out = pd.concat([df_in.drop(columns=categorical_columns), dummies], axis=1)
    return df_out
    

    


# function to preprocess the dataframe to the input format required by the model
# takes a dataframe with columns as in input_format
def preprocess_features_scraped_data(df_in):
    """Convert a scraped-review frame into the ``model_features`` layout.

    ``df_in`` must contain at least the columns listed in ``input_format``.
    Missing names/texts/responses become empty strings, missing ratings and
    timestamps become -1, and features that the scraped data does not carry
    are filled with placeholders (-1 / "Unknown").

    Raises:
        ValueError: if any column of ``input_format`` is absent from ``df_in``.
    """
    input_format = ['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id','label']
    if not set(input_format) <= set(df_in.columns):
        raise ValueError("Input dataframe does not have the correct columns")

    def _photo_flag(pics):
        return "No" if pd.isna(pics) else "Yes"

    def _response_count(resp):
        # TODO: counts at most one response even if there are several; correct in future
        return 0 if pd.isna(resp) else 1

    df_out = pd.DataFrame(columns=model_features)

    # Columns derived from the input (the first assignment fixes the row index).
    df_out['author_name'] = df_in['name'].fillna("").astype(str)
    df_out['text'] = df_in['text'].fillna("").astype(str)
    df_out['responses'] = df_in['resp'].fillna("").astype(str)
    df_out['rating'] = df_in['rating'].fillna(-1).astype(int)
    df_out['time'] = df_in['time'].fillna(-1).astype(int)
    df_out['photo_attached'] = df_in['pics'].apply(_photo_flag).astype(str)
    df_out['number_of_responses'] = df_in['resp'].apply(_response_count)

    # Features that are not available in the scraped data.
    df_out['number_authors_comments'] = -1
    df_out['general_location_type'] = "Unknown"
    df_out['specific_location_type'] = "Unknown"
    return df_out

# function to preprocess the dataframe to the label format required by the model
# 0 if still_exists is 'exists' else 1
def preprocess_labels_scraped_data(df_in):
    """Build a one-column ``label`` frame from the ``still_exists`` column.

    A row is labelled 0 when ``still_exists`` equals ``'exists'`` and 1 for
    any other value. The row index of ``df_in`` is preserved.
    """
    def _to_binary(status):
        return 0 if status == 'exists' else 1

    encoded = df_in['still_exists'].apply(_to_binary).astype(int)
    return pd.DataFrame({'label': encoded})


def preprocess_features_1100(df_in):
    """Convert the 1100-review dataset frame into the ``model_features`` layout.

    Only ``author_name``, ``rating`` and ``text`` are taken from ``df_in``;
    every other model feature is filled with a fixed placeholder.

    Raises:
        ValueError: if any column of ``input_format`` is absent from ``df_in``.
    """
    # Columns the caller has to provide
    input_format = ['business_name', 'author_name', 'text', 'photo', 'rating', 'rating_category', 'label']
    if not set(input_format) <= set(df_in.columns):
        raise ValueError("Input dataframe does not have the correct columns")

    # Start from an empty frame so the column order follows model_features
    df_out = pd.DataFrame(columns=model_features)

    # Values copied from the input (the first assignment fixes the row index)
    df_out['author_name'] = df_in['author_name'].fillna("").astype(str)
    df_out['rating'] = df_in['rating'].fillna(-1).astype(int)
    df_out['text'] = df_in['text'].fillna("").astype(str)

    # Constant placeholders for features this dataset does not supply
    placeholders = {
        'number_authors_comments': -1,
        'general_location_type': "Unknown",
        'specific_location_type': "Unknown",
        'time': -1,
        # hard-coded: the 'photo' input column itself is not inspected
        'photo_attached': "Yes",
        'responses': "",
        'number_of_responses': 0,
    }
    for column, value in placeholders.items():
        df_out[column] = value

    return df_out


# Preprocess labels using the mapping dictionary
def preprocess_labels_1100(df_in):
    """Map the textual ``label`` column to integer class ids.

    Mapping: GOOD -> 0, ADVERTISEMENT -> 1, IRRELEVANT -> 2, RANTS -> 3.

    Args:
        df_in: frame with a ``label`` column holding the class names above.

    Returns:
        A one-column frame ``label`` of integer class ids, indexed like ``df_in``.

    Raises:
        ValueError: if ``label`` contains a value outside the mapping
            (including missing values). The message lists the offending values.
    """
    convert_dir = {'GOOD': 0, 'ADVERTISEMENT': 1, 'IRRELEVANT': 2, 'RANTS': 3}
    mapped = df_in['label'].map(convert_dir)

    # Unmapped labels come back as NaN; report them explicitly instead of
    # letting the integer cast fail with an opaque message.
    unmapped = mapped.isna()
    if unmapped.any():
        offending = sorted({repr(value) for value in df_in['label'][unmapped]})
        raise ValueError(
            "Unknown label value(s): {}; expected one of {}".format(
                ", ".join(offending), sorted(convert_dir)
            )
        )

    df_out = pd.DataFrame(columns=['label'])
    df_out['label'] = mapped.astype(int)
    return df_out


# feature extraction model setup
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
# Dedicated handle for the text encoder. The module-level name `model` is
# rebound to the classifier further down in this file; the extraction helpers
# must keep using DistilBERT regardless of what `model` points to later.
text_encoder = model


def feature_extraction(text):
    """Return the DistilBERT [CLS] embedding for ``text``.

    The text is tokenised (with truncation and padding) and run through
    ``text_encoder``; the hidden state at sequence position 0 is returned.
    For a single string the result is expected to have shape (1, 768).

    Gradient tracking is not disabled here; use ``feature_extraction_batch``
    when the embeddings are only needed as fixed features.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    outputs = text_encoder(**inputs)
    vector_cls = outputs.last_hidden_state[:, 0, :]  # first token of every sequence
    return vector_cls


def feature_extraction_batch(texts):
    """Embed every element of ``texts`` and stack the results row-wise.

    Each text is encoded on its own under ``torch.no_grad()`` and the
    per-text embeddings are concatenated along dimension 0.
    """
    with torch.no_grad():
        return torch.cat([feature_extraction(t) for t in texts])
    


# loaders
def create_tensor_dataset(features, labels):
    """Convert array-like features and labels into a pair of tensors.

    Despite the name, no ``Dataset`` object is created: the function returns
    ``(features_tensor, labels_tensor)`` where the features are float32 and
    the labels int64, both without gradient tracking.
    """
    as_float = torch.tensor(features, dtype=torch.float32, requires_grad=False)
    as_int = torch.tensor(labels, dtype=torch.int64, requires_grad=False)
    return as_float, as_int


def reverse_weighted_dataloader(features, labels, batch_size):
    """Build a DataLoader that samples rows inversely to their class frequency.

    Every sample gets the weight ``1 / count(its class)``, and a
    ``WeightedRandomSampler`` draws ``len(labels)`` samples per epoch with
    replacement, so rare classes are seen about as often as common ones.

    Args:
        features: tensor of per-sample features (first dimension = samples).
        labels: tensor of non-negative integer class ids, one per sample.
        batch_size: number of samples per batch.

    Returns:
        A ``DataLoader`` over ``TensorDataset(features, labels)``.
    """
    label_ids = np.asarray(labels)
    class_counts = np.bincount(label_ids)  # occurrences of 0 .. max class id

    # Inverse frequency. Class ids that never occur have count 0; skip them
    # instead of dividing by zero (their weight is never looked up anyway).
    class_weights = np.zeros(class_counts.shape, dtype=np.float64)
    np.divide(1.0, class_counts, out=class_weights, where=class_counts > 0)

    # Vectorised lookup of the weight belonging to each sample's class.
    sample_weights = class_weights[label_ids].tolist()

    sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
    dataset_train = TensorDataset(features, labels)
    loader_train = DataLoader(dataset_train, batch_size=batch_size, sampler=sampler)
    return loader_train

def shuffled_dataloader(features, labels, batch_size):
    """Wrap ``features``/``labels`` in a DataLoader that reshuffles every epoch."""
    return DataLoader(
        TensorDataset(features, labels),
        batch_size=batch_size,
        shuffle=True,
    )

# model definition
class HybridClassifier(nn.Module):
    """Two-layer feed-forward classifier over a single feature vector.

    Architecture: Linear(feature_dim -> hidden_dim) -> ReLU ->
    Linear(hidden_dim -> num_classes). The output is raw logits.
    """

    def __init__(self, feature_dim , hidden_dim, num_classes):
        super().__init__()
        # Attribute names are part of the saved state_dict keys.
        self.fc1 = nn.Linear(in_features=feature_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, feature_vec):
        """Compute class logits.

        feature_vec: [batch_size, feature_dim] - the already combined input
        features (no concatenation happens inside the model).
        Returns: [batch_size, num_classes] logits.
        """
        hidden = F.relu(self.fc1(feature_vec))
        return self.fc2(hidden)
    

def train_one_epoch(loader_train, optimizer, model, loss_fn, epoch_index, report_interval):
    """Run one optimisation pass over ``loader_train``.

    For every batch the gradients are reset, the loss is computed and
    back-propagated, and the optimizer takes a step. After each block of
    ``report_interval`` batches the mean loss of that block is printed.

    Args:
        loader_train: iterable yielding ``(feature_vec, labels)`` batches.
        optimizer: optimizer bound to the parameters of ``model``.
        model: callable mapping ``feature_vec`` to logits.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        epoch_index: index of the current epoch; currently unused (kept for
            the disabled TensorBoard logging).
        report_interval: number of batches per reporting block.

    Returns:
        The mean per-batch loss of the last completed reporting block. If the
        loader yields fewer than ``report_interval`` batches, the mean over
        the batches that were processed is returned instead of a misleading
        0.0. An empty loader yields 0.0.
    """
    running_loss = 0.
    last_loss = 0.
    pending_batches = 0      # batches accumulated since the last report
    reported = False         # whether at least one full block was reported

    for i, (feature_vec, labels) in enumerate(loader_train):
        # Zero the gradients for every batch
        optimizer.zero_grad()

        # Forward pass, loss and gradients
        outputs = model(feature_vec)
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        pending_batches += 1
        if i % report_interval == report_interval - 1:
            last_loss = running_loss / report_interval  # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.
            pending_batches = 0
            reported = True

    # No full block completed: fall back to the mean over what was seen.
    if not reported and pending_batches:
        last_loss = running_loss / pending_batches

    return last_loss

In [149]:

#df = pd.read_csv('data/1100_reviews_labeled_dummy_by_chatgpt.csv')


# Convert labels to numbers
#convert_dir = {'GOOD':0, 'ADVERTISEMENT': 1,  'IRRELEVANT':2, 'RANTS':3  }
#df['num_label'] = [convert_dir[label] for label in df['label']]
#print(df.head())


# normalize tabular features
#scaler = StandardScaler()
#df[["rating"]] = scaler.fit_transform(df[["rating"]])
#print(df.head()["rating"])





In [None]:

# Example dataset
# Hyper-parameters and run configuration.
# num_samples and text_dim are not referenced by the other visible cells.
num_samples = 1100
text_dim = 768
hidden_dim=128          # width of the classifier's hidden layer
num_classes=4           # number of output classes of the classifier
report_interval = 10    # batches per training-loss report
num_epochs = 10
batch_size=4

# Load the labelled reviews from disk.
df = pd.read_csv('review_other_head_labeled.csv')



#features = preprocess_features_1100(df)
#labels = preprocess_labels_1100(df)

# NOTE(review): features use the scraped-data preprocessor while labels use the
# 1100-dataset mapping (GOOD/ADVERTISEMENT/IRRELEVANT/RANTS) - presumably this
# CSV carries those label strings; confirm.
features = preprocess_features_scraped_data(df)
labels = preprocess_labels_1100(df)
# NOTE(review): the numeric columns are standardised on the full frame before
# the train/test split below, so test rows influence the scaling statistics -
# confirm this is acceptable.
features = normalizer_model_features(features)
features = encoder_model_features(features)




# Split into train (80%) and test (20%)
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels  # stratify keeps class balance
)

# Free-text columns are removed from the tabular part; 'text' is read from
# x_train / x_test separately when the embeddings are computed.
drop = ['author_name' , 'text', 'responses',]
x_train_tab = x_train.drop(columns=drop )
x_test_tab = x_test.drop(columns=drop)

# Quick visual check of the prepared training data
print(x_train_tab.head())
print(y_train.head())





      number_authors_comments    rating      time  number_of_responses  \
4973                      0.0 -3.512818  0.849650             1.013288   
232                       0.0  0.324440 -0.453932             1.013288   
4202                      0.0  0.324440  1.150455             1.013288   
561                       0.0  0.324440  1.194277            -0.986886   
3551                      0.0  0.324440  0.435221            -0.986886   

      general_location_type_Unknown  specific_location_type_Unknown  \
4973                              1                               1   
232                               1                               1   
4202                              1                               1   
561                               1                               1   
3551                              1                               1   

      photo_attached_No  photo_attached_Yes  
4973                  1                   0  
232                   1             

In [151]:
# Compute one embedding row per review text for the train and test splits.
# Each text is encoded individually inside feature_extraction_batch.
x_train_embeddings = feature_extraction_batch(x_train['text'])
x_test_embeddings = feature_extraction_batch(x_test['text']) 



In [153]:

# Training tensors: text embeddings and tabular features are concatenated
# column-wise into one feature vector per sample.
# The training labels get their own name; they were previously stored in
# `label_tensor_test`, which only worked because the test labels were
# assigned to that name after the training loader had been built.
feature_tensor_train_tab, label_tensor_train = create_tensor_dataset(x_train_tab.values, y_train['label'].values)
feature_tensor_train = torch.cat((x_train_embeddings, feature_tensor_train_tab), dim=1)
loader_train = reverse_weighted_dataloader(feature_tensor_train, label_tensor_train, batch_size)

# Test tensors, built the same way
feature_tensor_test_tab, label_tensor_test = create_tensor_dataset(x_test_tab.values, y_test['label'].values)
feature_tensor_test = torch.cat((x_test_embeddings, feature_tensor_test_tab), dim=1)
loader_test = shuffled_dataloader(feature_tensor_test, label_tensor_test, batch_size)


# Classifier, optimizer and loss.
# NOTE: this rebinds the module-level name `model`, which until this point
# referred to the DistilBERT encoder loaded in the setup cell.
model = HybridClassifier(feature_dim=feature_tensor_train.shape[1],hidden_dim=hidden_dim , num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

print(f"""
Shapes:
- feature_tensor_train : {feature_tensor_train.shape}
- x_train_embeddings   : {x_train_embeddings.shape}
- x_train_tab          : {x_train_tab.shape}
""")



Shapes:
- feature_tensor_train : torch.Size([4000, 776])
- x_train_embeddings   : torch.Size([4000, 768])
- x_train_tab          : (4000, 8)



In [154]:

# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# TensorBoard logging is currently disabled:
#writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

best_vloss = 1_000_000.

for epoch in range(num_epochs):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(loader_train, optimizer, model, loss_fn, epoch_number, report_interval)  #, writer)

    running_loss_test = 0.
    num_batches_test = 0
    # Evaluation mode for the validation pass
    model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for feature_vec_test, labels_vec_test in loader_test:
            outputs_test = model(feature_vec_test)
            loss_test = loss_fn(outputs_test, labels_vec_test)
            # .item() keeps the accumulator a plain float; summing the tensor
            # itself turned avg_vloss and best_vloss into tensors.
            running_loss_test += loss_test.item()
            num_batches_test += 1

    # Count batches explicitly instead of reusing the loop index, which is
    # undefined (or stale) when the test loader yields nothing.
    avg_vloss = running_loss_test / num_batches_test if num_batches_test else float('inf')
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch for both training and validation
    #writer.add_scalars('Training vs. Validation Loss',
    #                { 'Training' : avg_loss, 'Validation' : avg_vloss },
    #                epoch_number + 1)
    #writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:
  batch 10 loss: 1.4986106276512146
  batch 20 loss: 1.3550705194473267
  batch 30 loss: 1.2688472986221313
  batch 40 loss: 1.3258498072624207
  batch 50 loss: 1.2192080080509187
  batch 60 loss: 1.39317786693573
  batch 70 loss: 1.1766167998313903
  batch 80 loss: 1.2333898007869721
  batch 90 loss: 1.0189742386341094
  batch 100 loss: 1.178773248195648
  batch 110 loss: 1.0297385454177856
  batch 120 loss: 1.2078661799430848
  batch 130 loss: 1.003592187166214
  batch 140 loss: 0.9647981286048889
  batch 150 loss: 1.0407690465450288
  batch 160 loss: 0.9078296780586242
  batch 170 loss: 0.8416253954172135
  batch 180 loss: 0.9307743191719056
  batch 190 loss: 0.9137235343456268
  batch 200 loss: 0.9119169443845749
  batch 210 loss: 0.864942866563797
  batch 220 loss: 0.7623353630304337
  batch 230 loss: 0.7790306866168976
  batch 240 loss: 0.9792093276977539
  batch 250 loss: 0.8960968926548958
  batch 260 loss: 0.7324938952922821
  batch 270 loss: 0.6575170755386353
  batc

# Evaluation

In [155]:
# Checkpoint to evaluate. The name follows the 'model_{timestamp}_{epoch_number}'
# pattern used when saving during training; it is hard-coded to one specific run.
PATH = "model_20250828_220516_5"
# Fresh classifier with the same dimensions as during training. Both
# `loaded_model` and `model` are bound to this one new instance.
loaded_model = model = HybridClassifier(feature_dim=feature_tensor_train.shape[1],hidden_dim=hidden_dim , num_classes=num_classes)
# NOTE(review): torch.load deserialises the file - only load checkpoints from a trusted source.
loaded_model.load_state_dict(torch.load(PATH))

<All keys matched successfully>

In [156]:
# Predict classes for the whole test set in one forward pass (no gradients needed).
# `model` is the instance that was just loaded from the checkpoint.
with torch.no_grad():   
        y_test_pred= model(feature_tensor_test)
        # Softmax turns logits into probabilities; the argmax picks the most likely class
        pred_probab = nn.Softmax(dim=1)(y_test_pred)
        pred_class = torch.argmax(pred_probab, dim=1)
        

# average=None yields one precision/recall value per class instead of a single average
precision = precision_score(label_tensor_test, pred_class, average=None)
accuracy = accuracy_score(label_tensor_test, pred_class)
recall = recall_score(label_tensor_test, pred_class, average=None)  # Sensitivity

# Confusion matrix over the four class ids (rows: true class, columns: predicted class)
conf_mat = confusion_matrix(label_tensor_test, pred_class, labels=[0,1,2,3])
print(conf_mat)
# TODO: specificity is not computed yet
#specificity = tn / (tn + fp)

print(f"Precision: {precision}")
print(f"Sensitivity (Recall): {recall}")
print(f"Accuracy: {accuracy}")
#print(f"Specificity: {specificity:.3f}")

            

[[942  15   7   3]
 [  8   5   1   0]
 [  5   0  10   0]
 [  4   0   0   0]]
Precision: [0.9822732  0.25       0.55555556 0.        ]
Sensitivity (Recall): [0.97414685 0.35714286 0.66666667 0.        ]
Accuracy: 0.957
