In [6]:
# loading in modules

import pandas as pd 
from sudulunu.helpers import pp, make_num, dumper, rc
import sys 
import numpy as np 

### Data and loading
import pickle
from datasets import load_dataset, dataset_dict
import datasets

### Model creation
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Metrics and testing
import evaluate
accuracy = evaluate.load("accuracy")

import os 
import pathlib
pathos = pathlib.Path(os.path.abspath('sent_sim.ipynb')).parent
os.chdir(pathos)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# print(os.getcwd())


In [14]:
new_data = pd.read_csv('../analysis/training-ads.csv')
# ['ad_id', 'page_id', 'query', 'page_name', 'ad_creative_bodies', 'ad_delivery_start_time', 
# 'bylines', 'currency', 'spend_lower', 'spend_upper', 'impressions_lower', 'impressions_upper', 
# 'ad_snapshot_url', 'male', 'female', 'young', 'middle', 'old', 'delivery_by_region', 
# 'Australian Capital Territory', 'New South Wales', 'Northern Territory', 'Queensland', 
# 'South Australia', 'Tasmania', 'Victoria', 'Western Australia', 'ad_delivery_stop_time', 
# 'ad_creative_link_titles', 'ad_creative_link_captions', 'ad_creative_link_descriptions', 
# 'concat_text', 'cleaned_ad', 'voice_ad', 'side']

# pp(new_data)

In [15]:
# data = pd.read_excel("/Users/josh_nicholas/Repos/oz-2023-facebook-political-ads-scraper/process/inter/labelled.xlsx", sheet_name='Sheet1')
# 'Unnamed: 0', 'voice_ad', 'side', 'ad_id', 'page_id', 'query', 'page_name', 
# 'ad_creative_bodies', 'ad_creative_link_titles', 'ad_creative_link_captions', 
# 'ad_creative_link_descriptions', 'ad_snapshot_url'

data = new_data.copy()

# variable = 'ad_creative_bodies'
variable = 'concat_text'

records = []

# data = data[:2]

data = data[pd.to_numeric(data['voice_ad'], errors='coerce').notnull()]

data.dropna(subset=[variable], inplace=True)



for index, row in data.iterrows():
    record = {"label": int(row['voice_ad']), "text": row[variable]}
    records.append(record)

print(records)




In [16]:
def preprocess_function(examples):

    return tokenizer(examples['text'], truncation=True)


# cat = pd.DataFrame.from_records(records)

# train = datasets.Dataset.from_pandas(cat)
# print(train['text'])

# tokenized = train.map(preprocess_function, batched=True)

train = datasets.Dataset.from_list(records)

tokenized = train.map(preprocess_function, batched=True).train_test_split(test_size=0.1)


Map:   0%|          | 0/1990 [00:00<?, ? examples/s]

In [17]:

def compute_metrics(eval_pred):

    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis=1)

    return accuracy.compute(predictions=predictions, references=labels)


id2label = {0: "Not a Voice Ad", 1: "Is a Voice ad"}

label2id = {"Not a Voice Ad": 0, "Is a Voice ad": 1}



In [18]:
### Train model


model = AutoModelForSequenceClassification.from_pretrained(

    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id

)


training_args = TrainingArguments(

    output_dir="ad-classifier",

    learning_rate=2e-5,

    per_device_train_batch_size=16,

    per_device_eval_batch_size=16,

    num_train_epochs=5,

    weight_decay=0.01,

    evaluation_strategy="epoch",

    save_strategy="epoch",

    load_best_model_at_end=True,

    push_to_hub=True,

)

trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=tokenized["train"],

    eval_dataset=tokenized["test"],

    tokenizer=tokenizer,

    data_collator=data_collator,

    compute_metrics=compute_metrics,

)

trainer.train()

trainer.push_to_hub()
# trainer.save_model("./classify_model/models/binary_classification")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Cloning https://huggingface.co/Joshnicholas/ad-classifier into local empty directory.


Download file pytorch_model.bin:   0%|          | 7.88k/255M [00:00<?, ?B/s]

Download file runs/Aug03_14-19-51_39217/events.out.tfevents.1691036397.39217.60139.0: 100%|##########| 4.90k/4…

Download file runs/Aug03_14-38-25_39217/events.out.tfevents.1691037509.39217.60139.1: 100%|##########| 4.92k/4…

Clean file runs/Aug03_14-19-51_39217/events.out.tfevents.1691036397.39217.60139.0:  20%|##        | 1.00k/4.90…

Clean file runs/Aug03_14-38-25_39217/events.out.tfevents.1691037509.39217.60139.1:  20%|##        | 1.00k/4.92…

Download file training_args.bin: 100%|##########| 3.87k/3.87k [00:00<?, ?B/s]

Download file runs/Aug11_10-30-57_39217/events.out.tfevents.1691713945.39217.40525.0: 100%|##########| 4.93k/4…

Clean file training_args.bin:  26%|##5       | 1.00k/3.87k [00:00<?, ?B/s]

Clean file runs/Aug11_10-30-57_39217/events.out.tfevents.1691713945.39217.40525.0:  20%|##        | 1.00k/4.93…

Clean file pytorch_model.bin:   0%|          | 1.00k/255M [00:00<?, ?B/s]



  0%|          | 0/560 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.11979485303163528, 'eval_accuracy': 0.9748743718592965, 'eval_runtime': 2.5418, 'eval_samples_per_second': 78.292, 'eval_steps_per_second': 5.115, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.08738096803426743, 'eval_accuracy': 0.9698492462311558, 'eval_runtime': 1.6364, 'eval_samples_per_second': 121.607, 'eval_steps_per_second': 7.944, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.16382142901420593, 'eval_accuracy': 0.9597989949748744, 'eval_runtime': 1.7915, 'eval_samples_per_second': 111.081, 'eval_steps_per_second': 7.257, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.10023816674947739, 'eval_accuracy': 0.9798994974874372, 'eval_runtime': 1.8672, 'eval_samples_per_second': 106.576, 'eval_steps_per_second': 6.962, 'epoch': 4.0}
{'loss': 0.1595, 'learning_rate': 2.1428571428571427e-06, 'epoch': 4.46}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.09415065497159958, 'eval_accuracy': 0.9798994974874372, 'eval_runtime': 2.2049, 'eval_samples_per_second': 90.252, 'eval_steps_per_second': 5.896, 'epoch': 5.0}
{'train_runtime': 404.5721, 'train_samples_per_second': 22.134, 'train_steps_per_second': 1.384, 'train_loss': 0.1462593138217926, 'epoch': 5.0}


Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 1.00/255M [00:00<?, ?B/s]

Upload file runs/Aug11_10-41-38_39217/events.out.tfevents.1691714588.39217.40525.1:   0%|          | 1.00/6.03…

To https://huggingface.co/Joshnicholas/ad-classifier
   bd22ca9..d882751  main -> main

To https://huggingface.co/Joshnicholas/ad-classifier
   d882751..d6a860d  main -> main



'https://huggingface.co/Joshnicholas/ad-classifier/commit/d8827513382b32f752a4af9413f473c55a686f28'

In [19]:
from transformers import pipeline

classifier = pipeline("text-classification", model="Joshnicholas/ad-classifier")


print(classifier(""""Launceston is not just a city; it is our home, our community, and the place we all share. As your potential Mayor, I firmly believe that every resident deserves to be heard, respected, and represented. I am committed to fostering an inclusive environment where everyone's opinions, ideas, and aspirations matter.

Your voice and your vote has the power to shape the future of our city. Let's do this together.

#timetovotecityoflaunceston"""))

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

[{'label': 'Not a Voice Ad', 'score': 0.9929136633872986}]
