In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset


In [3]:
# Read the raw address table; the path is relative to the notebook's working directory
csv_path = 'house-addresses.csv'
df = pd.read_csv(csv_path)

In [4]:
# Number of rows loaded from the CSV
len(df)

100000

In [5]:
# Column dtypes and non-null counts; used here to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Address             99999 non-null   object
 1   AddressWithCountry  100000 non-null  object
 2   Country             100000 non-null  object
dtypes: object(3)
memory usage: 2.3+ MB


In [6]:
# Per-column count of missing entries
missing_values = df.isna().sum()

In [7]:
# Show the per-column missing-value counts computed in the previous cell
print(missing_values)

Address               1
AddressWithCountry    0
Country               0
dtype: int64


In [8]:
# Distinct values per column; a count below the row count means the column has repeats
df.nunique()

Unnamed: 0,0
Address,97328
AddressWithCountry,97330
Country,10


In [9]:
# Storage dtype of each column
df.dtypes

Unnamed: 0,0
Address,object
AddressWithCountry,object
Country,object


In [10]:
# Preview the first rows to see what the address strings look like
df.head()

Unnamed: 0,Address,AddressWithCountry,Country
0,"32, DUMOND STREET, UNIT 123, BENTLEY, WA, 6102","32, DUMOND STREET, UNIT 123, BENTLEY, WA, 6102...",AU
1,"26, ANDREW ROAD, UNIT 75, GREENBANK, QLD, 4124","26, ANDREW ROAD, UNIT 75, GREENBANK, QLD, 4124...",AU
2,"52, FERNSIDE AVENUE, BRIAR HILL, VIC, 3088","52, FERNSIDE AVENUE, BRIAR HILL, VIC, 3088, AU",AU
3,"44, SIGANTO DRIVE, HELENSVALE, QLD, 4212","44, SIGANTO DRIVE, HELENSVALE, QLD, 4212, AU",AU
4,"6, CORONATION STREET, BELLINGEN, NSW, 2454","6, CORONATION STREET, BELLINGEN, NSW, 2454, AU",AU


In [15]:
# Column names available for modelling
df.columns

Index(['Address', 'AddressWithCountry', 'Country'], dtype='object')

In [16]:
# Keep only rows where every column used downstream is populated
required_columns = ["Address", "AddressWithCountry", "Country"]
df = df.dropna(subset=required_columns)



In [17]:
# The model input is the full address string, country suffix included
df = df.assign(text=df["AddressWithCountry"])


In [18]:
# Normalize country codes so whitespace/case variants collapse to one class
df["Country"] = df["Country"].str.strip().str.lower()

# Map country codes to numerical labels.
# Sort before enumerating so the ids do not depend on the row order of the CSV:
# the inference cell decodes predictions with an alphabetically ordered class
# list, which only matches `unique()` order when the file happens to be sorted.
unique_countries = sorted(df["Country"].unique())
country_to_label = {country: idx for idx, country in enumerate(unique_countries)}

# Apply mapping
df["label"] = df["Country"].map(country_to_label)


In [19]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(batch):
    """Tokenize one string or a list of strings into fixed-length model inputs.

    Args:
        batch: A single text or a list of texts.

    Returns:
        The tokenizer output, from which "input_ids" and "attention_mask" are
        read below. Every sequence is padded or truncated to 128 tokens.
    """
    return tokenizer(
        batch,
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# Tokenize the whole column in one batched call rather than once per row;
# the encodings are the same but the tokenizer is invoked a single time.
encodings = tokenize_function(df["text"].tolist())

# Rows were dropped earlier, so the index has gaps: attach the encodings to
# df's own index explicitly instead of relying on positional defaults.
df["input_ids"] = pd.Series(encodings["input_ids"], index=df.index)
df["attention_mask"] = pd.Series(encodings["attention_mask"], index=df.index)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [20]:
# Stack the per-row token lists into 2-D tensors and the labels into a 1-D tensor
import torch

inputs, masks = (
    torch.tensor(df[column].tolist()) for column in ("input_ids", "attention_mask")
)
labels = torch.tensor(df["label"].to_numpy())


In [21]:
# Aliased so this import does not rebind the `Dataset` name already imported
# from the `datasets` package at the top of the notebook.
from torch.utils.data import Dataset as TorchDataset


class AddressDataset(TorchDataset):
    """Map-style dataset pairing tokenized addresses with their class labels.

    Args:
        inputs: Indexable collection of token-id rows, one per example.
        masks: Indexable collection of attention-mask rows, aligned with ``inputs``.
        labels: Indexable collection of class ids, aligned with ``inputs``.

    Raises:
        ValueError: If the three collections differ in length.
    """

    def __init__(self, inputs, masks, labels):
        # Misaligned inputs would otherwise pair examples with the wrong label
        # or fail much later inside the training loop.
        if not (len(inputs) == len(masks) == len(labels)):
            raise ValueError(
                "inputs, masks and labels must have the same length, got "
                f"{len(inputs)}, {len(masks)} and {len(labels)}"
            )
        self.inputs = inputs
        self.masks = masks
        self.labels = labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with the keys the Trainer feeds to the model."""
        return {
            "input_ids": self.inputs[idx],
            "attention_mask": self.masks[idx],
            "labels": self.labels[idx],
        }

# Wrap the three aligned tensors in a single indexable dataset
dataset = AddressDataset(inputs=inputs, masks=masks, labels=labels)


In [22]:
from torch.utils.data import random_split

# Define split sizes (80% train, remainder validation)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split so the same rows land in train/validation on every run;
# without a generator the partition (and the reported losses) change per run.
# NOTE(review): the Address column has fewer distinct values than rows, so a
# purely random split can place identical addresses on both sides.
split_generator = torch.Generator().manual_seed(42)

# Perform the split
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)


In [26]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Invert the training mapping so the checkpoint's config carries the country
# codes; without this the saved model only knows anonymous class indices and
# every consumer has to re-create the label order by hand.
label_to_country = {idx: country for country, idx in country_to_label.items()}

# Load a pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(unique_countries),
    id2label=label_to_country,
    label2id=dict(country_to_label),
)

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    per_device_train_batch_size=64,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start training
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0007,0.000381
2,0.0002,0.000252
3,0.0001,0.00023


TrainOutput(global_step=3750, training_loss=0.011379332572718462, metrics={'train_runtime': 4956.8932, 'train_samples_per_second': 48.417, 'train_steps_per_second': 0.757, 'total_flos': 1.5787599910774272e+16, 'train_loss': 0.011379332572718462, 'epoch': 3.0})

In [27]:
# Write the model weights and the tokenizer files into the same directory so
# the pair can be reloaded together
output_dir = "country_classification_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('country_classification_model/tokenizer_config.json',
 'country_classification_model/special_tokens_map.json',
 'country_classification_model/vocab.txt',
 'country_classification_model/added_tokens.json',
 'country_classification_model/tokenizer.json')

In [36]:
# Distinct (normalized) country codes, in order of first appearance in the frame
df['Country'].unique()

array(['au', 'be', 'br', 'ca', 'es', 'fr', 'jp', 'mx', 'us', 'za'],
      dtype=object)

In [37]:
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.preprocessing import LabelEncoder
import torch
import numpy as np
# Load the model and tokenizer back from disk to check the saved artifacts work
model = BertForSequenceClassification.from_pretrained("./country_classification_model/")
tokenizer = BertTokenizer.from_pretrained("./country_classification_model/")

# Example address input
address = "298 SMS, RUA SANTA TEREZINHA, Senador Rui Palmeira, AL, 57515-000, BR"

# Tokenize the address with the same truncation length used for training
inputs = tokenizer(address, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Set the model to evaluation mode
model.eval()

# Disable gradients for inference
with torch.no_grad():
    # Perform inference
    outputs = model(**inputs)

    # Get predicted class (country)
    predictions = torch.argmax(outputs.logits, dim=-1)

# Build the decoder from the mapping used at training time instead of a
# hand-typed list, so class i is always the country that was assigned label i.
label_encoder = LabelEncoder()
label_encoder.classes_ = np.array(sorted(country_to_label, key=country_to_label.get))

# Convert prediction back to country name
predicted_country = label_encoder.inverse_transform(predictions.cpu().numpy())

print(f"Predicted country: {predicted_country[0]}")


Predicted country: br


In [41]:
# Recursively archive the saved model directory into one zip file
!zip -r /content/country_classification_model.zip  /content/country_classification_model

  adding: content/country_classification_model/ (stored 0%)
  adding: content/country_classification_model/vocab.txt (deflated 53%)
  adding: content/country_classification_model/tokenizer.json (deflated 71%)
  adding: content/country_classification_model/tokenizer_config.json (deflated 75%)
  adding: content/country_classification_model/model.safetensors (deflated 7%)
  adding: content/country_classification_model/special_tokens_map.json (deflated 42%)
  adding: content/country_classification_model/config.json (deflated 57%)
