# Textkernel challenge 
Create an algorithm for identifying the country an address belongs to.


In [None]:
# Imports
import pickle
from datetime import datetime as dt

import numpy as np
import pandas as pd
import sklearn
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Custom imports
import algorithms
from utils import accessors, preprocess, validation


## Setup, load data and preprocess

In [None]:
# Seed NumPy's global random number generator
np.random.seed(500)

# Toggle for displaying intermediate dataframes / arrays in later cells
show_outputs = True

# If False, the split-search results are loaded from disk instead of
# running the algorithm again
run_split_search = True


In [None]:
# Location and file names of the two JSON-lines datasets
path = "dataset/"
add_file = "addresses.jsonl"
cit_file = "cities.jsonl"

# Read both datasets into pandas
address_df = accessors.load_data(path, add_file)
cit_df = accessors.load_data(path, cit_file)

# Clean the text column of each dataset (remove punctuation and parse to
# lowercase, per the preprocess helper) and then discard duplicate rows
address_df = preprocess.clean_strings(address_df, "address").drop_duplicates()
cit_df = preprocess.clean_strings(cit_df, "city").drop_duplicates()

# Optionally show the cleaned dataframes
if show_outputs:
    for frame in (address_df, cit_df):
        display(frame)


### Quick data analytics

In [None]:
# Report the number of missing values per column, first for the addresses
# and then for the cities
for frame in (address_df, cit_df):
    print(frame.isna().sum())


In [None]:
# Number of occurrences of each country in the address dataset
countries = address_df.groupby("country").count()

# Horizontal bar charts of the counts: linear scale first, then log scale
countries.plot(kind="barh")
countries.plot(kind="barh", log=True)


# Simple search city df solution 
Split address into tokens and brute force search in city dataframe 

In [None]:
if run_split_search:
    # The brute-force search is slow, so time it.
    # `dt` is datetime.datetime, imported in the notebook's imports cell.
    t1 = dt.now()

    # Work on a deep copy so address_df itself stays untouched
    add_df = address_df.copy(deep=True)

    # Initiate SplitSearch with the addresses and the city lookup table
    split_search = algorithms.SplitSearch(add_df, cit_df)

    # Run the algorithm and save results
    split_search.run()
    add_df_out = split_search.get_df()
    add_df_out.to_csv("outputs/split_search_results.csv")

    # Get the accuracy and save results
    accuracy = split_search.get_accuracy()
    accuracy.to_csv("outputs/split_search_accuracy.csv")

    # Get the cities that occur in multiple countries and save results
    dupe_cities = split_search.get_dupe_cities()
    np.save("outputs/dupe_cities.npy", dupe_cities)

    t2 = dt.now()
    print(f"Time for calculation: {t2 - t1}")


In [None]:
if not run_split_search:
    # Load the cached split-search output instead of recomputing it.
    # Both CSVs were written with to_csv's default index=True, so their first
    # column is the saved index. Reading it back with index_col=0 restores it
    # rather than adding a spurious "Unnamed: 0" column.
    add_df_out = pd.read_csv("outputs/split_search_results.csv", index_col=0)
    accuracy = pd.read_csv("outputs/split_search_accuracy.csv", index_col=0)

    # Load the cities that occur in multiple countries.
    # NOTE: allow_pickle=True unpickles the stored object; only load files
    # produced by this notebook, never untrusted ones.
    dupe_cities = np.load("outputs/dupe_cities.npy", allow_pickle=True).item()


# Show the split-search results, whether freshly computed or loaded from disk
display(add_df_out)
display(accuracy)
print(dupe_cities)


### Shortcomings
- only using the city information, and not any of the other address data
- not all cities are one single string 
- special characters might change (e.g. ß -> ss) so the city will be missed
- some cities exist in multiple countries
- slow and not scalable

# Simple ML solution 
- First split the data into training and test datasets
- Encode the label
- Build a pipeline of different models and sets of hyper parameters to grid search over (including encoding the address)
- Extract the best model

## Train test split 
- Train on 80% of the data and use the remaining 20% to validate the approach
- Stratify over country since they are not all equally occurring in the dataset

In [None]:
# Work on a deep copy of address_df for the ML approach
add_df = address_df.copy(deep=True)

# Features are the address strings, labels are the country codes
addresses = add_df["address"]
labels = add_df["country"]

# Hold out 20% of the data for testing, stratified on the country label
train_x, test_x, train_y, test_y = sklearn.model_selection.train_test_split(
    addresses,
    labels,
    stratify=labels,
    test_size=0.2,
)

# Append the city dataset to the training data only: each city name is an
# extra training "address" labelled with its country
train_x = pd.concat([train_x, cit_df["city"]])
train_y = pd.concat([train_y, cit_df["country"]])


### Label encoder

In [None]:
# Initialise label encoder (assigns numerical value to country str)
encoder = LabelEncoder()

# Learn the country -> integer mapping from the training labels only
train_y = encoder.fit_transform(train_y)

# Reuse the already fitted mapping for the test labels. Calling fit_transform
# here would refit the encoder on the test set, so a country could receive a
# different integer than in training whenever the two label sets differ.
# transform() raises ValueError if the test set contains a label that was
# not present in the training labels.
test_y = encoder.transform(test_y)

# Persist the class order (as fitted on the training labels) so integer
# predictions can be decoded back to country strings later
np.save("outputs/encoded_classes.npy", encoder.classes_)

if show_outputs:
    print(train_y)
    print(test_y)


### Vectorizer, model and hyperparameter pipeline

In [None]:
# Set up an sklearn pipeline so we can grid search for the best model and
# hyperparameters: TF-IDF vectorisation followed by a switchable classifier
pipeline_steps = [
    ("tfidf", TfidfVectorizer()),
    ("clf", algorithms.ClfSwitcher()),
]
pipeline = Pipeline(pipeline_steps)

# TF-IDF settings that are searched for every candidate classifier
tfidf_grid = {
    "tfidf__max_df": (0.25, 0.5, 0.75, 1.0),
    "tfidf__stop_words": [None],
}

# Linear model trained with SGD: SVM if hinge loss / logreg if log loss
sgd_grid = {
    "clf__estimator": [SGDClassifier()],
    **tfidf_grid,
    "clf__estimator__penalty": ("l2", "elasticnet", "l1"),
    "clf__estimator__max_iter": [50, 80],
    "clf__estimator__tol": [1e-4],
    "clf__estimator__loss": ["hinge", "log_loss", "modified_huber"],
}

# Multinomial Naive Bayes with a few smoothing strengths
nb_grid = {
    "clf__estimator": [MultinomialNB()],
    **tfidf_grid,
    "clf__estimator__alpha": (1e-2, 1e-3, 1e-1),
}

parameters = [sgd_grid, nb_grid]

# 5-fold cross-validated search over both grids, using 12 parallel jobs
gscv = GridSearchCV(
    pipeline,
    parameters,
    cv=5,
    n_jobs=12,
    return_train_score=False,
    verbose=3,
)
gscv.fit(train_x, train_y)


In [None]:
# Pull the winning hyperparameters and the fitted best pipeline out of the
# grid search
best_params = gscv.best_params_
best_pipe = gscv.best_estimator_

# Print the parameters first, then the pipeline
print(best_params)
print(best_pipe)


In [None]:
# Score the grid search's best model on the training and held-out test data
train_score = gscv.score(train_x, train_y)
print(f"Training set score: {train_score}")

test_score = gscv.score(test_x, test_y)
print(f"Test set score: {test_score}")


In [None]:
# Save the model to disk
filename = "outputs/finalized_model.sav"
pickle.dump(best_pipe, open(filename, "wb"))


In [None]:
# Predict on the test addresses and decode the integer labels back to
# country strings
encoded_predictions = best_pipe.predict(test_x)
pred_y = encoder.inverse_transform(encoded_predictions)
enc_test_y = encoder.inverse_transform(test_y)

# One row per test address: the address, its true country and the prediction
rows = np.array([test_x, enc_test_y, pred_y]).T
df = pd.DataFrame(rows, columns=["address", "country", "country_pred"])

# Accuracy table from the validation helper, comparing the true and
# predicted country columns
accuracy_df = validation.class_accuracy(df, "country", "country_pred")
display(accuracy_df)


## Look at the incorrectly labeled addresses

In [None]:
# Keep only the rows whose "correct" flag is False
# NOTE(review): the "correct" column is not created in this notebook;
# presumably validation.class_accuracy adds it to df in place - confirm
is_wrong = ~df["correct"]
falsely_predicted = df[is_wrong]

# Display the incorrectly labeled countries and the counts of what labels they were assigned
confusion_counts = falsely_predicted.groupby(["country", "country_pred"]).agg(
    {"correct": "count"}
)
confusion_counts = confusion_counts.rename(columns={"correct": "count"})
display(confusion_counts)


## Notes
- You can see that it's picking up the language. E.g. most of the BE addresses were labelled DE or FR (with one DE and one LU)

# General notes on this approach 
- NB is very fast so very scalable
- SVM takes a while to fit and predict on large datasets
- SVMs do not perform well when target classes overlap 
- The SVM results are not significantly better than the simple Naive Bayes approach (92.83 NB vs 93.01 SVM)