### Notes
- probably not enough data to use word2vec
- need to add summary

In [None]:
# Imports
from datetime import datetime as dt

import numpy as np
import pandas as pd
import sklearn
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Custom imports
import algorithms
from utils import accessors, preprocess, validation


## Setup, load data and preprocess

In [None]:
# Seed NumPy's global random number generator with a fixed value
RANDOM_SEED = 500
np.random.seed(RANDOM_SEED)

# Flag checked by later cells before they display/print intermediate results
show_outputs = True


In [None]:
# Dataset directory and the two file names inside it
path = "dataset/"
add_file = "addresses.jsonl"
cit_file = "cities.jsonl"

# For each dataset: load it, clean its text column (remove punctuation and
# parse to lowercase), then drop duplicated rows
address_df = preprocess.clean_strings(
    accessors.load_data(path, add_file), "address"
).drop_duplicates()
cit_df = preprocess.clean_strings(
    accessors.load_data(path, cit_file), "city"
).drop_duplicates()

# Optionally show both cleaned dataframes
if show_outputs:
    for _frame in (address_df, cit_df):
        display(_frame)


### Quick data analytics

In [None]:
# Count the missing values per column in each dataset
for _frame in (address_df, cit_df):
    print(_frame.isna().sum())


In [None]:
# Number of occurrences of each country
countries = address_df.groupby(by="country").count()

# Horizontal bar charts of the counts: first on a linear axis, then on a log axis
countries.plot(kind="barh")
countries.plot(kind="barh", log=True)


# Simple city-lookup solution
Split each address into tokens and brute-force search for those tokens in the city dataframe.

In [None]:
# Time the whole split-search run. `dt` is `datetime.datetime`, imported in the
# imports cell (`from datetime import datetime as dt`).
t1 = dt.now()

# Initiate SplitSearch with the address and city dataframes and run it
split_search = algorithms.SplitSearch(address_df, cit_df)
split_search.run()
add_df_out = split_search.get_df()  # TODO: Save this

# Get the accuracy
accuracy = split_search.get_accuracy()  # TODO: Save this
display(accuracy)

# Get the cities that occur in multiple countries
dupe_cities = split_search.get_dupe_cities()
print(dupe_cities)

# Report the elapsed wall-clock time (a `datetime.timedelta`)
t2 = dt.now()
print(f"Time for calculation: {t2 - t1}")


### Shortcomings
- only the city information is used; the rest of the address data is ignored
- not all city names are a single token, so a token-by-token search misses multi-word cities
- special characters might change (e.g. ß -> ss), so the city will be missed
- some cities exist in multiple countries
- slow and not scalable

# Simple ML solution 
- First split the data into training and test datasets
- Encode the label
- Build a pipeline of different models and sets of hyper parameters to grid search over (including encoding the address)
- Extract the best model

## Train test split 
- Train on 80% of the data and use the remaining 20% to validate the approach
- Stratify over country since they are not all equally occurring in the dataset

In [None]:
# Features are the address strings, targets are the country labels
address_text = address_df["address"]
country_labels = address_df["country"]

# Hold out 20% of the addresses for testing, stratified on country
train_x, test_x, train_y, test_y = sklearn.model_selection.train_test_split(
    address_text,
    country_labels,
    test_size=0.2,
    stratify=country_labels,
)

# Append the city dataset (city -> country) to the training data only
train_x = pd.concat([train_x, cit_df["city"]], axis=0)
train_y = pd.concat([train_y, cit_df["country"]], axis=0)


### Label encoder

In [None]:
# Initialise label encoder (assigns numerical value to country str)
Encoder = LabelEncoder()

# Learn the country -> integer mapping from the training labels only, then
# apply that SAME mapping to the test labels. Re-fitting on the test labels
# (fit_transform) would build a second, independent mapping, so the same
# country could get different integer codes in train and test whenever the two
# label sets differ (train_y also contains the city-dataset countries).
# NOTE: transform raises ValueError if test_y contains a country that was not
# present in train_y.
train_y = Encoder.fit_transform(train_y)
test_y = Encoder.transform(test_y)

if show_outputs:
    print(train_y)
    print(test_y)


### Vectorizer, model and hyperparameter pipeline

In [None]:
# Set up an sklearn pipeline (TF-IDF vectoriser followed by a switchable
# classifier step) so we can grid search for the best model and hyperparameters

pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", algorithms.ClfSwitcher()),
    ]
)

# Vectoriser settings that are searched for every candidate classifier
tfidf_grid = {
    "tfidf__max_df": (0.25, 0.5, 0.75, 1.0),
    "tfidf__stop_words": ["english", None],
}

# Candidate 1: linear model trained with SGD
# (SVM if hinge loss / logreg if log loss)
sgd_grid = {
    "clf__estimator": [SGDClassifier()],
    **tfidf_grid,
    "clf__estimator__penalty": ("l2", "elasticnet", "l1"),
    "clf__estimator__max_iter": [50, 80],
    "clf__estimator__tol": [1e-4],
    "clf__estimator__loss": ["hinge", "log_loss", "modified_huber"],
}

# Candidate 2: multinomial Naive Bayes with a few smoothing values
nb_grid = {
    "clf__estimator": [MultinomialNB()],
    **tfidf_grid,
    "clf__estimator__alpha": (1e-2, 1e-3, 1e-1),
}

parameters = [sgd_grid, nb_grid]

# 5-fold cross-validated search over both grids, run on 12 parallel jobs
gscv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=12,
    return_train_score=False,
    verbose=3,
)
gscv.fit(train_x, train_y)


In [None]:
# Pull the winning hyperparameters and the corresponding fitted pipeline
# out of the finished grid search
best_params = gscv.best_params_
best_pipe = gscv.best_estimator_
print(best_params)
print(best_pipe)

# Score the search object on the training data and on the held-out test data
train_score = gscv.score(train_x, train_y)
print(f"Training set score: {train_score!s}")
test_score = gscv.score(test_x, test_y)
print(f"Test set score: {test_score!s}")


In [None]:
# Now build the classifier with the best model, save the model and then we can easily take a new address and output a country


### Notes on this approach 
- NB is very fast, so it is very scalable
- SVM takes a while to fit and predict on large datasets
- SVMs do not perform well when target classes overlap 
- The SVM results are not significantly better than the simple Naive Bayes approach (92.83 NB vs 93.01 SVM)