### Notes
- probably not enough data to use word2vec

In [1]:
# Imports
import collections
from datetime import datetime as dt
import numpy as np
import pandas as pd
import string
from tqdm.auto import tqdm # For debugging

# Custom imports
from utils import accessors, preprocess, validation

In [2]:
np.random.seed(500)
show_outputs = True

In [3]:
# File and location information
path = "dataset/"
add_file = "addresses.jsonl"
cit_file = "cities.jsonl"

# Load the datasets in pandas
address_df = accessoars.load_data(path, add_file) 
cit_df = accessors.load_data(path, cit_file) 

# Preprocess the datasets (remove punctuation and parse to lowercase)
address_df = preprocess.clean_strings(address_df, 'address') 
address_df = address_df.drop_duplicates()
cit_df = preprocess.clean_strings(cit_df, 'city') 
cit_df = cit_df.drop_duplicates()

if show_outputs: 
    display(address_df)
    display(cit_df)

NameError: name 'accessoars' is not defined

### Quick data analytics

In [None]:
# Any nans in the dataset?
print(address_df.isna().sum())
print(cit_df.isna().sum())

In [None]:
# Number of occurences of each country 
countries = address_df.groupby('country').count()
countries.plot.barh()
countries.plot.barh(log = True)

# Simple solution 
Split address into tokens and brute force search in city dataframe 

In [None]:
t1 = dt.now()
add_df = address_df.copy(deep=True).sample(n = 10000)
# Split the address into a list
add_df['address_split'] = add_df['address'].str.split()

dupe_cities = collections.defaultdict(list)

# Loop over each row, split address into tokens, and see if tokens are in city dataframe
for i, row in tqdm(add_df.iterrows(), total=add_df.shape[0]): 
    address = row['address_split']
    for token in address: 
        if token.isnumeric(): 
            pass
        elif token in cit_df['city'].values:
            try: 
                add_df.loc[i, 'country_out'] = cit_df.loc[cit_df['city'] == token, 'country'].item()
            except: 
                dupe_cities[token].append(cit_df.loc[cit_df['city'] == token, 'country'].values)
            
display(add_df)

t2 = dt.now()
print("Time for calculation: {}".format((t2-t1)))

results = validation.accuracy(add_df, label_col = 'country', prediction_col = 'country_out')
display(results)

### Short-comings
- only really using the city information, and not any of the other address data
- not all cities are one single string 
- special characters might change (e.g. ß -> ss) so the city will be missed
- some cities exist in multiple countries
- slow and not scalable

# Simple ML solution 
Split address into tokens and brute force search in city dataframe 
https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [None]:
import pandas as pd
import numpy as np
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
import sklearn 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

## Train test split 
- Train on 80% of the data and use the remaining 20% to validate the approach
- Stratify over country since they are not all equally occurring in the dataset

In [None]:
# Train test split
train_x, test_x, train_y, test_y = sklearn.model_selection.train_test_split(address_df['address'],
                                                                            address_df['country'],
                                                                            stratify=address_df['country'],
                                                                            test_size=0.2)



In [None]:
# Combine the city dataset with the address dataset for training
train_x = pd.concat([train_x,cit_df['city']])
train_y = pd.concat([train_y,cit_df['country']])

## Feature Generation

In [None]:
# Initialise label encoder (assigns numerical value to country str)
Encoder = LabelEncoder()

# Encode the labels
train_y = Encoder.fit_transform(train_y)
test_y = Encoder.fit_transform(test_y)

if show_outputs: 
    print(train_y)
    print(test_y)

In [None]:
# Initialise term-frequency - inverse document frequency vectoriser
tfidf_vect = TfidfVectorizer(max_features=5000)

# Fit and transform on addresses
tfidf_vect.fit(address_df['address'])
train_x_tfidf = tfidf_vect.transform(train_x)
test_x_tfidf = tfidf_vect.transform(test_x)

if show_outputs: 
    print(train_x_tfidf)

## Naive Bayes model

In [None]:
# fit the training dataset on the NB classifier
Naive = sklearn.naive_bayes.MultinomialNB()
print("fitting...")
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
print("predictiong...")
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy: ", accuracy_score(predictions_NB, Test_Y))

### Notes on this approach 
- Very fast so scalable

## Support Vector Machine

In [None]:
# fit the training dataset on the classifier
SVM = sklearn.svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
print("fitting...")
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
print("predicting...")
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy: ",accuracy_score(predictions_SVM, Test_Y))

## Decision tree

In [None]:
tree = DecisionTreeClassifier()
print("fitting...")
tree.fit(Train_X_Tfidf, Train_Y)
# predict the labels on validation dataset
print("predictiong...")
predictions_tree = tree.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Decision Tree Accuracy: ", accuracy_score(predictions_tree, Test_Y))

### Notes on this approach 
- Takes a while to fit 
- SVMs do not perform well when target classes overlap 
- Not particularly scalable as it's slow to fit and to predict on large datasets
- The results are not significantly better than the simple Naive Bayes approach (92.83 NB vs 93.01 SVM)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.base import BaseEstimator
class ClfSwitcher(BaseEstimator):

    def __init__(self, estimator = SGDClassifier()):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """ 

        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self


    def predict(self, X, y=None):
        return self.estimator.predict(X)


    def predict_proba(self, X):
        return self.estimator.predict_proba(X)


    def score(self, X, y):
        return self.estimator.score(X, y)

In [None]:
pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('clf', ClfSwitcher()),])

parameters = [
    {
        'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
        'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        'tfidf__stop_words': ['english', None],
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        'clf__estimator__loss': ['hinge', 'log_loss', 'modified_huber'],
    },
    {
        'clf__estimator': [MultinomialNB()],
        'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        'tfidf__stop_words': ['english', None],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
]

gscv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=12, return_train_score=False, verbose=3)
gscv.fit(train_x, train_y)

In [None]:
best_params = gscv.best_params_
print(best_params)
best_pipe = gscv.best_estimator_
print(best_pipe)

In [None]:
print('Training set score: ' + str(gscv.score(train_x, train_y)))
print('Test set score: ' + str(gscv.score(test_x, test_y)))

# print('Training set accuracy: ' + str(gscv.accuracy(train_x, train_y)))
# print('Test set accuracy: ' + str(gscv.accuracy(test_x, test_y)))