In [None]:
import pandas as pd
import numpy as np
import os
import re
#import spacy
from transformers import BertTokenizer, BertModel 
import torch 
from typing import  Tuple
from sklearn import pipeline, svm
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.metrics import  ConfusionMatrixDisplay , precision_score , recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# '/Users/geraldoflanagain/Downloads'

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Exploration

In [None]:
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
#train = pd.read_csv('/Users/geraldoflanagain/Downloads/train.csv')

In [None]:
train[0:10]

What does the location field look like and does it need cleaning?

In [None]:
train.groupby("location").id.nunique().sort_values(ascending=True).head(50)

Looks like there are some non-location strings.

In [None]:
train.groupby('target')['location'].nunique()

Initial Steps:

1/ Use spaCy to apply NER to the location field and remove non-locations.

2/ Second step of EDA on the keyword field; the initial thought is to train a keyword extraction model on the records with keywords and apply it to the blanks.

3/ Use scikit-learn or spaCy to create a pipeline with a feature union to concatenate the tags, location and text vectors together.

4/ Pick a transformer - preferably a pre-trained one.

5/ Output layer with the concatenated vectors and the dataset's labels.


# Feature Engineering

## Location

In [None]:
location = train['location'].astype('string')

### NER LOC & GPE Identification

In [None]:
# spaCy is imported in this cell because the top-level import is commented out;
# without it `spacy.load` raises NameError on a fresh kernel.
import spacy

nlp = spacy.load("en_core_web_sm")

# One entry per row of `location`: a missing value is passed through unchanged,
# every populated value is replaced by the spaCy Doc parsed from it.
doc_lst = [l if pd.isna(l) else nlp(l) for l in location]

In [None]:
# check results: (text, label) pairs of the entities found in the first 100 rows
for parsed in doc_lst[0:100]:
    if not pd.isna(parsed):
        print([(ent.text, ent.label_) for ent in parsed.ents])

In [None]:
# try with a different model
# spaCy is imported in this cell because the top-level import is commented out.
import spacy

trf = spacy.load("en_core_web_lg")

# Parse every populated location; missing values are passed through unchanged.
doc_lst_trf = [l if pd.isna(l) else trf(l) for l in location]

# Spot-check the first 100 rows: the raw location text next to the
# (text, label) pairs of the entities found in it.
for parsed in doc_lst_trf[0:100]:
    if not pd.isna(parsed):
        print(parsed.text, [(ent.text, ent.label_) for ent in parsed.ents])

A lot of locations are being identified as ORGs. From spot checking, the accuracy of this method doesn't seem great. Perhaps try some rule-based matching.

### Rules based country , city & state extraction

In [None]:
# Reference table of world cities (the `city` and `country` columns are used below).
# NOTE(review): this is an absolute path on a local machine; it will not resolve in the
# Kaggle environment that the `/kaggle/input/...` reads in this notebook assume.
cities = pd.read_csv('/Users/geraldoflanagain/Downloads/worldcities.csv')
cities.head()                      

In [None]:
def geo_like (source_lst ,geo_lst ):
    """Find which reference place names occur in each free-text location.

    Parameters
    ----------
    source_lst : iterable of str or missing values
        Free-text location strings to search.
    geo_lst : iterable
        Reference place names. Entries that are not non-empty strings (for
        example the NaN that ``Series.unique()`` yields for a column with gaps)
        are skipped instead of crashing ``re.escape``.

    Returns
    -------
    list
        One entry per element of ``source_lst``: the list of matching names in
        ``geo_lst`` order, or ``None`` when the element is missing or nothing
        matched.
    """
    # A name must match as a whole word: the lookarounds reject a match that is
    # directly preceded or followed by a letter. Matching ignores case.
    patterns = [
        (name, re.compile(r'(?<![^\W\d_])' + re.escape(name) + r'(?![^\W\d_])', re.IGNORECASE))
        for name in geo_lst
        if isinstance(name, str) and name
    ]

    dest_lst = []
    for text in source_lst:
        if pd.isna(text):
            dest_lst.append(None)
            continue
        hits = [name for name, pattern in patterns if pattern.search(text)]
        # No hit is reported as None, so callers only ever see None or a non-empty list.
        dest_lst.append(hits or None)

    return dest_lst

In [None]:
def find_long (dest_lst):
    """Pick the longest match for every row.

    Parameters
    ----------
    dest_lst : iterable
        Per-row match lists as produced by ``geo_like``: each element is either
        ``None`` or a sequence of strings.

    Returns
    -------
    list
        For each element, the longest string of its sequence (the first one on
        ties), or ``None`` when the element is ``None`` or empty.
    """
    return [
        # Identity check rather than ``== None``: equality on array-like
        # elements is element-wise. Empty sequences map to None because max()
        # would raise ValueError on them.
        None if matches is None or len(matches) == 0 else max(matches, key=len)
        for matches in dest_lst
    ]

In [None]:
## country
# The list of countries from the cities dataset doesn't include variations on country names, e.g. United States, USA, etc.
# There are probably datasets available that would cover most of the common permutations.

country_lst = cities['country'].unique()

country = find_long(geo_like(location , country_lst))

In [None]:
##  city
# Match each location against every distinct city name and keep the longest hit per row.

city_lst = cities['city'].unique()
city = find_long(geo_like(location , city_lst))

In [None]:
## state 

# NOTE(review): absolute path on a local machine; it will not resolve in the Kaggle
# environment that the `/kaggle/input/...` reads in this notebook assume.
states = pd.read_csv('/Users/geraldoflanagain/Downloads/states.csv')

# Distinct full state names
states_name_lst = states['State'].unique()

# Distinct state abbreviations
states_abv_lst = states['Abbreviation'].unique()

# Longest full-name match per location (None when nothing matched)
state_name = find_long(geo_like(location , states_name_lst))

# Longest abbreviation match per location (None when nothing matched).
# NOTE(review): geo_like matches case-insensitively, so short abbreviations will presumably
# also hit ordinary lowercase words spelled the same way — confirm against the output.
state_abv = find_long(geo_like(location , states_abv_lst ))

In [None]:
states.head()

In [None]:
## add the extracted geography columns to the train dataset

train['country'] = country
train['city'] = city
train['state'] = state_name
train['state_abv'] = state_abv

In [None]:
# First rows that have a location value
train[train['location'].notna()].head()

In [None]:
# fill in blank countries where the city has been identified
# A city name is only usable when it maps to exactly one country in the reference data.
countries_per_city = cities.groupby('city')['country'].nunique()
singilton = countries_per_city[countries_per_city == 1]

# city -> country lookup restricted to those unambiguous cities (one row per city)
city_country = (
    cities.loc[cities['city'].isin(singilton.index), ['city', 'country']]
    .dropna(subset=['country'])
    .drop_duplicates('city')
    .set_index('city')['country']
)

# Map instead of merge: no helper `country_x` column is left on `train`, rows can never
# be duplicated, and re-running the cell gives the same result.
train['country'] = train['country'].fillna(train['city'].map(city_country))


In [None]:
# fill in blank countries where the state has been identified
# Only rows that have a state match AND no country yet are written. Assigning the
# filtered Series back to the whole column (as before) aligned on the index and
# replaced the country of every row without a state match by NaN.
has_state = train['state'].notna() | train['state_abv'].notna()
train.loc[has_state & train['country'].isna(), 'country'] = "United States"

# create one state column with the two letter code

In [None]:
# First rows that have a location value, after the country back-fill
train[train['location'].notna()].head()

## Key word extraction
Using the values already populated  in the keyword column, train a model to extract keywords for the null values.

In [None]:
# NOTE(review): `no_of_words` was not defined anywhere in this notebook, so this cell
# raised NameError on a fresh kernel. It is reconstructed here as the number of
# whitespace-separated values in each keyword (missing keywords count as 0) — confirm
# this matches the original intent.
no_of_words = [0 if pd.isna(k) else len(str(k).split()) for k in train['keyword']]

# Positions of the rows that hold more than one keyword value
[index for index, value in enumerate(no_of_words) if value > 1]

There is just one keyword value per row.

In [None]:
# Per-target non-null counts of every column, restricted to rows without a keyword
train[train['keyword'].isna()].groupby('target').count()

As it's just 61 rows, I will remove them from the dataset.

In [None]:
# Drop the rows without a keyword but keep `train` a DataFrame: the later cells
# still need its 'text' and 'target' columns. (`train['keyword'].dropna()` would
# replace the frame with the keyword Series.)
train = train.dropna(subset=['keyword'])

In [None]:
train.describe()

It's not clear which is better to use: something like word2vec, which gives a single vector for each word, or a transformer model like BERT. The problem with BERT is that it splits words into sub-tokens, which doesn't seem like the best way to extract information from a single-word feature.

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# `gensim.sklearn_api.W2VTransformer` and its `size=` argument belong to the gensim 3 API,
# while the custom transformer in this cell uses the gensim 4 keyword `vector_size`.
# The demo therefore uses `Word2Vec` directly so the whole cell runs on one gensim version.

# Create a model to represent each word by a 10 dimensional vector.
model = Word2Vec(common_texts, vector_size=10, min_count=1, seed=1)

# What are the vector representations of the words 'graph' and 'system'?
wordvecs = model.wv[['graph', 'system']]

# I think I need a custom transformer in order to be able to apply this to just one feature
# https://www.kaggle.com/code/tarekyahia/word2vec-custom-column-transformer-pipelines
import gensim

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
class preprocess_s2v(BaseEstimator, TransformerMixin):
    """Represent each tokenised document by the mean of its Word2Vec vectors.

    ``fit`` trains a gensim ``Word2Vec`` model on the documents it receives;
    ``transform`` maps every document to the mean vector of its in-vocabulary
    tokens, or to a zero vector when none of its tokens are known.

    Parameters
    ----------
    vector_size : int, default=150
        Dimensionality of the word vectors, and therefore of each output row.
    window : int, default=5
        Passed to ``gensim.models.Word2Vec`` as ``window``.
    min_count : int, default=1
        Passed to ``gensim.models.Word2Vec`` as ``min_count``.
    """

    def __init__(self, vector_size=150, window=5, min_count=1):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count

    def fit(self, X, y = None):
        """Train the Word2Vec model on ``X`` (an iterable of token lists); ``y`` is ignored."""
        self.model = gensim.models.Word2Vec(X,
                                   vector_size=self.vector_size,
                                   window=self.window,
                                   min_count=self.min_count)
        self.words =  set(self.model.wv.index_to_key)
        return self

    def transform(self, X):
        """Return an array with one row per document in ``X``."""
        # Take the width from the trained model so the zero fallback always has the
        # same length as a real mean vector (it used to be hard-coded to 100 while
        # the model produced 150-dimensional vectors).
        dim = self.model.wv.vector_size
        rows = []
        for tokens in X:
            known = [self.model.wv[tok] for tok in tokens if tok in self.words]
            rows.append(np.mean(known, axis=0) if known else np.zeros(dim, dtype=float))
        if not rows:
            return np.empty((0, dim), dtype=float)
        return np.vstack(rows)

# Train Classifier

## bert-base-uncased

In [None]:
class tokenizer( BaseEstimator,TransformerMixin):
    """scikit-learn transformer that turns raw strings into BERT input tensors.

    Wraps the pre-trained ``bert-base-uncased`` tokenizer. Nothing is learned
    from the data, so ``fit`` is a no-op.
    """
    def __init__(
        self,
    ):
        self.pre_trained = BertTokenizer.from_pretrained("bert-base-uncased")
        self.add_special_tokens = True
        
    def _tokenize(self, text :str) :
        """Encode one string.

        Returns
        -------
        tuple of torch.Tensor
            ``(input_ids, attention_mask)``, each with a leading dimension of 1
            added by ``unsqueeze(0)``.
        """
        # Call the tokenizer directly rather than the deprecated `encode_plus`,
        # and request truncation explicitly so `max_length` is actually enforced
        # instead of relying on the library's warning-and-fallback behaviour.
        tokenized = self.pre_trained(
            text,
            add_special_tokens = self.add_special_tokens,
            max_length = 512,
            truncation = True,
            )
        return (
            torch.tensor(tokenized["input_ids"]).unsqueeze(0),
            torch.tensor(tokenized["attention_mask"]).unsqueeze(0),
        )
    
    def transform ( self, X):
        """Tokenise every string in ``X``.

        ``X`` may be a pandas Series / numpy array (anything with ``tolist``) or
        any other iterable of strings. Returns a list with one
        ``(input_ids, attention_mask)`` tuple per input string.
        """
        text = X.tolist() if hasattr(X, "tolist") else list(X)
        # Tokenisation involves no autograd, so no `torch.no_grad()` is needed here.
        return [self._tokenize(string) for string in text]

    def fit( self, X, y=None):
        """No-op; present to satisfy the scikit-learn transformer interface."""
        return self

In [None]:
class bertmodel(BaseEstimator,TransformerMixin):
    """scikit-learn transformer mapping tokenised inputs to BERT sentence vectors.

    Each input is run through the pre-trained ``bert-base-uncased`` model and
    represented by the last-layer hidden state at sequence position 0.
    """
    def __init__(
        self
    ):
        self.bert_model = BertModel.from_pretrained("bert-base-uncased")

    def _berty (self , tolkens , attention_mask):
        """Run one encoded input through BERT and return its position-0 hidden state."""
        with torch.no_grad():
            model_out = self.bert_model(tolkens, attention_mask = attention_mask)
        # Element 0 of the model output is the last hidden state; keep position 0 only.
        return model_out[0][:, 0, :]

    def transform ( self, X):
        """Embed every ``(input_ids, attention_mask)`` pair in ``X``; returns one row per pair."""
        with torch.no_grad():
            per_item = []
            for tolkens, attention_mask in X:
                per_item.append(self._berty(tolkens, attention_mask))
            stacked = torch.stack(per_item)
            # Drop the per-item dimension of size 1 that each `_berty` result carries.
            return stacked[:, 0, :]

    def fit(self, X, y=None):
        """No-op; present to satisfy the scikit-learn transformer interface."""
        return self


In [None]:
train_x, test_x, train_y, test_y = train_test_split(train['text'], train['target'], test_size=0.2, random_state=42)

questions left:
If I just use the CLS token, does that capture multi-sentence tweets correctly?/

How does the sklearn pipeline know what to pass as an output from one step to the inputs of the next step?/

Can I use UDFs instead of classes for the pipeline steps?/
How do you navigate through a tensor's structure / how does a tensor work?/
Should I be using the attention mask or is it being used by default?

Do I need to pre-initialise the estimators or do it in the fit method of each step?

Should I be using the fit or fit_transform methods of the pipeline?

The classes work individually and together, outside the pipeline. It's the pipeline itself that's making stringing the steps together difficult. Perhaps this does suggest something to do with the initialisation.

https://medium.com/@benlc77/how-to-write-clean-and-scalable-code-with-custom-transformers-sklearn-pipelines-ecb8e53fe110


In [None]:
classifier = svm.LinearSVC(C=1.0, class_weight="balanced")
# The instances keep the names the pipeline cell refers to, which rebinds
# `bertmodel` / `tokenizer` from the class to an instance. Instantiate only while
# the name still refers to the class, so re-running this cell no longer fails
# with "object is not callable".
if isinstance(bertmodel, type):
    bertmodel = bertmodel()
if isinstance(tokenizer, type):
    tokenizer = tokenizer()

In [None]:
# Chain tokenisation, BERT embedding and the linear SVM into one estimator, then fit it.
pipeline_steps = [
    ("tokenizer", tokenizer),
    ("vectorizer", bertmodel),
    ("classifier", classifier),
]
model = pipeline.Pipeline(steps=pipeline_steps)
model.fit(train_x, train_y)

In [None]:
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
y_pred = model.predict(test_x)  

In [None]:
# Confusion matrix of the predictions on the hold-out split
disp = ConfusionMatrixDisplay.from_predictions(test_y, y_pred)
plt.show()

# Compute the scores first, then report them
precision = precision_score(test_y, y_pred)
recall = recall_score(test_y, y_pred)
f1 = f1_score(test_y, y_pred)
accuracy = accuracy_score(test_y, y_pred)

print('Precision: %.3f' % precision)
print('Recall: %.3f' % recall)
print('F1: %.3f' % f1)
print('Accuracy: %.3f' % accuracy)

## TwHIN-BERT

In [None]:
from transformers import AutoTokenizer, AutoModel

# Dedicated names: reusing `tokenizer` / `model` would overwrite the BERT tokenizer
# instance and the fitted scikit-learn pipeline created earlier in this notebook.
twhin_tokenizer = AutoTokenizer.from_pretrained('Twitter/twhin-bert-base')
twhin_model = AutoModel.from_pretrained('Twitter/twhin-bert-base')
inputs = twhin_tokenizer("I'm using TwHIN-BERT! #TwHIN-BERT #NLP", return_tensors="pt")
# Inference only, so skip gradient tracking.
with torch.no_grad():
    outputs = twhin_model(**inputs)

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV

### References

https://towardsdatascience.com/build-a-bert-sci-kit-transformer-59d60ddd54a5

https://medium.com/@khang.pham.exxact/text-classification-with-bert-7afaacc5e49b

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder

@article{zhang2022twhin,
  title={TwHIN-BERT: A Socially-Enriched Pre-trained Language Model for Multilingual Tweet Representations},
  author={Zhang, Xinyang and Malkov, Yury and Florez, Omar and Park, Serim and McWilliams, Brian and Han, Jiawei and El-Kishky, Ahmed},
  journal={arXiv preprint arXiv:2209.07562},
  year={2022}
}

https://towardsdatascience.com/pre-processing-should-extract-context-specific-features-4d01f6669a7e

tokenization:
https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils.py
https://github.com/google-research/bert/blob/master/tokenization.py
None the wiser on how the special-token handling deals with #; I'm guessing it doesn't extract the semantic meaning.

https://towardsdatascience.com/the-ultimate-guide-to-training-bert-from-scratch-the-tokenizer-ddf30f124822

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

https://datasciencetoday.net/index.php/en-us/nlp/211-paper-dissected-bert-pre-training-of-deep-bidirectional-transformers-for-language-understanding-explained

https://lifewithdata.com/2023/05/27/transformermixin-in-scikit-learn/