# Tech Classification Prototype

## Setup training

In [1]:
import pandas as pd

### Import

In [2]:
# Load the labelled abstracts from a path relative to the notebook's working directory.
df = pd.read_csv('csv/tech_no_tech.csv')
# Overwrite the header with three names; this raises if the file does not
# contain exactly three columns.
df.columns = ['id', 'label', 'abstract']
df.head()

Unnamed: 0,id,label,abstract
0,22368089,N,The Cat-301 monoclonal antibody identifies agg...
1,30549480,N,Objective: To characterize the prevalence of m...
2,30534812,N,"We conducted a retrospective study, between 20..."
3,30532051,Y,Antipsychotic (AP) drugs are used to treat psy...
4,30531921,Y,Neural prostheses decode intention from cortic...


In [3]:
# Class balance of the target column.
df['label'].value_counts()

N    137
Y     77
Name: label, dtype: int64

### Text cleaning

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
import unidecode, re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [5]:
def clean_string(x
                 , regex_string = [r'[^\w\s]', r'\n']
                 , replacement = ' '):
    """Normalize a raw string ahead of tokenization.

    Steps, in order: lowercase; drop every character that cannot be encoded
    as Latin-1; transliterate what remains to ASCII with ``unidecode``;
    replace each match of every pattern in ``regex_string`` with
    ``replacement``; collapse runs of spaces to one space and strip the ends.

    Parameters
    ----------
    x : str
        Text to clean.
    regex_string : iterable of str
        Regular-expression patterns applied one after another. The default
        list is only read, never mutated.
    replacement : str
        Text substituted for every pattern match.

    Returns
    -------
    str
        The cleaned string.
    """
    x = x.lower()
    # errors="ignore" silently discards characters outside Latin-1, so they
    # never reach unidecode below.
    x = x.encode("latin1", errors="ignore").decode('latin1')
    x = unidecode.unidecode(x)
    for pattern in regex_string:
        x = re.sub(pattern, replacement, x)
    # Substitutions can leave several spaces in a row; squeeze them.
    x = re.sub(' +', ' ', x).strip()
    return x

def tokenize(x, delimeter = ' '):
    """Split ``x`` on every occurrence of ``delimeter`` and return the pieces.

    Consecutive delimiters produce empty-string tokens, exactly as
    ``str.split`` does when given an explicit separator.
    """
    tokens = x.split(delimeter)
    return tokens

def clean_tokens(x
                 , stop_words = stopwords.words('english')
                 , min_string_length = 2
                 , stem = True
                 , lemmatize_pos = None
                 , sort=True):
    """Filter and normalize a list of tokens.

    Parameters
    ----------
    x : iterable of str
        Tokens to process.
    stop_words : iterable of str
        Tokens to discard. The default is evaluated once, when the function
        is defined.
    min_string_length : int
        Tokens shorter than this are discarded.
    stem : bool
        When true, each token is stemmed with the English Snowball stemmer.
    lemmatize_pos : str or None
        When not None, each token is lemmatized with WordNet using this
        part-of-speech tag (for example ``'v'``).
    sort : bool
        When true, the resulting tokens are returned in sorted order.

    Returns
    -------
    list of str
    """
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    stop_set = set(stop_words)
    tokens = [w for w in x
              if w not in stop_set and len(w) >= min_string_length]
    if stem:
        # Build the stemmer once rather than once per token.
        stemmer = SnowballStemmer("english", ignore_stopwords=False)
        tokens = [stemmer.stem(w) for w in tokens]
    if lemmatize_pos is not None:
        # Same hoisting for the lemmatizer.
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(w, pos=lemmatize_pos) for w in tokens]
    if sort:
        tokens = sorted(tokens)
    return tokens
    
def clean_all(x
              , regex_string = [r'[^\w\s]', r'\n']
              , replacement = ' '
              , delimeter = ' '
              , stop_words = stopwords.words('english')
              , min_string_length = 2
              , stem = True
              , lemmatize_pos = None
              , sort=False):
    """Run the full cleaning chain on one raw string.

    Calls ``clean_string``, then ``tokenize``, then ``clean_tokens``,
    forwarding the matching keyword arguments to each stage.

    Parameters
    ----------
    x : str
        Raw text.
    regex_string, replacement
        Forwarded to ``clean_string``.
    delimeter
        Forwarded to ``tokenize``.
    stop_words, min_string_length, stem, lemmatize_pos, sort
        Forwarded to ``clean_tokens``. Note that ``sort`` defaults to False
        here, so token order is preserved unless the caller asks otherwise.

    Returns
    -------
    list of str
        The cleaned tokens.
    """
    cleaned = clean_string(x
                           , regex_string = regex_string
                           , replacement = replacement)
    tokens = tokenize(cleaned
                      , delimeter = delimeter)
    return clean_tokens(tokens
                        , stop_words = stop_words
                        , min_string_length = min_string_length
                        , stem = stem
                        , lemmatize_pos = lemmatize_pos
                        , sort = sort)

In [6]:
## Wrap the cleaning functions in a transformer class so they can be added to the text pipeline later

class StringClean(BaseEstimator, TransformerMixin):
    """Transformer that converts raw strings into lists of cleaned tokens.

    Each element of the input is passed through ``clean_all`` using the
    options stored on the instance.

    Parameters
    ----------
    regex_string : iterable of str
        Patterns whose matches are replaced during string cleaning.
    replacement : str
        Text substituted for each pattern match.
    stop_words : iterable of str
        Tokens to discard.
    min_string_length : int
        Minimum length a token must have to be kept.
    stem : bool
        Whether tokens are stemmed.
    lemmatize_pos : str or None
        Part-of-speech tag for lemmatization, or None to skip it.
    """

    def __init__(self
                 , regex_string = [r'[^\w\s]', r'\n']
                 , replacement = ' '
                 , stop_words = stopwords.words('english')
                 , min_string_length = 2
                 , stem = True
                 , lemmatize_pos = None):
        self.regex_string = regex_string
        self.replacement = replacement
        self.stop_words = stop_words
        self.min_string_length = min_string_length
        self.stem = stem
        self.lemmatize_pos = lemmatize_pos

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a list holding the cleaned token list for each item of ``X``."""
        # Forward the configured options; previously clean_all was called
        # with its own defaults, so constructor arguments had no effect.
        return [clean_all(x
                          , regex_string = self.regex_string
                          , replacement = self.replacement
                          , stop_words = self.stop_words
                          , min_string_length = self.min_string_length
                          , stem = self.stem
                          , lemmatize_pos = self.lemmatize_pos)
                for x in X]

In [7]:
# Store the cleaned token list of every abstract in a new 'text' column.
df['text'] = StringClean().fit_transform(df['abstract'])

## Unsupervised structuring

In [8]:
import numpy as np

from scipy.sparse import csr_matrix, hstack # for stacking dimensionality reduction matricies

from gensim.models.doc2vec import TaggedDocument as td
from gensim.models import Doc2Vec as d2v #, phrases as bigram # Use sklearn tfidf vectorizer instead, as ngram > 2

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.decomposition import TruncatedSVD, NMF # PCA will not work on sparse matricies

from sklearn.base import BaseEstimator, TransformerMixin

### Choose columns to extract

In [9]:
class col_chooser(BaseEstimator, TransformerMixin):

    """Choose which heterogeneous feature to feed into the pipeline.

    Parameters
    ----------
    key : str
        Name used to index the input container in ``transform``.
    """

    def __init__(self, key = ''):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``, or None when that lookup fails."""
        try:
            return X[self.key]
        except (KeyError, IndexError, TypeError, ValueError):
            # Best-effort: a missing or unsupported key yields None so that
            # unused text slots can stay in the pipeline. Only lookup errors
            # are caught; anything else (e.g. KeyboardInterrupt) propagates.
            return None

### TF-IDF + Truncated SVD

In [10]:
class tfidf_tsvd_struct_pipeline(BaseEstimator, TransformerMixin):

    """Create TF-IDF vectorized features from text, optionally SVD-reduced.

    Parameters
    ----------
    n_components : int or None
        Number of TruncatedSVD components. None skips the reduction and
        returns the TF-IDF matrix itself.
    norm : str
        Passed to ``TfidfVectorizer`` as ``norm``.
    ngram_range : tuple
        Passed to ``TfidfVectorizer`` as ``ngram_range``.
    preprocessor : callable
        Passed to ``TfidfVectorizer`` as ``preprocessor``. The default
        ``' '.join`` joins an iterable of strings into one string.
    """

    def __init__(self, n_components=100, norm='l2', ngram_range=(1, 2), preprocessor = ' '.join):
        self.n_components = n_components
        self.norm = norm
        self.ngram_range = ngram_range
        self.preprocessor = preprocessor

    def fit(self, X, *_, **args):
        """Fit the vectorizer (and the SVD, when enabled) on ``X``.

        Extra keyword arguments are forwarded to ``TfidfVectorizer``.
        When ``X`` is None nothing is fitted and ``transform`` returns None.
        """
        if X is None:
            # Record the unfitted state explicitly so transform can test it
            # without raising AttributeError.
            self.tf_idf = None
            return self

        self.tf_idf = TfidfVectorizer(norm = self.norm
                                      , ngram_range = self.ngram_range
                                      , preprocessor = self.preprocessor
                                      , **args)
        # fit_transform avoids vectorizing the corpus a second time.
        tfidf_matrix = self.tf_idf.fit_transform(X)
        if self.n_components is not None:
            # NOTE(review): no random_state is given to TruncatedSVD, so the
            # decomposition is not seeded here.
            self.t_m = TruncatedSVD(self.n_components).fit(tfidf_matrix)
        return self

    def transform(self, X, *_):
        """Return the features for ``X`` as a CSR matrix, or None if unfitted."""
        tf_idf = getattr(self, 'tf_idf', None)
        if tf_idf is None:
            return None

        tfidf_matrix = tf_idf.transform(X)
        if self.n_components is not None:
            return csr_matrix(self.t_m.transform(tfidf_matrix))
        return csr_matrix(tfidf_matrix)

### Doc2Vec

In [11]:
class d2v_struct_pipeline(BaseEstimator, TransformerMixin):

    """Create D2V vectorized features from text.

    Background reading:
      https://arxiv.org/pdf/1405.4053v2.pdf
      https://arxiv.org/pdf/1301.3781.pdf
      https://medium.com/@amarbudhiraja/understanding-document-embeddings-of-doc2vec-bfe7237a26da

    The constructor arguments are stored and handed to gensim's ``Doc2Vec``
    under the same names when ``fit`` is called.
    """

    def __init__(self, vector_size=100, window=10, min_count=1, dm=1):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.dm = dm

    def fit(self, X, *_, **args):
        """Train a Doc2Vec model on ``X``; extra keywords go to ``Doc2Vec``."""
        # Tag every document with its position in X.
        tagged_docs = [td(tokens, [position]) for position, tokens in enumerate(X)]
        model_options = dict(vector_size=self.vector_size
                             , window=self.window
                             , min_count=self.min_count
                             , dm=self.dm)
        self.d2v_dm = d2v(tagged_docs, **model_options, **args)
        return self

    def transform(self, X, *_):
        """Infer one vector per item of ``X`` and return them as a CSR matrix."""
        inferred = []
        for tokens in X:
            inferred.append(self.d2v_dm.infer_vector(tokens))
        return csr_matrix(inferred)

## Gridsearch Pipeline (should use randomized search)

In [None]:
from sklearn.svm import LinearSVC

from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer, precision_score, recall_score

from sklearn.calibration import CalibratedClassifierCV

### Define pipeline

In [None]:
## PERMIT MULTIPLE UNSUPERVISED EMBEDDING OPTIONS
# One TF-IDF/SVD block plus two Doc2Vec blocks that differ only in `dm`.
unsupervised_union = FeatureUnion([
    ("tfidf_svd", tfidf_tsvd_struct_pipeline(preprocessor=' '.join)),
    ("d2v1", d2v_struct_pipeline(dm=1)),
    ("d2v0", d2v_struct_pipeline(dm=0)),
])

## ALLOW VARYING NUMBER OF TEXT FIELDS
# Only 'text_1' is active. More fields can be added as further
# ('text_N', Pipeline([('col', col_chooser(key=...)), ('comb', unsupervised_union)]))
# entries. To clean inside the pipeline instead of beforehand, select
# key='abstract' and insert ('str_clean', StringClean()) before 'comb'.
text_inputs = FeatureUnion([
    ('text_1', Pipeline([
        ('col', col_chooser(key='text')),
        ('comb', unsupervised_union),
    ])),
])

## CLASSIFICATION
# Linear SVM wrapped in a calibrator (alternative tried: SVC(kernel="linear")).
classifier = CalibratedClassifierCV(LinearSVC(class_weight='balanced'))

# PUT IT IN PIPELINE
# The step name "classifer" (sic) is kept as-is because parameter names
# derive from it.
pipeline = Pipeline([
    ("features", text_inputs),
    ("classifer", classifier),
])  # memory=cachedir

# DEFINE PERFORMANCE METRICS
scoring = {
    'f1_macro': 'f1_macro',
    'roc_curve': 'roc_auc',
    'precision': 'precision_macro',
    'recall': 'recall_macro',
}

### Define gridsearch parameters

In [None]:
# import sagemaker
# from sagemaker import get_execution_role
# from sagemaker.sklearn import SKLearn

# sagemaker_session = sagemaker.Session()
# role = get_execution_role()
# instance_type = "ml.m4.xlarge"

# train_data_location = sagemaker_session.upload_data(
#     path='../csv/tech_no_tech.csv', key_prefix="data"
# )

# sagemaker_session.default_bucket()

In [None]:
# GET NAMES OF PARAMETERS FOR GRIDSEARCH
# sorted(pipeline.get_params().keys())

# DEFINE GRIDSEARCH
# One candidate value per embedding size. A `features__transformer_weights`
# entry (e.g. {'text_1': 1, 'text_2': 1, 'text_3': 0, 'text_4': 0}) can be
# added once more text fields are enabled.
param_grid = {
    'features__text_1__comb__tfidf_svd__n_components': [400],
    'features__text_1__comb__d2v1__vector_size': [300],
    'features__text_1__comb__d2v0__vector_size': [400],
}

# Refit the best configuration according to the 'roc_curve' scorer
# (alternative: refit=False).
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=5,
    scoring=scoring,
    refit='roc_curve',
    return_train_score=True,
    verbose=False,
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; the feature frames keep every column of df.
X_train, X_test, y_train, y_test = train_test_split(
    df, df['label'], test_size=0.2, random_state=42
)

In [None]:
# Fit on the training rows, passed as a NumPy record array.
gs_fit = grid_search.fit(X_train.to_records(), y_train)

In [None]:
# Tabulate the search results and show the mean test score per metric.
cv_results = pd.DataFrame(gs_fit.cv_results_)
cv_results[['params'] + [col for col in cv_results.columns
                         if 'mean' in col and 'test' in col]]

In [None]:
# Per-fold test scores for the 'roc_curve' metric.
cv_results[[col for col in cv_results.columns
            if 'split' in col and 'test_roc_curve' in col]]

In [None]:
## GET BEST PREDICTED CLASS
# Predict on the held-out rows. `.assign` builds a new frame instead of
# writing a column into the frame returned by train_test_split, which avoids
# pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
# (To score the full dataset instead: df.assign(prediction=gs_fit.predict(df)).)
X_test = X_test.assign(prediction=gs_fit.predict(X_test))

In [None]:
## GET PROBABILITIES PER CLASS
# Keep only the first probability column, i.e. the one for gs_fit.classes_[0].
# Slicing the array replaces the Python-level loop over .tolist(), and
# `.assign` avoids writing into the frame returned by train_test_split.
X_test = X_test.assign(prob_a=gs_fit.predict_proba(X_test)[:, 0])

In [None]:
# Fraction of held-out rows whose predicted label equals the true label.
sum(X_test['label'] == X_test['prediction']) / len(X_test)

In [None]:
# Write the held-out rows, including any columns added above, to a CSV file
# at a path relative to the notebook's working directory.
X_test.to_csv('csv/tech_test_results.csv')