# Import libraries:

In [2]:
########################################
# General:
########################################

import pandas as pd, numpy as np, os, sys, copy
from tqdm import tqdm

########################################
# String cleaning:
########################################
import unidecode, re
from nltk.stem.snowball import SnowballStemmer

########################################
# For the custom pipeline architecture:
########################################
import itertools
# from itertools import chain, itertools
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin # to define class for use in pipeline

##########
# Tokenisation
##########
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 

##########
# Dimensionality reduction
##########
from sklearn.decomposition import TruncatedSVD, NMF # PCA will not work on sparse matricies

##########
# Classification
##########
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier

from sklearn.multiclass import OneVsRestClassifier
from sklearn.calibration import CalibratedClassifierCV

##########
# Pipeline
##########
from sklearn.model_selection import GridSearchCV, StratifiedKFold

##########
# Measuring performance
##########
from sklearn.metrics import average_precision_score, make_scorer, confusion_matrix, precision_recall_fscore_support

# Define functions

## Clean Strings

In [3]:
########################################
## Clean text strings function
########################################

def to_latin(X):
    """
    Remove non-European characters whilst keeping accented european characters from pandas column
    :param X: Pandas column of raw strings
    :return X: Pandas column of cleaned strings
    """
    X = list(map(lambda x: x.encode("latin1", errors="ignore").decode('latin1'), X))
    return(X)

def replace_accents(X):
    """
    Replace accented characters with non-accented characters
    :param X: Pandas column of raw strings
    :return X: Pandas column of cleaned strings
    """
    X = list(map(lambda x: unidecode.unidecode(x), X))
    return(X)

def remove_short_strings(X):
    """
    Remove 1-2 letter words 
    :param X: Pandas column of raw strings
    :return X: Pandas column of cleaned strings
    """    
    X = list(map(lambda x: re.sub(r'\b\w{1,2}\b',' ', x), X)) # remove 1-2 letter words 
    return(X)

def remove_punctuation(X, remove_tildas = True, remove_numerics = False):
    """
    Replace all non-alphanumeric characters from a string, replacing them with a space. 
    Option as to whether to remove tildas (~) or not.
    :param X: Pandas column of raw strings
    :param remove_tildas: Boolean, whether to remove tildas or not
    :return X: Pandas column of cleaned strings
    """    
    if remove_tildas and remove_numerics:
        X = list(map(lambda x: re.sub(r'[^a-zA-Z\s]',' ', x), X)) # replace all non-alphanumeric characters with a space
    elif remove_tildas:
        X = list(map(lambda x: re.sub(r'[^\w\s]',' ', x), X)) # replace all non-alphanumeric characters with a space
    elif remove_numerics:
        X = list(map(lambda x: re.sub(r'[^a-zA-Z\s\~]',' ', x), X)) # replace all non-alphabet characters except tildas with a space        
    else:
        X = list(map(lambda x: re.sub(r'[^\w\s\~]',' ', x), X)) # replace all non-alphanumeric characters except tildas with a space
    return(X)

def stem_strings(X):
    """
    Remove English stopwords from string (as defined by SnowballStemmer)
    :param X: Pandas column of raw strings
    :return X: Pandas column of cleaned strings
    """    
    stemmer = SnowballStemmer("english", ignore_stopwords=False)
    X = [\
        [' '.join([stemmer.stem(word) for word in x.split()])][0]\
         for x in np.array(X).astype(str)
        ]
    return(X)   


def clean_strings(X
                  , to_latin_str = True
                  , replace_accents_str = True
                  , remove_short_strings_str = True
                  , remove_tildas_str = True
                  , stemming_str = True
                  , verbose = False):
    """
    Combination of functions for a list of strings: 
    - Removes non-European characters
    - Replaces accented characters with non-accented characters
    - Removes 1-2 letter words
    - Replaces non-alpha-numeric characters with whitespace
    - Removes English stopwords from string (as defined by SnowballStemmer)
    :param X: Pandas column of strings
    :param remove_tildas: Boolean, whether to remove tildas or not
    :param stemming: Boolean, whether to stem the words or not (do not use before translating)
    :param verbose: whether to print when it finishes/comments
    :return X: Dataframe of labelled data
    """
    if to_latin_str: # remove chinese characters, keep accented european characters
        X = to_latin(X) 
    if replace_accents_str: # replace accented characters with non-accented characters
        X = replace_accents(X) 
    if remove_short_strings_str:
        X = remove_short_strings(X) # remove 1-2 letter words 
    if remove_tildas_str:
        X = remove_punctuation(X, remove_tildas = remove_tildas_str) # remove non-alpha-numeric characters (potentially except tildas)
    if stemming_str:
        X = stem_strings(X) # remove English stopwords from string (as defined by SnowballStemmer)
    if verbose:
    	print(time.strftime('%d/%m/%Y %H:%M:%S') + ' Menu item strings cleaned')
    return(X)    

## Tf-idf

In [4]:
class custom_tfidf(BaseEstimator, TransformerMixin):
    
    def __init__(self
                 , X
                 , string_delimeter = ' ~~ '
                 , ngram_range= (1,2)
                 , min_df = 1
                 , max_df = 1.0
                 , norm = 'l2'
                 , use_idf = True
                 , smooth_idf = True
                 , sublinear_tf = True):
        """
        Stems and create vocabulary based on delimeter; run text-frequency inverse-document-frequency (tf-idf) 

        :param X: array of strings
        :param string_delimeter: array of strings
        :param ngram_range: tuple of how many words to include in each tokenisation (e.g. (1,1) is unigram, (1,2) is bigram etc). See sklearn.feature_extraction.text.CountVectorizer for more details)
        :param min_df: minimum # documents a word must be found in to include the word in the dictionary (see sklearn.feature_extraction.text.CountVectorizer for more details)
        :param max_df: maximum # documents a word must be found in to include the word in the dictionary (see sklearn.feature_extraction.text.CountVectorizer for more details)
        :param norm: array of rx_ids
        :param use_idf: boolean, whether to use inverse-document-frequency in addition to just text-frequency  
        :param smooth_idf: boolean, whether to add one to denominator in idf step to prevent div/0 errors (see sklearn.feature_extraction.text.CountVectorizer for more details)
        :param sublinear_tf: boolean, whether to also log-transform the text-frequency step (see sklearn.feature_extraction.text.CountVectorizer for more details) 
        """
        self.train_X = X
        self.string_delim = string_delimeter
        self.ngram = ngram_range
        self.min_df = min_df
        self.max_df = max_df
        self.norm = norm
        self.use_idf = use_idf
        self.smooth_idf = smooth_idf
        self.sublinear_tf = sublinear_tf
        
    def fit(self, *_): # kwargs # fit()
        """
        Create vocabulary dictionary (prevents bi+-gramming over seperate documents)
        """
        split_strings = list(itertools.chain.from_iterable([re.split(self.string_delim, x) for x in self.train_X]))
        stemmed = clean_strings(split_strings)
        count_vec = CountVectorizer(strip_accents = 'unicode'
                                    , analyzer = 'word'
                                    , stop_words = 'english'
                                    , lowercase = True
                                    , min_df = self.min_df , max_df = self.max_df
                                    , ngram_range = self.ngram)
        vocab = count_vec.fit(stemmed)
        self.vocab = vocab.vocabulary_
        return(self)
    
    def transform(self, X): # kwargs # transform
        """
        Run tf-idf using vocab
        """        
        stemmed = clean_strings(X)
        tfidf_vec = TfidfVectorizer(strip_accents = 'unicode'
                                    , analyzer = 'word'
                                    , stop_words = 'english'
                                    , lowercase = True
                                    , min_df = self.min_df , max_df = self.max_df
                                    , ngram_range = self.ngram
                                    , vocabulary = self.vocab
                                    , norm = self.norm
                                    , use_idf = self.use_idf
                                    , smooth_idf =  self.smooth_idf
                                    , sublinear_tf = self.sublinear_tf)
        vectorized_matrix = tfidf_vec.fit_transform(stemmed)
        return(vectorized_matrix)
    
def tokenisation(train_X, validation_X
                 , string_delimeter = ' ~~ '
                 , ngram_range= (1,2)
                 , min_df = 1
                 , max_df = 1.0
                 , norm = 'l2'
                 , use_idf = True
                 , smooth_idf = True
                 , sublinear_tf = True):
    ct = custom_tfidf(train_X)
    a = ct.fit_transform(train_X)
    b = ct.transform(validation_X)
    return({'fit': ct, 'train_X': a, 'validation_X': b})    

## Dimensionality reduction

In [5]:
def dimensionality_reduction(train_X, validation_X
                             , n_comp = None):
    if n_comp == None:
        return({"train_X": train_X, "validation_X": validation_X})
    else:
        c = TruncatedSVD(n_components=n_comp).fit(train_X)
        d = c.fit_transform(train_X)
        d2 = c.transform(validation_X)
        return({'fit': c, 'train_X': d, 'validation_X': d2})

## Classification

In [6]:
def interim_results(y, y_pred):
    """
    Assess performance of y_hat vs y

    :param y: array of actual labels
    :param y_pred: array of predicted labels
    :return z: pandas DataFrame, specifying precision, recall and f1 score
    """
    z = pd.DataFrame({'class': y.sort_values().unique()
                      ,'precision': precision_recall_fscore_support(y, y_pred, warn_for = ())[0]
                      ,'recall': precision_recall_fscore_support(y, y_pred, warn_for = ())[1]
                      ,'f1_score': precision_recall_fscore_support(y, y_pred, warn_for = ())[2]                      
                     })
    return(z)    

def classification(train_X, train_y, validation_X, validation_y
                   , classifier = [LinearSVC(class_weight = 'balanced')]):
    e = classifier.fit(X = train_X, y = train_y)
    y_hat = e.predict(X = validation_X)
    results = interim_results(validation_y, y_hat)
    mean_f1_score = np.mean(results['f1_score'])
    return({'fit': e, 'prediction': y_hat, 'results': results, 'mean_f1_score': mean_f1_score})    

## Pipeline architecture

In [7]:
def cross_val(X, y, n_splits, shuffle = False):
    splits = dict()
    counter = 0
    skf = StratifiedKFold(n_splits=n_splits)
    for train_index, cv_index in skf.split(X = X, y = y):
        train_X = X.iloc[train_index,].copy()
        train_y = y.iloc[train_index,].copy()
        validation_X = X.iloc[cv_index,].copy()
        validation_y = y.iloc[cv_index,].copy()
        splits[counter] = {'train_X': train_X, 'train_y': train_y, 
                           'validation_X': validation_X, 'validation_y': validation_y}
        counter += 1
    return(splits)

def custom_text_class_pipeline(X
                               , y
                               , cv_splits=2
                               , ngram = [(1,2)]
                               , min_df = [1], max_df = [1.0]
                               , norm = ['l2']
                               , use_idf = [True]
                               , dim_reduc = [50]
                               , classifiers = [LinearSVC(class_weight = 'balanced')]):
    """
    Run pipeline

    :param X: array of strings to train on
    :param y: array of correct labels
    :param cv_splits: number of cross-validation splits to run
    :param ngram_range: tuple of how many words to include in each tokenisation (e.g. (1,1) is unigram, (1,2) is bigram etc). See sklearn.feature_extraction.text.CountVectorizer for more details)
    :param min_df: minimum # documents a word must be found in to include the word in the dictionary (see sklearn.feature_extraction.text.CountVectorizer for more details)
    :param max_df: maximum # documents a word must be found in to include the word in the dictionary (see sklearn.feature_extraction.text.CountVectorizer for more details)
    :param norm: array of rx_ids
    :param use_idf: boolean, whether to use inverse-document-frequency in addition to just text-frequency  
    :param smooth_idf: boolean, whether to add one to denominator in idf step to prevent div/0 errors (see sklearn.feature_extraction.text.CountVectorizer for more details)
    :param sublinear_tf: boolean, whether to also log-transform the text-frequency step (see sklearn.feature_extraction.text.CountVectorizer for more details) 
    """
    params = {'ngram': ngram, 'min_df': min_df, 'max_df': max_df, 'norm': norm, 'use_idf': use_idf}
    params = pd.DataFrame(list(itertools.product(*params.values())), columns = params.keys())
    
    cross_validation = cross_val(X, y, n_splits=cv_splits)
    
    pbar = tqdm(total=params.shape[0]*len(dim_reduc)*len(classifiers)*cv_splits)
    
    all_results = pd.DataFrame()
    # all_results = pd.DataFrame({ 'k-fold': [""], 'tf-idf': [""], 'tf-idf parameters':[""], 'dimensionality reduction': [""], 'dim_reduc paramters': [""], 'classifier': ""}, )

    for k in cross_validation:
        for row in range(params.shape[0]):
            X = tokenisation(cross_validation[k]['train_X']
                             , cross_validation[k]['validation_X'] 
                             , ngram_range = params['ngram'][row]
                             , min_df = params['min_df'][row]
                             , max_df = params['max_df'][row]
                             , norm = params['norm'][row]
                             , use_idf = params['use_idf'][row])
            for comp in dim_reduc:
                X2 = dimensionality_reduction(X['train_X'], X['validation_X']
                                              , n_comp = comp)
                for j in classifiers:
                    classifier_result = classification(X2['train_X']
                                                       , cross_validation[k]['train_y']
                                                       , X2['validation_X']
                                                       , cross_validation[k]['validation_y']
                                                       , classifier = j)
                    everything = {'k-fold': k
                                  , 'tf-idf': copy.deepcopy(X['fit'])
                                  , 'tf-idf parameters': params.loc[row,:]
                                  , 'dimensionality reduction': copy.deepcopy(X2['fit'])
                                  , 'dim_reduc paramters': comp
                                  , 'classifier': copy.deepcopy(classifier_result['fit'])
                                  , 'results': classifier_result['results']
                                  , 'validation_y_hat': classifier_result['prediction']
                                  , 'train_X': cross_validation[k]['train_X'] 
                                  , 'validation_X': cross_validation[k]['validation_X'] 
                                  , 'train_y': cross_validation[k]['train_y'] 
                                  , 'validation_y': cross_validation[k]['validation_y']                                   
                                  , 'mean_f1_score': np.mean(classifier_result['mean_f1_score'])}
                    all_results = all_results.append(everything, ignore_index = True)
                    pbar.update()
    pbar.close()
    return(all_results)

def extrapolate(X, results_row):
    return(extrapolation.classifier.predict(
           extrapolation['dimensionality reduction'].transform(
           extrapolation['tf-idf'].transform(all_data.translated)
           )))

# First pass with data

In [33]:
df = pd.read_clipboard(sep = '\t')
df.fillna('', inplace=True)
df.columns = [re.sub('[^a-zA-Z0-9]+', '_', x.lower()) for x in df.columns]
df2 = df.loc[df.body != '',:]

In [34]:
df2.gene_therapy_.value_counts()

False    64
True     35
Name: gene_therapy_, dtype: int64

In [35]:
results = custom_text_class_pipeline(X = df2.body.map(str)
                                     , y = df2.gene_therapy_
                                     , cv_splits = 5
                                     , dim_reduc = [50, 100, 150])

100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


In [36]:
results['mean_f1_score']

0     0.523810
1     0.561129
2     0.561129
3     0.375000
4     0.375000
5     0.375000
6     0.641577
7     0.641577
8     0.641577
9     0.943020
10    0.943020
11    0.943020
12    0.840336
13    0.834783
14    0.834783
Name: mean_f1_score, dtype: float64

In [37]:
results.loc[9,'results']

Unnamed: 0,class,f1_score,precision,recall
0,False,0.962963,0.928571,1.0
1,True,0.923077,1.0,0.857143


In [63]:
pd.concat([pd.Series(results.loc[9, 'validation_X']).reset_index(drop = True)
           , pd.Series(results.loc[9, 'validation_y']).reset_index(drop = True)
           , pd.Series(results.loc[9, 'validation_y_hat']).reset_index(drop = True)]
          , axis=1).to_csv('results_v1.csv')

In [None]:
def extrapolate(X, results_row):
    return(extrapolation.classifier.predict(
           extrapolation['dimensionality reduction'].transform(
           extrapolation['tf-idf'].transform(all_data.translated)
           )))

In [None]:
extrapolate(df.ab)