# Setup

### Import libraries:

In [1]:
########################################
# General:
########################################

import pandas as pd, numpy as np, time

########################################
# String cleaning:
########################################
import unidecode, re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

########################################
# Embedding:
########################################

from gensim.utils import tokenize as tk
from gensim.models import Word2Vec as w2v, FastText as ft, Doc2Vec as d2v, phrases as bigram
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer 
from sklearn.decomposition import TruncatedSVD, NMF # PCA will not work on sparse matricies

### Data import

In [2]:
df = pd.read_csv('csv/abstracts_concat.csv', index_col = 'pubmed_paper_id')

In [3]:
df = df.loc[~df['abstract'].isna(),:]

### Text cleaning

In [4]:
# corpus = [list(tk(i, lowercase=True, deacc=True)) #  encoding='utf8'
#           for i in df['abstract']]
# df['corpus'] = corpus

In [5]:
########################################
## Clean text strings function
########################################

def remove_short_strings(X, max_characters = 2, lower_case_only = True):
    """
    Remove 1-2 letter words in list
    :param X: List of raw strings
    :param max_characters: Maximum size of string to remove
    :return X: List of cleaned strings
    """    
    if lower_case_only:
        regex_string = r'\b[a-z]{1,%s}\b' % (max_characters)    
    else:
        regex_string = r'\b\w{1,%s}\b' % (max_characters)
    X = list(map(lambda x: re.sub(regex_string,' ', x), X)) # remove 1-2 letter words 
    return(X)

def to_lower(X):
    """
    Remove non-European characters whilst keeping accented european characters in list
    :param X: List of raw strings
    :return X: List of cleaned strings
    """
    X = list(map(lambda x: x.lower(), X))
    return(X)

def to_latin(X):
    """
    Remove non-European characters whilst keeping accented european characters in list
    :param X: List of raw strings
    :return X: List of cleaned strings
    """
    X = list(map(lambda x: x.encode("latin1", errors="ignore").decode('latin1'), X))
    return(X)

def replace_accents(X):
    """
    Replace accented characters with non-accented characters in list
    :param X: List of raw strings
    :return X: List of cleaned strings
    """
    X = list(map(lambda x: unidecode.unidecode(x), X))
    return(X)

def remove_punctuation_regex(remove_tildas = True, remove_numerics = False):
    """
    Write regex to replace all non-alphanumeric characters, replacing them with a space.
    Option as to whether to remove tildas (~) or numerical values not.
    :param remove_tildas: Boolean, whether to remove tildas or not
    :param remove_numerics: Boolean, whether to remove numerics or not    
    :return X: Regex to do this
    """    
    if remove_tildas and remove_numerics:
        regex_string = r'[^a-zA-Z\s]' # replace all non-alphabet characters with a space
    elif remove_tildas:
        regex_string = r'[^\w\s]' # replace all non-alphanumeric characters with a space
    elif remove_numerics:
        regex_string = r'[^a-zA-Z\s\~]' # replace all non-alphabet characters except tildas with a space        
    else:
        regex_string = r'[^\w\s\~]' # replace all non-alphanumeric characters except tildas with a space
    return(regex_string)

def remove_punctuation(X, regex_string):
    """
    Replace all non-alphanumeric characters in a list of strings, replacing them with a space. 
    Option as to whether to remove tildas (~) or not.
    :param X: List of raw strings
    :param remove_tildas: Boolean, whether to remove tildas or not
    :return X: List of cleaned strings
    """    
    X = list(map(lambda x: re.sub(regex_string,' ', x), X)) 
    return(X)

def tokenise(X, delimeter = None):
    """
    Returns list of lists of strings split by the delimeter
    :param X: List of strings
    :param delimeter: Delimeter to split by
    :return X: List of list of strings
    """
    X = list(map(lambda x: x.split(sep = delimeter), X))
    return(X)

def remove_stopwords(X, stop_words = stopwords.words('english')):
    """
    Returns list of lists of strings split by the delimeter
    :param X: List of strings
    :param delimeter: Delimeter to split by
    :return X: List of list of strings
    """
    # X = list(map(lambda x: [w if w not in stop_words else None for w in x], X))
    X = list(map(lambda x: [w for w in x if w not in stop_words ], X))
    return(X)
    # https://stackoverflow.com/questions/4260280/if-else-in-a-list-comprehension

def stem_strings(X):
    """
    Stems words (shorten algorithmically) them (as defined by SnowballStemmer)
    :param X: List of raw strings
    :return X: List of cleaned strings
    """
    X = list(map(lambda x: SnowballStemmer("english", ignore_stopwords=False).stem(x), X))
    return(X)  

def lemmatize_strings(X, pos = "v"):
    """
    Lemmatize list of strings (as defined by WordNetLemmatizer)
    :param X: List of raw strings
    :param pos: Pos parameter to feed into WordNetLemmatizer().lemmatize function
    :return X: List of cleaned strings
    """
    X = list(map(lambda x: WordNetLemmatizer().lemmatize(x, pos=pos), X))
    return(X)  

def clean_strings(X
                  , remove_short_str_max_char = 2
                  , to_lower_str = True
                  , to_latin_str = True
                  , replace_accents_str = True
                  , regex_string =  r'[^a-zA-Z\s-]' # remove_punctuation_regex(True, False)
                  , tokenise_delimeter = None
                  , stop_words = ''
                  , stemming_str = False
                  , lemma_str = False
                  , lemma_pos = "v"
                  , verbose = False):
    """
    Combination of functions for a list of strings: see parameters
    - Replaces non-alpha-numeric characters with whitespace
    - Remove english stopwords from and strings and stems them (as defined by SnowballStemmer)
    - Lemmatizes english strings (as defined by WordNetLemmatizer) 
    :param X: List of strings
    :param remove_short_str: Numeric, size of small words to remove (if set to 0, no words are removed)
    :param to_latin_str: Boolean, whether to remove non-European characters whilst keeping accented european characters from pandas column
    :param replace_accents_str: Boolean, whether to replace accented characters with non-accented characters
    :param regex_string: String, can add extra regex to find other characters to remove
    :param tokenise_delimeter: String, determines how to split into tokens. Default = None splits by all whitespace
    :param stop_words: List of stopwords to remove from the tokens
    :param stemming_str: Boolean, whether to stem the words or not (do not use before translating) (as defined by SnowballStemmer)
    :param lemma_str: Boolean, whether to lemmatize the words or not (do not use before translating) (as defined by WordNetLemmatizer)
    :param lemma_pos: String, pos parameter to feed into WordNetLemmatizer().lemmatize function
    :param verbose: whether to print when it finishes/comments
    :return X: Dataframe of labelled data
    """
    if remove_short_str_max_char > 0:
        X = remove_short_strings(X, remove_short_str_max_char) # remove 1-2 letter words     
    if to_lower_str: # remove chinese characters, keep accented european characters
        X = to_lower(X)     
    if to_latin_str: # remove chinese characters, keep accented european characters
        X = to_latin(X) 
    if replace_accents_str: # replace accented characters with non-accented characters
        X = replace_accents(X) 
    X = remove_punctuation(X, regex_string)
    X = tokenise(X, tokenise_delimeter)
    X = remove_stopwords(X, stop_words)
    if stemming_str:
        X = list(map(stem_strings, X)) # remove English stopwords from string (as defined by SnowballStemmer)
    if lemma_str:
        X = list(map(lambda x: lemmatize_strings(x, lemma_pos), X)) # remove English stopwords from string (as defined by SnowballStemmer)        
    if verbose:
        print(time.strftime('%d/%m/%Y %H:%M:%S') + ' Menu item strings cleaned')
    return(X)

# https://chrisalbon.com/machine_learning/preprocessing_text/remove_stop_words/

In [6]:
# CLEAN AND ABSTRACTS INTO STEMMED TOKENS
df['clean'] = clean_strings(df.loc[:, 'abstract']
                            , regex_string='[^a-zA-Z\\\\s]'
                            , stop_words = stopwords.words('english')
                            , verbose = True)
df['stem'] = list(map(stem_strings, df['clean']))

30/05/2019 12:51:24 Menu item strings cleaned


In [7]:
# CREATE MAPPING BETWEEN STEMMED TOKENS AND CLEANED WORDS
unique_words = list(set([item for sublist in df['clean'] for item in sublist]))
stemmed_words = stem_strings(unique_words)
stem_to_word = dict(zip(unique_words, stemmed_words))

### TF-IDF unigram word-level scores

In [8]:
# df['stem_concat'] = list(map(lambda x: ' '.join(x), df['stem']))
tf_idf = TfidfVectorizer(preprocessor=' '.join
                         # , token_pattern = '[^ ]+'
#                          , token_pattern = '[^a-zA-Z\\\\s]'
                        ) # norm=None # IF WANT ACTUAL WORD COUNTS, RATHER THAN L2
tf_idf_m = tf_idf.fit_transform(df['stem'])
tf_m = tf_idf_m.multiply(1/tf_idf.idf_)

In [9]:
df_vocab = pd.DataFrame(data = (tf_m != 0).sum(axis = 0).tolist()[0]
                        , index = tf_idf.get_feature_names()
                        , columns=['total_docs_inc'])

df_vocab['total_tf'] = tf_m.sum(axis=0).tolist()[0]
df_vocab['avg_nonzero_tf'] = tf_m.sum(axis=0).tolist()[0]/df_vocab['total_docs_inc']
df_vocab['idf'] = tf_idf.idf_
df_vocab['avg_nonzero_tfidf'] = tf_idf_m.sum(axis=0).tolist()[0]/df_vocab['total_docs_inc']
# df_vocab['avg_tfidf'] = tfs.mean(axis=0).tolist()[0]

### TF-IDF tri-gram character-level scores

In [10]:
# df['stem_concat'] = list(map(lambda x: ' '.join(x), df['stem']))
unique_words = list(set([item for sublist in df.clean for item in sublist]))
tf_idf_c = TfidfVectorizer(analyzer='char', ngram_range=(1,3)) # norm=None # IF WANT ACTUAL WORD COUNTS, RATHER THAN L2
tf_idf_c_m = tf_idf_c.fit_transform(unique_words)

In [11]:
def tfidf_c_cosim_function(tf_idf_fit, tf_idf_matrix, vectorized_string = 'steril'):
    vectorized_string = tf_idf_fit.transform([vectorized_string])
    cos_sim_matrix = tf_idf_matrix.multiply(vectorized_string).sum(axis = 1)
    X = {'return_string': unique_words[cos_sim_matrix.argmax()], 'return_cosine_similarity': cos_sim_matrix.max()}
    return(X)

In [12]:
# print(any("taus-29" == s for s in unique_words))
# print(any("tau" == s for s in unique_words))
# tfidf_c_cosim_function(tf_idf_c,tf_idf_c_m,'taus-29')

### Word-2-Vec Cosine-similarity scores

In [13]:
# VECTORIZE WORDS USING WORD-2-VEC
# model = w2v(df['clean'], size=100, window=5, min_count=1, workers=4)
model = w2v(df['stem'], size=100, window=5, min_count=1, workers=4)
word_vectors = model.wv

### Define function to return top X Word-2-Vec cosine similarities and TF-IDF unigram scores

In [14]:
# matching = [s for s in df.abstract if "conducta" in s]

In [15]:
# # DEFINE FUNCTION
# def search_similar(word = 'practicable', number_matches = 10):
#     df_sim = pd.DataFrame(columns = ['search_word', 'search_stem', 'return_stem', 'return_word', 'cosine_similarity', 'total_docs_inc', 'total_tf', 'avg_nonzero_tf', 'idf', 'avg_nonzero_tfidf'])
#     word_to_search = stem_to_word[word]
#     similarity = dict(model.wv.similar_by_vector(word_to_search, topn = number_matches))
#     for i in range(number_matches):
#         token_to_search = list(similarity.keys())[i]
#         X = [list(stem_to_word.keys())[list(stem_to_word.values()).index(token_to_search)]]
#         df_sim.loc[i, ['search_word', 'search_stem', 'return_stem', 'return_word', 'cosine_similarity']] = [word, word_to_search, token_to_search, X[0], list(similarity.values())[i]]
#         df_sim.loc[i,list(df_vocab.columns)] = df_vocab.loc[df_vocab.index == token_to_search,:].values
#     # # X = [list(stem_to_word.keys())[list(stem_to_word.values()).index(token_to_search)]]
#     # # X
#     return(df_sim)

In [16]:
# search_similar('tau', number_matches = 10)

In [17]:
def search_nonexact_similar(word = 'taus-29', number_matches = 10):
    df_sim = pd.DataFrame(columns = ['search_word', 'best_match', 'best_match_tfidf_cosine_similarity', 'search_stem', 'return_stem', 'return_word', 'w2v_cosine_similarity', 'total_docs_inc', 'total_tf', 'avg_nonzero_tf', 'idf', 'avg_nonzero_tfidf'])
    best_match = tfidf_c_cosim_function(tf_idf_c,tf_idf_c_m, word)
    word_to_search = stem_to_word[best_match['return_string']]
    similarity = dict(model.wv.similar_by_vector(word_to_search, topn = number_matches))
    for i in range(number_matches):
        token_to_search = list(similarity.keys())[i]
        X = [list(stem_to_word.keys())[list(stem_to_word.values()).index(token_to_search)]]
        df_sim.loc[i, ['search_word'
                       , 'best_match', 'best_match_tfidf_cosine_similarity'
                       , 'search_stem', 'return_stem', 'return_word'
                       , 'w2v_cosine_similarity']] = \
        [word, best_match['return_string'], best_match['return_cosine_similarity'], word_to_search, token_to_search, X[0], list(similarity.values())[i]]
        df_sim.loc[i,list(df_vocab.columns)] = df_vocab.loc[df_vocab.index == token_to_search,:].values
    # # X = [list(stem_to_word.keys())[list(stem_to_word.values()).index(token_to_search)]]
    # # X
    return(df_sim)

In [18]:
search_nonexact_similar('corisol', number_matches = 10)

Unnamed: 0,search_word,best_match,best_match_tfidf_cosine_similarity,search_stem,return_stem,return_word,w2v_cosine_similarity,total_docs_inc,total_tf,avg_nonzero_tf,idf,avg_nonzero_tfidf
0,corisol,cortisol,0.674237,cortisol,salivari,salivary,0.761935,54,1.09744,0.020323,6.93133,0.140866
1,corisol,cortisol,0.674237,cortisol,corticosteron,corticosterone,0.730851,81,1.28534,0.0158684,6.53194,0.103651
2,corisol,cortisol,0.674237,cortisol,progesteron,progesteron,0.712441,31,0.782391,0.0252384,7.47293,0.188605
3,corisol,cortisol,0.674237,cortisol,cort,cort,0.703391,25,0.934951,0.037398,7.68057,0.287238
4,corisol,cortisol,0.674237,cortisol,tsst,tsst,0.671725,10,0.305349,0.0305349,8.54077,0.260791
5,corisol,cortisol,0.674237,cortisol,diurnal,diurnal,0.658733,51,0.938129,0.0183947,6.98742,0.128531
6,corisol,cortisol,0.674237,cortisol,acth,acth,0.652056,13,0.347264,0.0267126,8.2996,0.221704
7,corisol,cortisol,0.674237,cortisol,prolactin,prolactin,0.650878,23,0.423214,0.0184006,7.76061,0.1428
8,corisol,cortisol,0.674237,cortisol,testosteron,testosterone,0.646234,38,0.842454,0.0221699,7.2751,0.161288
9,corisol,cortisol,0.674237,cortisol,crp,crp,0.638096,35,1.04294,0.0297984,7.35514,0.219172


# Embedding

### TF-IDF (sklearn)

In [4]:
# should create a custom class for tf-idf
tfidf = TfidfVectorizer(analyzer='word'
                        , lowercase=True
                        , strip_accents='unicode'
                        , ngram_range=(1,3)
                        , norm = 'l2'
                        , use_idf=True
                        , smooth_idf = True)
tfs = tfidf.fit_transform(corpus_concat)

In [None]:
# from gensim.test.utils import common_corpus, common_dictionary
# from gensim.sklearn_api import TfIdfTransformer

# model = TfIdfTransformer(dictionary=common_dictionary)
# tfidf_corpus = model.fit_transform(common_corpus)

In [None]:
# from gensim.models.phrases import Phrases
# bigram = Phrases(common_texts)
# common_texts = [bigram[line] for line in common_texts]
# trigram = Phrases(common_texts)
# common_texts = [trigram[line] for line in common_texts]

# # trigram = Phrases(bigram.vocab.keys(), min_count=1, threshold=1)  # train model

## Word2Vec

In [5]:
# need to tune hyperparameters for W2V model
model = w2v(corpus, size=100, window=5, min_count=1, workers=4)
word_vectors = model.wv

In [1]:
########################################
# General:
########################################

import pandas as pd, numpy as np, os, sys, copy
from tqdm import tqdm

########################################
# String cleaning:
########################################
import unidecode, re
from nltk.stem.snowball import SnowballStemmer

########################################
# For the custom pipeline architecture:
########################################
import itertools
# from itertools import chain, itertools
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin # to define class for use in pipeline

##########
# Tokenisation
##########
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 

##########
# Dimensionality reduction
##########
from sklearn.decomposition import TruncatedSVD, NMF # PCA will not work on sparse matricies

##########
# Classification
##########
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier

from sklearn.multiclass import OneVsRestClassifier
from sklearn.calibration import CalibratedClassifierCV

##########
# Pipeline
##########
from sklearn.model_selection import GridSearchCV, StratifiedKFold

##########
# Measuring performance
##########
from sklearn.metrics import average_precision_score, make_scorer, confusion_matrix, precision_recall_fscore_support

In [8]:
model.wv.similar_by_vector('alzheimer', topn = 10)

[('parkinson', 0.7690507173538208),
 ('ad', 0.6871845126152039),
 ('neuropathology', 0.6836362481117249),
 ('huntington', 0.6725811958312988),
 ('prodromal', 0.6661573648452759),
 ('pathology', 0.6650171279907227),
 ('creutzfeldt', 0.6578435897827148),
 ('dementia', 0.6444272994995117),
 ('neuropathological', 0.6371122002601624),
 ('amyloidosis', 0.6302556395530701)]

# EMBEDDING

## TF-IDF

In [4]:
class custom_tfidf(BaseEstimator, TransformerMixin):
    
    def __init__(self
                 , X
                 , string_delimeter = ' ~~ '
                 , ngram_range= (1,2)
                 , min_df = 1
                 , max_df = 1.0
                 , norm = 'l2'
                 , use_idf = True
                 , smooth_idf = True
                 , sublinear_tf = True):
        """
        Stems and create vocabulary based on delimeter; run text-frequency inverse-document-frequency (tf-idf) 

        :param X: array of strings
        :param string_delimeter: array of strings
        :param ngram_range: tuple of how many words to include in each tokenisation (e.g. (1,1) is unigram, (1,2) is bigram etc). See sklearn.feature_extraction.text.CountVectorizer for more details)
        :param min_df: minimum # documents a word must be found in to include the word in the dictionary (see sklearn.feature_extraction.text.CountVectorizer for more details)
        :param max_df: maximum # documents a word must be found in to include the word in the dictionary (see sklearn.feature_extraction.text.CountVectorizer for more details)
        :param norm: array of rx_ids
        :param use_idf: boolean, whether to use inverse-document-frequency in addition to just text-frequency  
        :param smooth_idf: boolean, whether to add one to denominator in idf step to prevent div/0 errors (see sklearn.feature_extraction.text.CountVectorizer for more details)
        :param sublinear_tf: boolean, whether to also log-transform the text-frequency step (see sklearn.feature_extraction.text.CountVectorizer for more details) 
        """
        self.train_X = X
        self.string_delim = string_delimeter
        self.ngram = ngram_range
        self.min_df = min_df
        self.max_df = max_df
        self.norm = norm
        self.use_idf = use_idf
        self.smooth_idf = smooth_idf
        self.sublinear_tf = sublinear_tf
        
    def fit(self, *_): # kwargs # fit()
        """
        Create vocabulary dictionary (prevents bi+-gramming over seperate documents)
        """
        split_strings = list(itertools.chain.from_iterable([re.split(self.string_delim, x) for x in self.train_X]))
        stemmed = clean_strings(split_strings)
        count_vec = CountVectorizer(strip_accents = 'unicode'
                                    , analyzer = 'word'
                                    , stop_words = 'english'
                                    , lowercase = True
                                    , min_df = self.min_df , max_df = self.max_df
                                    , ngram_range = self.ngram)
        vocab = count_vec.fit(stemmed)
        self.vocab = vocab.vocabulary_
        return(self)
    
    def transform(self, X): # kwargs # transform
        """
        Run tf-idf using vocab
        """        
        stemmed = clean_strings(X)
        tfidf_vec = TfidfVectorizer(strip_accents = 'unicode'
                                    , analyzer = 'word'
                                    , stop_words = 'english'
                                    , lowercase = True
                                    , min_df = self.min_df , max_df = self.max_df
                                    , ngram_range = self.ngram
                                    , vocabulary = self.vocab
                                    , norm = self.norm
                                    , use_idf = self.use_idf
                                    , smooth_idf =  self.smooth_idf
                                    , sublinear_tf = self.sublinear_tf)
        vectorized_matrix = tfidf_vec.fit_transform(stemmed)
        return(vectorized_matrix)
    
def tokenisation(train_X, validation_X
                 , string_delimeter = ' ~~ '
                 , ngram_range= (1,2)
                 , min_df = 1
                 , max_df = 1.0
                 , norm = 'l2'
                 , use_idf = True
                 , smooth_idf = True
                 , sublinear_tf = True):
    ct = custom_tfidf(train_X)
    a = ct.fit_transform(train_X)
    b = ct.transform(validation_X)
    return({'fit': ct, 'train_X': a, 'validation_X': b})    

## Dimensionality reduction

In [5]:
def dimensionality_reduction(train_X, validation_X
                             , n_comp = None):
    if n_comp == None:
        return({"train_X": train_X, "validation_X": validation_X})
    else:
        c = TruncatedSVD(n_components=n_comp).fit(train_X)
        d = c.fit_transform(train_X)
        d2 = c.transform(validation_X)
        return({'fit': c, 'train_X': d, 'validation_X': d2})

## Classification

In [6]:
def interim_results(y, y_pred):
    """
    Assess performance of y_hat vs y

    :param y: array of actual labels
    :param y_pred: array of predicted labels
    :return z: pandas DataFrame, specifying precision, recall and f1 score
    """
    z = pd.DataFrame({'class': y.sort_values().unique()
                      ,'precision': precision_recall_fscore_support(y, y_pred, warn_for = ())[0]
                      ,'recall': precision_recall_fscore_support(y, y_pred, warn_for = ())[1]
                      ,'f1_score': precision_recall_fscore_support(y, y_pred, warn_for = ())[2]                      
                     })
    return(z)    

def classification(train_X, train_y, validation_X, validation_y
                   , classifier = [LinearSVC(class_weight = 'balanced')]):
    e = classifier.fit(X = train_X, y = train_y)
    y_hat = e.predict(X = validation_X)
    results = interim_results(validation_y, y_hat)
    mean_f1_score = np.mean(results['f1_score'])
    return({'fit': e, 'prediction': y_hat, 'results': results, 'mean_f1_score': mean_f1_score})    

## Pipeline architecture

In [7]:
def cross_val(X, y, n_splits, shuffle = False):
    splits = dict()
    counter = 0
    skf = StratifiedKFold(n_splits=n_splits)
    for train_index, cv_index in skf.split(X = X, y = y):
        train_X = X.iloc[train_index,].copy()
        train_y = y.iloc[train_index,].copy()
        validation_X = X.iloc[cv_index,].copy()
        validation_y = y.iloc[cv_index,].copy()
        splits[counter] = {'train_X': train_X, 'train_y': train_y, 
                           'validation_X': validation_X, 'validation_y': validation_y}
        counter += 1
    return(splits)

def custom_text_class_pipeline(X
                               , y
                               , cv_splits=2
                               , ngram = [(1,2)]
                               , min_df = [1], max_df = [1.0]
                               , norm = ['l2']
                               , use_idf = [True]
                               , dim_reduc = [50]
                               , classifiers = [LinearSVC(class_weight = 'balanced')]):
    """
    Run pipeline

    :param X: array of strings to train on
    :param y: array of correct labels
    :param cv_splits: number of cross-validation splits to run
    :param ngram_range: tuple of how many words to include in each tokenisation (e.g. (1,1) is unigram, (1,2) is bigram etc). See sklearn.feature_extraction.text.CountVectorizer for more details)
    :param min_df: minimum # documents a word must be found in to include the word in the dictionary (see sklearn.feature_extraction.text.CountVectorizer for more details)
    :param max_df: maximum # documents a word must be found in to include the word in the dictionary (see sklearn.feature_extraction.text.CountVectorizer for more details)
    :param norm: array of rx_ids
    :param use_idf: boolean, whether to use inverse-document-frequency in addition to just text-frequency  
    :param smooth_idf: boolean, whether to add one to denominator in idf step to prevent div/0 errors (see sklearn.feature_extraction.text.CountVectorizer for more details)
    :param sublinear_tf: boolean, whether to also log-transform the text-frequency step (see sklearn.feature_extraction.text.CountVectorizer for more details) 
    """
    params = {'ngram': ngram, 'min_df': min_df, 'max_df': max_df, 'norm': norm, 'use_idf': use_idf}
    params = pd.DataFrame(list(itertools.product(*params.values())), columns = params.keys())
    
    cross_validation = cross_val(X, y, n_splits=cv_splits)
    
    pbar = tqdm(total=params.shape[0]*len(dim_reduc)*len(classifiers)*cv_splits)
    
    all_results = pd.DataFrame()
    # all_results = pd.DataFrame({ 'k-fold': [""], 'tf-idf': [""], 'tf-idf parameters':[""], 'dimensionality reduction': [""], 'dim_reduc paramters': [""], 'classifier': ""}, )

    for k in cross_validation:
        for row in range(params.shape[0]):
            X = tokenisation(cross_validation[k]['train_X']
                             , cross_validation[k]['validation_X'] 
                             , ngram_range = params['ngram'][row]
                             , min_df = params['min_df'][row]
                             , max_df = params['max_df'][row]
                             , norm = params['norm'][row]
                             , use_idf = params['use_idf'][row])
            for comp in dim_reduc:
                X2 = dimensionality_reduction(X['train_X'], X['validation_X']
                                              , n_comp = comp)
                for j in classifiers:
                    classifier_result = classification(X2['train_X']
                                                       , cross_validation[k]['train_y']
                                                       , X2['validation_X']
                                                       , cross_validation[k]['validation_y']
                                                       , classifier = j)
                    everything = {'k-fold': k
                                  , 'tf-idf': copy.deepcopy(X['fit'])
                                  , 'tf-idf parameters': params.loc[row,:]
                                  , 'dimensionality reduction': copy.deepcopy(X2['fit'])
                                  , 'dim_reduc paramters': comp
                                  , 'classifier': copy.deepcopy(classifier_result['fit'])
                                  , 'results': classifier_result['results']
                                  , 'validation_y_hat': classifier_result['prediction']
                                  , 'train_X': cross_validation[k]['train_X'] 
                                  , 'validation_X': cross_validation[k]['validation_X'] 
                                  , 'train_y': cross_validation[k]['train_y'] 
                                  , 'validation_y': cross_validation[k]['validation_y']                                   
                                  , 'mean_f1_score': np.mean(classifier_result['mean_f1_score'])}
                    all_results = all_results.append(everything, ignore_index = True)
                    pbar.update()
    pbar.close()
    return(all_results)

def extrapolate(X, results_row):
    return(extrapolation.classifier.predict(
           extrapolation['dimensionality reduction'].transform(
           extrapolation['tf-idf'].transform(all_data.translated)
           )))

# First pass with data

In [33]:
df = pd.read_clipboard(sep = '\t')
df.fillna('', inplace=True)
df.columns = [re.sub('[^a-zA-Z0-9]+', '_', x.lower()) for x in df.columns]
df2 = df.loc[df.body != '',:]

In [34]:
df2.gene_therapy_.value_counts()

False    64
True     35
Name: gene_therapy_, dtype: int64

In [35]:
results = custom_text_class_pipeline(X = df2.body.map(str)
                                     , y = df2.gene_therapy_
                                     , cv_splits = 5
                                     , dim_reduc = [50, 100, 150])

100%|██████████| 15/15 [00:11<00:00,  1.34it/s]


In [36]:
results['mean_f1_score']

0     0.523810
1     0.561129
2     0.561129
3     0.375000
4     0.375000
5     0.375000
6     0.641577
7     0.641577
8     0.641577
9     0.943020
10    0.943020
11    0.943020
12    0.840336
13    0.834783
14    0.834783
Name: mean_f1_score, dtype: float64

In [37]:
results.loc[9,'results']

Unnamed: 0,class,f1_score,precision,recall
0,False,0.962963,0.928571,1.0
1,True,0.923077,1.0,0.857143


In [63]:
pd.concat([pd.Series(results.loc[9, 'validation_X']).reset_index(drop = True)
           , pd.Series(results.loc[9, 'validation_y']).reset_index(drop = True)
           , pd.Series(results.loc[9, 'validation_y_hat']).reset_index(drop = True)]
          , axis=1).to_csv('results_v1.csv')

In [None]:
def extrapolate(X, results_row):
    return(extrapolation.classifier.predict(
           extrapolation['dimensionality reduction'].transform(
           extrapolation['tf-idf'].transform(all_data.translated)
           )))

In [None]:
extrapolate(df.ab)