## Import Google Sheet Training Data

In [1]:
## IMPORT PACKAGES

# help("modules") 
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

import pandas as pd

In [2]:
## SET ENVIRONMENT VARIABLES

# Credentials
# Path to the OAuth client-secrets JSON file handed to InstalledAppFlow
CREDENTIALS = "credentials/credentials.json"

# API data
# OAuth scope(s) requested during authorisation, and the API name/version passed to `build`
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']
API_SERVICE_NAME = 'sheets'
API_VERSION = 'v4'

# Google sheet data
# Spreadsheet to work on, plus the A1-notation ranges for reading and for the (optional) write-back
SPREADSHEET_ID = '158iHeTBUQcVb3spEFajDxyzNV5Bz5O5Oqt0lvuCLGA8'
READ_RANGE_NAME = 'cleandata!A1:N'
WRITE_RANGE_NAME = 'parseddata!A2:Q'

In [3]:
## SET AUTHENTICATION FUNCTION

def get_authenticated_service(secret_file = CREDENTIALS
                              , scopes = SCOPES
                              , api_service_name = API_SERVICE_NAME
                              , api_version = API_VERSION):
    """
    Run the installed-app OAuth console flow and build an API client with the result
    :param secret_file: Path to the OAuth client-secrets JSON file
    :param scopes: List of OAuth scopes to request
    :param api_service_name: Name of the API to build a client for
    :param api_version: Version of that API
    :return: Service object returned by googleapiclient.discovery.build
    """
    oauth_flow = InstalledAppFlow.from_client_secrets_file(secret_file, scopes)
    # run_console() is evaluated first; its credentials are then handed to build()
    return build(api_service_name, api_version, credentials = oauth_flow.run_console())

In [4]:
# Authenticate with the default credentials/scopes and keep the API client for the cells below
service = get_authenticated_service()

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=941605798195-1aa5774dsksops5hkpd6scvmrsb2pveu.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fspreadsheets&state=fWalDK8XQz31pbCC14A024QJG877DE&prompt=consent&access_type=offline&code_challenge=wKouLqXWylnQ7gysnuplbxweTzVS549KMV7q80WnLCA&code_challenge_method=S256
Enter the authorization code: 4/qQGzy5sZCUg0yNDYdY7Cxqg6_BWcA4gnnsu71nnfnGFU0eCI3S8IQ4o


In [5]:
## IMPORT DATA
# Fetch the cell values of READ_RANGE_NAME from the spreadsheet
result = service.spreadsheets().values().get(spreadsheetId=SPREADSHEET_ID
                                             , range=READ_RANGE_NAME).execute()
# Fall back to an empty list when the response carries no 'values' key
values = result.get('values', [])

In [6]:
# Build the frame from the raw rows (the first row holds the column names)
# NOTE(review): values[0] raises IndexError when `values` is empty - confirm the range always has a header row
df = pd.DataFrame(values)
df.columns = values[0]
# Drop the header row from the data; the remaining rows keep their original index labels
df = df.iloc[1:]
# Replace missing cells with empty strings
df = df.fillna('')

## Setup training

### Text cleaning

In [8]:
########################################
# String cleaning:
########################################
import unidecode, re, time
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [9]:
########################################
## Clean text strings function
########################################

def remove_short_strings(X, max_characters = 2, lower_case_only = True):
    """
    Replace every short word in each string with a single space
    :param X: List of raw strings
    :param max_characters: Longest word length (in characters) that is still replaced
    :param lower_case_only: If True only words made purely of a-z are replaced,
                            otherwise any run of word characters is
    :return: List of cleaned strings
    """
    pattern = (r'\b[a-z]{1,%s}\b' if lower_case_only else r'\b\w{1,%s}\b') % (max_characters)
    return [re.sub(pattern, ' ', text) for text in X]

def to_lower(X):
    """
    Lowercase every string in a list
    :param X: List of raw strings
    :return: List of lowercased strings
    """
    return [text.lower() for text in X]

def to_latin(X):
    """
    Drop every character that cannot be encoded as latin1 from each string
    (accented characters that exist in latin1 are kept)
    :param X: List of raw strings
    :return: List of cleaned strings
    """
    cleaned = []
    for text in X:
        # Round-trip through latin1, silently discarding anything unencodable
        latin_bytes = text.encode("latin1", errors="ignore")
        cleaned.append(latin_bytes.decode('latin1'))
    return cleaned

def replace_accents(X):
    """
    Transliterate each string with unidecode (accented characters become unaccented ones)
    :param X: List of raw strings
    :return: List of transliterated strings
    """
    return [unidecode.unidecode(text) for text in X]

def remove_punctuation_regex(remove_tildas = True, remove_numerics = False):
    """
    Pick the regex that matches the characters to be replaced with a space.
    :param remove_tildas: Boolean, whether tildas (~) should be matched (i.e. removed)
    :param remove_numerics: Boolean, whether numerics should be matched (i.e. removed)
    :return: Regex string for use with remove_punctuation
    """
    # Keyed by (remove_tildas, remove_numerics)
    patterns = {
        (True, True): r'[^a-zA-Z\s]',      # everything except letters and whitespace
        (True, False): r'[^\w\s]',         # everything except word characters and whitespace
        (False, True): r'[^a-zA-Z\s\~]',   # as the first, but tildas are kept
        (False, False): r'[^\w\s\~]',      # as the second, but tildas are kept
    }
    return patterns[(bool(remove_tildas), bool(remove_numerics))]

def remove_punctuation(X, regex_string):
    """
    Replace every match of regex_string with a single space in each string
    :param X: List of raw strings
    :param regex_string: Regex describing the characters to replace
                         (e.g. the output of remove_punctuation_regex)
    :return: List of cleaned strings
    """
    return [re.sub(regex_string, ' ', text) for text in X]

def tokenise(X, delimeter = None):
    """
    Split each string into a list of tokens
    :param X: List of strings
    :param delimeter: Delimeter to split by (None splits on runs of whitespace)
    :return: List of lists of strings
    """
    token_lists = []
    for text in X:
        token_lists.append(text.split(sep = delimeter))
    return token_lists

def remove_stopwords(X, stop_words = stopwords.words('english')):
    """
    Drop every token found in stop_words from each token list
    :param X: List of lists of tokens (strings)
    :param stop_words: Stopwords to drop; membership is tested with `in`
    :return: List of lists holding the remaining tokens, order preserved
    """
    kept = []
    for tokens in X:
        kept.append([token for token in tokens if token not in stop_words])
    return kept

def stem_strings(X):
    """
    Stem each word in a list (shorten it algorithmically, as defined by SnowballStemmer)
    :param X: List of words (e.g. the tokens of one document)
    :return: List of stemmed words, in the same order
    """
    # Build the stemmer once and reuse it instead of constructing a new one per word
    stemmer = SnowballStemmer("english", ignore_stopwords=False)
    return [stemmer.stem(word) for word in X]

def lemmatize_strings(X, pos = "v"):
    """
    Lemmatize each word in a list (as defined by WordNetLemmatizer)
    :param X: List of words (e.g. the tokens of one document)
    :param pos: Pos parameter to feed into WordNetLemmatizer().lemmatize
    :return: List of lemmatized words, in the same order
    """
    # Build the lemmatizer once and reuse it instead of constructing a new one per word
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, pos=pos) for word in X]

def clean_strings(X
                  , remove_short_str_max_char = 2
                  , to_lower_str = True
                  , to_latin_str = True
                  , replace_accents_str = True
                  , regex_string =  r'[^a-zA-Z\s-]' # remove_punctuation_regex(True, False)
                  , tokenise_delimeter = None
                  , stop_words = ''
                  , stemming_str = False
                  , lemma_str = False
                  , lemma_pos = "v"
                  , verbose = False):
    """
    Run the text-cleaning pipeline over a list of strings. Steps, in order:
    - (optional) replace short lowercase words with a space
    - (optional) lowercase
    - (optional) drop characters that are not latin1-encodable
    - (optional) replace accented characters with unaccented ones
    - replace every match of regex_string with a space
    - split each string into tokens
    - drop tokens found in stop_words
    - (optional) stem each token (SnowballStemmer)
    - (optional) lemmatize each token (WordNetLemmatizer)
    :param X: List of strings
    :param remove_short_str_max_char: Numeric, longest word length to remove (if 0 or less, no words are removed)
    :param to_lower_str: Boolean, whether to lowercase the strings
    :param to_latin_str: Boolean, whether to drop characters that are not latin1-encodable
    :param replace_accents_str: Boolean, whether to replace accented characters with non-accented characters
    :param regex_string: String, regex whose matches are replaced with a space
    :param tokenise_delimeter: String, determines how to split into tokens. Default = None splits by all whitespace
    :param stop_words: Stopwords to remove from the tokens
    :param stemming_str: Boolean, whether to stem the tokens
    :param lemma_str: Boolean, whether to lemmatize the tokens
    :param lemma_pos: String, pos parameter to feed into WordNetLemmatizer().lemmatize
    :param verbose: Boolean, whether to print a timestamped message when finished
    :return: List of lists of tokens, one list per input string
    """
    cleaned = X

    # String-level steps
    if remove_short_str_max_char > 0:
        # Runs before lowercasing, so short words containing capitals are left alone
        cleaned = remove_short_strings(cleaned, remove_short_str_max_char)
    if to_lower_str:
        cleaned = to_lower(cleaned)
    if to_latin_str:
        cleaned = to_latin(cleaned)
    if replace_accents_str:
        cleaned = replace_accents(cleaned)
    cleaned = remove_punctuation(cleaned, regex_string)

    # Token-level steps
    cleaned = tokenise(cleaned, tokenise_delimeter)
    cleaned = remove_stopwords(cleaned, stop_words)
    if stemming_str:
        cleaned = [stem_strings(tokens) for tokens in cleaned]
    if lemma_str:
        cleaned = [lemmatize_strings(tokens, lemma_pos) for tokens in cleaned]

    if verbose:
        print(time.strftime('%d/%m/%Y %H:%M:%S') + ' Abstract strings cleaned')
    return cleaned

# https://chrisalbon.com/machine_learning/preprocessing_text/remove_stop_words/

In [10]:
# Clean the abstracts: every character that is not a letter or whitespace becomes a space,
# then English stopwords are dropped from the resulting tokens.
# The pattern is a raw string so that \s is the regex whitespace class; written as
# '[^a-zA-Z\\\\s]' it is an escaped backslash plus a literal "s", which leaves
# backslashes in the text.
df['clean'] = clean_strings(df.loc[:, 'abstract']
                            , regex_string = r'[^a-zA-Z\s]'
                            , stop_words = stopwords.words('english')
                            , verbose = True)
# Stem every token of every cleaned abstract
df['stem'] = list(map(stem_strings, df['clean']))

24/08/2019 12:16:45 Abstract strings cleaned


In [11]:
# Keep only abstracts with more than two stemmed tokens (i.e. drop those with two or fewer).
# reset_index() keeps the previous index as a column, so re-running this cell adds another one.
df = df.loc[[len(tokens) > 2 for tokens in df['stem']], :].reset_index()

In [111]:
# # WRITE DATA BACK
# df2 = df
# df2['clean'] = list(map(lambda x: ' '.join(x), df2['clean']))
# df2['stem'] = list(map(lambda x: ' '.join(x), df2['stem']))

# service.spreadsheets().values().update(spreadsheetId=SPREADSHEET_ID
#                                        , range = WRITE_RANGE_NAME
#                                        , valueInputOption = 'RAW'
#                                        , body={'values': df2.values.tolist()}).execute()

## Unsupervised structuring

In [12]:
import numpy as np

from scipy.sparse import csr_matrix, hstack

from gensim.models.doc2vec import TaggedDocument as td
from gensim.models import Doc2Vec as d2v, FastText as ft #, Word2Vec as w2v, phrases as bigram

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.decomposition import TruncatedSVD, NMF # PCA will not work on sparse matricies

from sklearn.base import BaseEstimator, TransformerMixin

In [13]:
class d2v_structure(BaseEstimator, TransformerMixin):

    """Create D2V vectorized features from tokenised text"""

    def fit(self, X, *_, **args):
        """
        Train a Doc2Vec model on X
        :param X: Sequence of token lists, one per document
        :param args: Keyword arguments forwarded to Doc2Vec (e.g. vector_size, window)
        :return: self
        """
        # Tag each document with its position in X
        tagged_docs = [td(tokens, [doc_id]) for doc_id, tokens in enumerate(X)]
        self.d2v_dm = d2v(tagged_docs, **args)
        return self

    def transform(self, X, *_):
        """
        Infer a vector for every document in X with the fitted model
        :param X: Sequence of token lists, one per document
        :return: csr_matrix with one row per document
        """
        inferred = list(map(self.d2v_dm.infer_vector, X))
        return csr_matrix(inferred)

In [14]:
class tfidf_tsvd_structure(BaseEstimator, TransformerMixin):

    """Create TF-IDF vectorized features from text, optionally reduced with TruncatedSVD"""

    def fit(self, X, n_components = None, *_, **args):
        """
        Fit the TF-IDF vectorizer and, when n_components is given, a TruncatedSVD on top of it
        :param X: Documents to fit on
        :param n_components: Number of SVD components, or None to skip the reduction
                             (note: this is the second positional argument, not y)
        :param args: Keyword arguments forwarded to TfidfVectorizer (e.g. norm, ngram_range)
        :return: self
        """
        self.tf_idf = TfidfVectorizer(**args).fit(X)
        self.n_components = n_components
        if n_components is not None:
            tfidf_matrix = self.tf_idf.transform(X)
            self.t_m = TruncatedSVD(n_components).fit(tfidf_matrix)
        return self

    def transform(self, X, *_):
        """
        Vectorize X with the fitted TF-IDF model, applying the fitted SVD when one was requested
        :param X: Documents to transform
        :return: csr_matrix with one row per document
        """
        features = self.tf_idf.transform(X)
        if self.n_components is not None:
            features = self.t_m.transform(features)
        return csr_matrix(features)

In [16]:
# One list of stemmed tokens per abstract
X = list(df['stem'])

# Two Doc2Vec models with the same vector size but different window sizes
d2v_f1 = d2v_structure().fit(X = X, vector_size=150, window=5)
d2v_f2 = d2v_structure().fit(X = X, vector_size=150, window=10)
d2v_t1 = d2v_f1.transform(X = X)
d2v_t2 = d2v_f2.transform(X = X)

# TF-IDF reduced to 200 SVD components; the documents are token lists, so the
# vectorizer's preprocessor joins them back into space-separated strings
tfidf_f = tfidf_tsvd_structure().fit(X, n_components = 200, preprocessor = ' '.join)
tfidf_t = tfidf_f.transform(X = X)

In [17]:
# Stack the three sparse feature blocks side by side and convert to a dense array
training_data = hstack([d2v_t1, d2v_t2,tfidf_t]).toarray()
training_data.shape

(434, 500)

## Supervised learning

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support

In [None]:
def classification(train_X, train_y, validation_X, validation_y
                   , classifier = None):
    """
    Fit a classifier on the training split and score it on the validation split
    :param train_X: Training features
    :param train_y: Training labels
    :param validation_X: Validation features
    :param validation_y: Validation labels
    :param classifier: Estimator exposing fit and predict. When None, a fresh
                       LinearSVC(class_weight = 'balanced') is created for this call
    :return: dict with keys 'fit' (fitted estimator), 'prediction' (validation predictions),
             'results' (per-class DataFrame from interim_results) and 'mean_f1_score'
    """
    # The default used to be a list wrapping the estimator, which has no .fit;
    # build the default estimator here so each call also gets its own instance.
    if classifier is None:
        classifier = LinearSVC(class_weight = 'balanced')
    fitted = classifier.fit(X = train_X, y = train_y)
    y_hat = fitted.predict(X = validation_X)
    results = interim_results(validation_y, y_hat)
    mean_f1_score = np.mean(results['f1_score'])
    return {'fit': fitted, 'prediction': y_hat, 'results': results, 'mean_f1_score': mean_f1_score}

In [None]:
#

In [124]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [125]:
def cross_val(X, y, n_splits):
    """
    Split X and y into stratified cross-validation folds
    :param X: Feature array, indexable by an array of row positions
    :param y: Label array, indexable by an array of row positions
    :param n_splits: Number of folds
    :return: dict keyed by fold number (from 0); each value is a dict with
             'train_X', 'train_y', 'validation_X' and 'validation_y'
    """
    folder = StratifiedKFold(n_splits=n_splits)
    return {
        fold_no: {
            'train_X': X[train_idx].copy(),
            'train_y': y[train_idx].copy(),
            'validation_X': X[val_idx].copy(),
            'validation_y': y[val_idx].copy(),
        }
        for fold_no, (train_idx, val_idx) in enumerate(folder.split(X = X, y = y))
    }

def interim_results(y, y_pred):
    """
    Assess performance of y_pred vs y

    :param y: array of actual labels
    :param y_pred: array of predicted labels
    :return z: pandas DataFrame with one row per class, specifying precision, recall and f1 score
    """
    # Use every label seen in either array, so the 'class' column always has the
    # same length as the score arrays (also when y_pred holds a label absent from y).
    labels = np.union1d(y, y_pred)
    # Compute the scores once and unpack them; support is not reported
    precision, recall, f1_score, _ = precision_recall_fscore_support(y, y_pred
                                                                     , labels = labels
                                                                     , warn_for = ())
    z = pd.DataFrame({'class': labels
                      ,'precision': precision
                      ,'recall': recall
                      ,'f1_score': f1_score
                     })
    return(z)



In [126]:
# Five stratified folds; the target is 1 where the 'neuro' column equals 'Y', otherwise 0
a = cross_val(X = training_data
              , y = np.array((df['neuro'] == 'Y')*1)
              , n_splits = 5)

In [127]:
# Fit and evaluate a class-weighted LinearSVC on the first fold only
results = classification(train_X = a[0]['train_X']
                         , train_y = a[0]['train_y']
                         , validation_X = a[0]['validation_X']
                         , validation_y = a[0]['validation_y']
                         , classifier = LinearSVC(class_weight = 'balanced'))

In [129]:
# Per-class precision / recall / f1 for the first fold
results['results']

Unnamed: 0,class,precision,recall,f1_score
0,0,0.736842,0.5,0.595745
1,1,0.730769,0.883721,0.8


## Pipeline

In [None]:
def single_pass(train_X, train_y, validation_X, validation_y, **kwargs):
    