### 1. Read Input

In [1]:
import textacy

# Read the JSON-lines training file and materialize the records as a list.
# The reader produces records lazily, so without list() the first loop over
# `training_records` would exhaust it and any re-run of a later cell would
# silently see no data.
training_records = list(
    textacy.io.read_json("datasets/five_training_data.json", lines=True)
)

### 2. List of Text and Labels

In [2]:
# Collect (text, label) pairs in a single pass over the records, then split
# them into the parallel lists expected by the pipeline's fit().
# NOTE(review): a single pass is kept on purpose; `training_records` may be a
# one-shot iterator, in which case a second loop would see nothing -- confirm.
labelled_records = [
    (record["text"], record["class"]) for record in training_records
]

X_train = [text for text, _ in labelled_records]
y_train = [label for _, label in labelled_records]

#print(X_train)
#print(y_train)


#### 2.1 Tokenizing data with spaCy

In [3]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Module-level objects read by spacy_tokenizer().

# ASCII punctuation characters as one string; membership tests against it are
# therefore substring tests.
punctuations = string.punctuation

# English stop words (the same STOP_WORDS object imported above).
stop_words = STOP_WORDS

# Loaded spaCy 'en' model. NOTE(review): not referenced by the visible cells.
nlp = spacy.load('en')

# English language object used by spacy_tokenizer() to split text into tokens.
parser = English()

def spacy_tokenizer(sentence):
    """Split `sentence` into a list of normalized token strings.

    The text is run through the module-level `parser`. Each token is
    represented by its lemma (lower-cased and stripped of surrounding
    whitespace), except when the lemma is the "-PRON-" placeholder, in which
    case the token's own lower-cased text is used. Strings found in
    `stop_words` or in `punctuations` are left out of the result.

    Because `punctuations` is a string, the second check is a substring test,
    which also removes empty strings.
    """
    kept = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            normalized = token.lower_
        else:
            normalized = token.lemma_.lower().strip()

        # Skip stop words and punctuation (see docstring for the latter).
        if normalized in stop_words or normalized in punctuations:
            continue
        kept.append(normalized)

    return kept

### 3. transforms(): Extract Noun Phrases from Text

Custom transformer function

In [4]:
from sklearn.base import TransformerMixin

class transforms(TransformerMixin):
    """Stateless pipeline step that applies transform_text() to every item."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the instance itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with transform_text() applied to each element of X."""
        return list(map(transform_text, X))

    def get_params(self, deep=True):
        """This step has no parameters, so always return an empty dict."""
        return {}

def transform_text(text):
    """Return the noun chunks of `text` joined into one string.

    `text` is converted with str() and turned into a spaCy doc through
    textacy (with a metadata dict holding an empty "class" entry). The noun
    chunks are extracted with determiners dropped and a minimum frequency of
    1. Inside each chunk spaces are replaced by underscores, and the chunks
    are joined with single spaces because the downstream vectorizer expects
    one string per document.
    """
    record = (str(text), {"class": ""})
    doc = textacy.make_spacy_doc(record)

    chunks = textacy.extract.noun_chunks(
        doc,
        drop_determiners=True,
        min_freq=1,
    )

    return ' '.join(str(chunk).replace(' ', '_') for chunk in chunks)

### 4. Vectorizer (Bag of Words / TF-IDF)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words counts over unigrams, tokenized with spacy_tokenizer.
bow_vector = CountVectorizer(
    ngram_range=(1, 1),
    tokenizer=spacy_tokenizer,
)

# TF-IDF alternative built on the same tokenizer.
# NOTE(review): the visible pipeline uses bow_vector, not this one.
tfidf_vector = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
)

### 5. Classifier for Training (Logistic Regression)

In [6]:
from sklearn.linear_model import LogisticRegression
# Multinomial logistic regression fitted with the newton-cg solver.
classifier = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
)

### 6. Training Pipeline

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

# Training pipeline: noun-phrase extraction -> bag-of-words counts -> classifier.
# (A stray `transforms()` call that built and discarded an instance was removed.)
pipe = Pipeline([
    ("transformer", transforms()),
    ("vectorizer", bow_vector),
    ("classifier", classifier),
])

# Fit on the training texts and labels. Kept as the last expression of the
# cell so the notebook displays the fitted Pipeline.
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('transformer', <__main__.transforms object at 0x7f6b0eb29860>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...ty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])

In [8]:
from joblib import dump

# Persist the pipeline to disk; the call's return value is shown as the cell output.
dump(value=pipe, filename='pipeUSECASE.joblib')

['pipeUSECASE.joblib']

### 7. Classification

Read Test data

In [9]:
test_records = textacy.io.read_json("datasets/five_test_data.json", lines=True)

# Collect (text, label) pairs in a single pass over the records, then split
# them into the parallel lists used for prediction and scoring.
labelled_test_records = [
    (record["text"], record["class"]) for record in test_records
]

X_test = [text for text, _ in labelled_test_records]
y_test = [label for _, label in labelled_test_records]

#print(X_test)
#print(y_test)

In [10]:
from joblib import load
# Reload the pipeline from the file written by the earlier dump() cell.
# NOTE(review): joblib files are pickle-based; only load files you trust.
pipes = load(filename='pipeUSECASE.joblib')

# Predict a class for every test text.
preds = pipes.predict(X_test)

#print("results:")
#for (sample, pred) in zip(X_test, preds):
#    print (sample, ":", pred)

#predicted = pipe.predict(X_test)

results:
 Abstract Background  Non-small cell lung cancer (NSCLC) is a heterogeneous group of disorders with a number of genetic and proteomic alterations. c-CBL is an E3 ubiquitin ligase and adaptor molecule important in normal homeostasis and cancer. We determined the genetic variations of c-CBL, relationship to receptor tyrosine kinases (EGFR and MET), and functionality in NSCLC.  Methods and Findings  Using archival formalin-fixed paraffin embedded (FFPE) extracted genomic DNA, we show that c-CBL mutations occur in somatic fashion for lung cancers. c-CBL mutations were not mutually exclusive of MET or EGFR mutations; however they were independent of p53 and KRAS mutations. In normal/tumor pairwise analysis, there was significant loss of heterozygosity (LOH) for the c-CBL locus (22%, n = 8/37) and none of these samples revealed any mutation in the remaining copy of c-CBL. The c-CBL LOH also positively correlated with EGFR and MET mutations observed in the same samples. Using select 

### 8. Evaluate on test data

In [13]:
from sklearn import metrics

# Fraction of test records whose predicted class equals the true class.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=preds)
print("Logistic Regression Accuracy: ", accuracy)
#print("Logistic Regression Precision: ", metrics.precision_score(y_test, predicted))
#print("Logistic Regression Recall: ", metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy:  0.0


***Questions***:

1. How can this be executed on an HPC system, e.g. to train a new model for the corpus?
2. Is there a maximum length for the text that the spaCy `nlp` object can process?
3. Can we use the noun phrases to clean the text?