In [1]:
import xml.etree.cElementTree as etree
import re
from tqdm import tqdm
import os
import nltk
from bs4 import BeautifulSoup
from html2text import html2text 
import re
import numpy as np
np.random.seed(7)

In [2]:
def clean_html(html):
    """
    Copied from NLTK package (whitespace collapsing made exhaustive).
    Remove HTML markup from the given string.

    Inline ``<script>``/``<style>`` elements and HTML comments are deleted,
    every remaining tag and every ``&nbsp;`` entity is replaced by one space,
    runs of spaces are collapsed to a single space, and the result is stripped
    of leading/trailing whitespace.

    :param html: the HTML string to be cleaned
    :type html: str
    :rtype: str
    """

    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    # Collapse each run of spaces in a single pass. Two successive
    # "two spaces -> one space" substitutions still leave a double space
    # behind whenever five or more spaces are adjacent, which happens as soon
    # as several tags sit next to each other.
    cleaned = re.sub(r" {2,}", " ", cleaned)
    return cleaned.strip()


In [3]:
def loadFiles(directory):
    """
    Return the sorted paths of every entry found directly inside ``directory``.

    :param directory: directory to list; a trailing path separator is optional
    :type directory: str
    :return: ``directory`` joined with each entry name, sorted ascending
    :rtype: list
    """
    # os.path.join inserts the separator only when it is missing, so both
    # "./data/" and "./data" produce valid paths (plain string concatenation
    # silently produced "./datafile.xml" for the latter).
    return sorted(os.path.join(directory, name) for name in os.listdir(directory))

In [4]:
# Training-set directories, one per class. Both are relative to the notebook's
# working directory and are handed to loadFiles() to enumerate the documents.
cancer_file_loc = "./Dataset/train/cancer/"
non_cancer_file_loc = "./Dataset/train/noncancer/" 

In [5]:
# Sorted list of document paths for each class, as returned by loadFiles().
files_cancer = loadFiles(cancer_file_loc)
files_noncancer = loadFiles(non_cancer_file_loc)

In [6]:
# Turn every cancer-class document into a [plain_text, 'c'] record.
cancer = []
for i in tqdm(files_cancer):
    # The context manager closes each handle as soon as the file is read, so
    # handles do not pile up while the whole directory is processed.
    with open(i, 'r') as xmlDoc:
        html = xmlDoc.read()
    # Only the BeautifulSoup text is stored in the record, so no separate
    # clean_html()/html2text() pass is run per file (its result was unused).
    soup = BeautifulSoup(html, "lxml")
    text2 = soup.get_text()
    cancer.append([text2, 'c'])

100%|██████████| 300/300 [00:46<00:00,  6.51it/s]


In [7]:
# Turn every non-cancer document into a [plain_text, 'nc'] record.
noncancer = []
for i in tqdm(files_noncancer):
    # The context manager closes each handle as soon as the file is read, so
    # handles do not pile up while the whole directory is processed.
    with open(i, 'r') as xmlDoc:
        html = xmlDoc.read()
    # Only the BeautifulSoup text is stored in the record, so no separate
    # clean_html()/html2text() pass is run per file (its result was unused).
    soup = BeautifulSoup(html, "lxml")
    text2 = soup.get_text()
    noncancer.append([text2, 'nc'])

100%|██████████| 150/150 [00:13<00:00, 11.43it/s]


In [8]:
import pandas as pd

In [9]:
# One DataFrame per class, built from the [text, label] records collected above.
# NOTE: the column name 'lable' (sic) is looked up under this exact spelling by
# later cells, so it has to be renamed everywhere at once or not at all.
cnT  = pd.DataFrame(cancer,columns=['text','lable'])
NcnT = pd.DataFrame(noncancer,columns=['text','lable'])

In [10]:
# Stack both classes into a single frame, shuffle all rows (frac=1 keeps every
# row; sample() is called without an explicit random_state), then renumber the
# index 0..n-1 so the shuffled order is the positional order.
cm = [cnT, NcnT]
df = (
    pd.concat(cm, ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)

In [11]:
# Preview only the first rows instead of rendering the whole corpus frame.
df.head(10)

Unnamed: 0,text,lable
0,Case Reports ImmunolCase Reports ImmunolCRIICa...,nc
1,Case Reports ImmunolCase Reports ImmunolCRIICa...,nc
2,Cancer Immunol ImmunotherCancer Immunol. Immun...,c
3,Cancer Biol MedCancer Biol MedCBMCancer Biolog...,c
4,Cancer ImagingCancer ImagingCancer Imaging1740...,c
5,Cancer ImagingCancer ImagingCICancer ImagingCa...,c
6,Cancer ImagingCancer ImagingCancer Imaging1740...,c
7,Clin Drug InvestigClin Drug InvestigClinical D...,nc
8,Cancer ImagingCancer ImagingCancer Imaging1740...,c
9,Cancer ImagingCICancer ImagingCancer Imaging17...,c


# SVM

In [12]:
# Build the numeric-label frame as a NEW DataFrame (c -> 1, nc -> 0).
# Binding svm_dataframe to df itself and assigning into its 'lable' column
# would also overwrite df's labels, and running the cell a second time would
# then map the already-numeric labels to NaN. assign() leaves df untouched, so
# this cell can be re-run safely.
svm_dataframe = df.assign(lable=df['lable'].map({"c": 1, "nc": 0}))

In [13]:
# Preview only the first rows to confirm the labels are now 0/1.
svm_dataframe.head(10)

Unnamed: 0,text,lable
0,Case Reports ImmunolCase Reports ImmunolCRIICa...,0
1,Case Reports ImmunolCase Reports ImmunolCRIICa...,0
2,Cancer Immunol ImmunotherCancer Immunol. Immun...,1
3,Cancer Biol MedCancer Biol MedCBMCancer Biolog...,1
4,Cancer ImagingCancer ImagingCancer Imaging1740...,1
5,Cancer ImagingCancer ImagingCICancer ImagingCa...,1
6,Cancer ImagingCancer ImagingCancer Imaging1740...,1
7,Clin Drug InvestigClin Drug InvestigClinical D...,0
8,Cancer ImagingCancer ImagingCancer Imaging1740...,1
9,Cancer ImagingCICancer ImagingCancer Imaging17...,1


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Raw document texts as a NumPy array, in the same row order as svm_dataframe.
holdtext = np.array(svm_dataframe['text'])

In [76]:
# Bag-of-words features, with the vocabulary capped at 5000 terms.
# NOTE(review): the vocabulary is learned from every document, including the
# ones that later end up in the test split. Fitting on the training rows only
# would avoid that leakage, but requires splitting before vectorising.
vect = CountVectorizer(max_features=5000)
# fit_transform learns the vocabulary and builds the document-term matrix in a
# single pass, instead of tokenising the whole corpus once for fit() and again
# for transform().
simple_train_dtm = vect.fit_transform(holdtext)
# Dense copy of the sparse count matrix; the estimators below are fed this array.
std = simple_train_dtm.toarray()

In [77]:
# Feature matrix: the dense token-count array produced by the vectoriser.
X = std
# Target vector: the 'lable' column cast to int, as a NumPy array.
y = np.array(svm_dataframe['lable'].astype('int'))

In [78]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; random_state=4 pins the split so the
# same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=4)

In [79]:
from sklearn import svm

In [80]:
# Support-vector classifier with library defaults only (no kernel, C or gamma
# is chosen here; the repr printed after fitting shows what the defaults were).
clf = svm.SVC()

In [81]:
# Train the SVM on the training split; the cell output is the fitted estimator's repr.
clf.fit(X_train, y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [82]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 

In [83]:
# SVM predictions for the held-out test rows.
result1 = clf.predict(X_test)

In [84]:
# Confusion matrix of the true test labels against the SVM predictions.
confusion_matrix(y_test,result1)

array([[ 16,  30],
       [  0, 103]])

In [85]:
# Fraction of test rows the SVM labelled correctly.
accuracy_score(y_test,result1)

0.79865771812080533

# Boosted tree

In [86]:
import xgboost as xgb

In [87]:
# Gradient-boosted tree classifier with default hyperparameters, trained on the
# same training split as the SVM so the two models can be compared directly.
model_xgboost = xgb.XGBClassifier()
model_xgboost.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [88]:
# Boosted-tree predictions for the held-out test rows.
pred = model_xgboost.predict(X_test)
# accuracy_score's signature is (y_true, y_pred). Accuracy is symmetric, so the
# printed value is unaffected, but passing the ground truth first matches the
# SVM evaluation and stays correct if the metric is swapped for an asymmetric one.
print(accuracy_score(y_test, pred))

0.993288590604


In [89]:
# Confusion matrix of the true test labels against the boosted-tree predictions.
confusion_matrix(y_test,pred)

array([[ 45,   1],
       [  0, 103]])