In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score
from sklearn.impute import SimpleImputer

In [3]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
from functions import *

In [6]:
# Feature subset chosen by the decision-tree analysis in the
# "Decision Tree Analysis" ipython notebook.
# The order of this list is the column order of the feature frame selected with it.
important_columns = [
    'Hormonal Contraceptives (years)',
    'STDs:HIV',
    'Smokes (years)',
    'First sexual intercourse',
    'Smokes (packs/year)',
    'IUD (years)',
    'Dx:HPV',
    'Num of pregnancies',
    'Number of sexual partners',
]

In [7]:
# Read the full feature table, then keep only the pre-selected columns
# (in the order given by important_columns).
# Missing entries show up as the string '?' in this file (see the preview below).
features_raw = pd.read_csv('X_ccrf.csv')
X = features_raw[important_columns]
X.head()

Unnamed: 0,Hormonal Contraceptives (years),STDs:HIV,Smokes (years),First sexual intercourse,Smokes (packs/year),IUD (years),Dx:HPV,Num of pregnancies,Number of sexual partners
0,0.0,0.0,0.0,15.0,0.0,0.0,0,1.0,4.0
1,0.0,0.0,0.0,14.0,0.0,0.0,0,1.0,1.0
2,0.0,0.0,0.0,?,0.0,0.0,0,1.0,1.0
3,3.0,0.0,37.0,16.0,37.0,0.0,1,4.0,5.0
4,15.0,0.0,0.0,21.0,0.0,0.0,0,4.0,3.0


In [8]:
# Names of the feature columns that contain at least one '?' placeholder.
cols_2_impute = []
for column in X.columns:
    # Element-wise comparison; numeric columns simply compare unequal to '?'.
    has_placeholder = (X[column] == '?').any()
    if has_placeholder:
        cols_2_impute.append(column)

In [9]:
# Turn the '?' placeholder into NaN so the imputer can recognise missing values,
# then coerce every column to a numeric dtype. A column that contained '?' cannot
# have been numeric, and leaving it as-is would make the pipelines depend on
# implicit string-to-float conversion inside scikit-learn.
# pd.to_numeric raises on any other non-numeric token, surfacing bad data here
# rather than at fit time.
new_X = X.replace('?', np.nan).apply(pd.to_numeric)

In [10]:
# Target table; each of its columns is modelled as a separate label further down.
y_temp = pd.read_csv(filepath_or_buffer='y_ccrf.csv')

In [11]:
# One list per row holding the string form of the first four target columns.
# Positions are read with .iloc: an integer key on a label-indexed row
# (row[0]) relies on a positional fallback that pandas has deprecated.
# iterrows() also fetches each row once instead of four separate .loc lookups.
y = [
    [str(row.iloc[position]) for position in range(4)]
    for _, row in y_temp.iterrows()
]

In [12]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
# Features and the full target frame are split together so rows stay aligned.
split_parts = train_test_split(
    new_X,
    y_temp,
    test_size=0.40,
    random_state=101,
)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# Target column names, in file order; one model is fitted per entry.
categories = y_temp.columns.tolist()

In [14]:
# Linear SVM pipeline: mean-impute NaNs -> standardise -> one-vs-rest LinearSVC.
SVC_pipeline = Pipeline([
    # Replace NaN entries with the column mean learned from the data passed to fit().
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
    # random_state pins LinearSVC's internal pseudo-random number generation so a
    # Restart & Run All reproduces the same scores; 101 matches the seed used
    # for the train/test split.
    ('clf', OneVsRestClassifier(LinearSVC(max_iter=10000, random_state=101))),
])

In [15]:
# Test-set accuracy of the Linear SVC pipeline, one entry per target column
# (same order as `categories`), rounded to two decimals.
# NOTE(review): only accuracy is reported; if the targets are imbalanced this can
# look high for a trivial model — consider also reporting f1/precision/recall.
SVC_accuracy = []
for category in categories:
    print('... Processing {}'.format(category))
    # Refit the whole pipeline on the training rows for this single target column.
    SVC_pipeline.fit(X_train, y_train[category])
    prediction = SVC_pipeline.predict(X_test)
    # Score once and reuse the value for both the log line and the summary list.
    accuracy = accuracy_score(y_test[category], prediction)
    print('Test accuracy is {}'.format(accuracy))
    SVC_accuracy.append(round(accuracy, 2))

... Processing Hinselmann
Test accuracy is 0.9534883720930233
... Processing Schiller
Test accuracy is 0.9040697674418605
... Processing Citology
Test accuracy is 0.9476744186046512
... Processing Biopsy
Test accuracy is 0.9331395348837209


In [16]:
# Logistic-regression pipeline: mean-impute NaNs -> standardise -> one-vs-rest
# LogisticRegression (lbfgs solver, at most 100 iterations).
logreg_steps = [
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression(max_iter=100, solver='lbfgs'))),
]
LG_pipeline = Pipeline(logreg_steps)

In [17]:
# Test-set accuracy of the logistic-regression pipeline, one entry per target
# column (same order as `categories`), rounded to two decimals.
# NOTE(review): only accuracy is reported; if the targets are imbalanced this can
# look high for a trivial model — consider also reporting f1/precision/recall.
LogReg_Accuracy = []
for category in categories:
    print('... Processing {}'.format(category))
    # Refit the whole pipeline on the training rows for this single target column.
    LG_pipeline.fit(X_train, y_train[category])
    prediction = LG_pipeline.predict(X_test)
    # Score once and reuse the value for both the log line and the summary list.
    accuracy = accuracy_score(y_test[category], prediction)
    print('Test accuracy is {}'.format(accuracy))
    LogReg_Accuracy.append(round(accuracy, 2))

... Processing Hinselmann
Test accuracy is 0.9505813953488372
... Processing Schiller
Test accuracy is 0.9040697674418605
... Processing Citology
Test accuracy is 0.9447674418604651
... Processing Biopsy
Test accuracy is 0.9331395348837209


In [18]:
# Row labels and row data for the summary table; the two lists must stay in
# the same order so each label lines up with its model's scores.
index = ['Linear SVC', 'Logistic Regression']
results = [SVC_accuracy, LogReg_Accuracy]

In [19]:
# Summary table: one row per model, one column per target, values are the
# rounded test accuracies collected above.
df_results = pd.DataFrame(
    data=results,
    index=index,
    columns=categories,
)
df_results.head()

Unnamed: 0,Hinselmann,Schiller,Citology,Biopsy
Linear SVC,0.95,0.9,0.95,0.93
Logistic Regression,0.95,0.9,0.94,0.93
