In [92]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

In [2]:
mut_raw = pd.read_csv('data/TCGA.LUAD.mutations.txt', delimiter='\t')
exp_raw = pd.read_csv('data/TCGA.LUAD.expression.txt', delimiter='\t')
meta_raw = pd.read_csv('data/TCGA.LUAD.metadata.txt', delimiter='\t')
clinical = pd.read_csv('data/clinical.txt', delimiter='\t')
exposure = pd.read_csv('data/exposure.txt', delimiter='\t')

In [4]:
# exp_raw

In [5]:
# meta_raw

In [6]:
labels_raw = pd.read_csv('labels.csv')

labels_raw

Unnamed: 0,patient_id,label
0,TCGA-05-4244,0
1,TCGA-05-4249,1
2,TCGA-05-4382,0
3,TCGA-05-4384,0
4,TCGA-05-4389,1
...,...,...
402,TCGA-NJ-A55O,0
403,TCGA-NJ-A55R,0
404,TCGA-NJ-A7XG,0
405,TCGA-O1-A52J,1


In [7]:
# meta_raw

In [8]:
labels_raw = pd.read_csv('labels.csv')
labels_raw

Unnamed: 0,patient_id,label
0,TCGA-05-4244,0
1,TCGA-05-4249,1
2,TCGA-05-4382,0
3,TCGA-05-4384,0
4,TCGA-05-4389,1
...,...,...
402,TCGA-NJ-A55O,0
403,TCGA-NJ-A55R,0
404,TCGA-NJ-A7XG,0
405,TCGA-O1-A52J,1


The 0 means negative and the 1 means positive in the labels 

## Separate the data in testing and training 

In [9]:
prog = pd.read_csv('prognosis_genes.txt')
prog

Unnamed: 0,GENES
0,BAG1
1,CASP4
2,FADD
3,P63
4,5T4
...,...
93,KIAA0084
94,KIAA0153
95,KIAA0263
96,KIAA0317


In [10]:
merge = pd.merge(mut_raw, labels_raw, on='patient_id')

In [11]:
len(list(merge['patient_id'].unique()))

407

## Merging the labels with the meta data

In [12]:
metaMerge = pd.merge(labels_raw, meta_raw, on='patient_id')
metaMerge = metaMerge.rename(columns={'Unnamed: 0':'Index'})
# list(metaMerge)

In [13]:
metadata_final = pd.read_csv('data/metadata_final.csv')
metadata_final

Unnamed: 0,patient_id,age_at_initial_pathologic_diagnosis,gender,tumor_stage,is_smoker,OS,OS.time
0,TCGA-05-4244,70.0,MALE,Late,True,0,0.0
1,TCGA-05-4249,67.0,MALE,Early,True,0,1523.0
2,TCGA-05-4250,79.0,FEMALE,Late,True,1,121.0
3,TCGA-05-4382,68.0,MALE,Early,True,0,607.0
4,TCGA-05-4384,66.0,MALE,Late,True,0,426.0
...,...,...,...,...,...,...,...
507,TCGA-NJ-A55O,56.0,FEMALE,Early,True,0,13.0
508,TCGA-NJ-A55R,67.0,MALE,Early,False,0,603.0
509,TCGA-NJ-A7XG,49.0,MALE,Late,False,0,617.0
510,TCGA-O1-A52J,74.0,FEMALE,Early,True,1,1798.0


In [14]:
mergedSmoker = pd.merge(labels_raw, metadata_final, on='patient_id')
mergedSmoker = mergedSmoker.rename(columns={'Unnamed: 0':'index'})

In [15]:
mergedSmoker

Unnamed: 0,patient_id,label,age_at_initial_pathologic_diagnosis,gender,tumor_stage,is_smoker,OS,OS.time
0,TCGA-05-4244,0,70.0,MALE,Late,True,0,0.0
1,TCGA-05-4249,1,67.0,MALE,Early,True,0,1523.0
2,TCGA-05-4382,0,68.0,MALE,Early,True,0,607.0
3,TCGA-05-4384,0,66.0,MALE,Late,True,0,426.0
4,TCGA-05-4389,1,70.0,MALE,Early,True,0,1369.0
...,...,...,...,...,...,...,...,...
402,TCGA-NJ-A55O,0,56.0,FEMALE,Early,True,0,13.0
403,TCGA-NJ-A55R,0,67.0,MALE,Early,False,0,603.0
404,TCGA-NJ-A7XG,0,49.0,MALE,Late,False,0,617.0
405,TCGA-O1-A52J,1,74.0,FEMALE,Early,True,1,1798.0


In [16]:
# Getting the features 
X = mergedSmoker[['is_smoker', 'gender']]

# labels
y = mergedSmoker['label']

In [17]:
smoker = mergedSmoker.copy()

In [18]:
# changing the gender to binary male=0, female=1
mergedSmoker['gender'] = mergedSmoker['gender'].replace({'MALE': 0, 'FEMALE': 1})

mergedSmoker

Unnamed: 0,patient_id,label,age_at_initial_pathologic_diagnosis,gender,tumor_stage,is_smoker,OS,OS.time
0,TCGA-05-4244,0,70.0,0,Late,True,0,0.0
1,TCGA-05-4249,1,67.0,0,Early,True,0,1523.0
2,TCGA-05-4382,0,68.0,0,Early,True,0,607.0
3,TCGA-05-4384,0,66.0,0,Late,True,0,426.0
4,TCGA-05-4389,1,70.0,0,Early,True,0,1369.0
...,...,...,...,...,...,...,...,...
402,TCGA-NJ-A55O,0,56.0,1,Early,True,0,13.0
403,TCGA-NJ-A55R,0,67.0,0,Early,False,0,603.0
404,TCGA-NJ-A7XG,0,49.0,0,Late,False,0,617.0
405,TCGA-O1-A52J,1,74.0,1,Early,True,1,1798.0


In [19]:
# getting the features for the model
X = mergedSmoker[['age_at_initial_pathologic_diagnosis', 'gender', 'is_smoker']]

# filling any nans with 0
X['age_at_initial_pathologic_diagnosis'] = X['age_at_initial_pathologic_diagnosis'].fillna(0)

# getting the labels for the model
y = mergedSmoker['label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age_at_initial_pathologic_diagnosis'] = X['age_at_initial_pathologic_diagnosis'].fillna(0)


In [20]:
# checking if there are any nans
X.isna().any()

age_at_initial_pathologic_diagnosis    False
gender                                 False
is_smoker                              False
dtype: bool

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
print(len(X_train))
print(len(X_test))

325
82


In [23]:
classifier = svm.SVC(kernel='linear')
classi = classifier.fit(X_train, y_train)

In [24]:
y_pred = classifier.predict(X_test)

In [25]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5121951219512195


### ^^ This is not that meaningful because we need more features for the model to predict whether this person will live or not 

make the other features 0 and 1 for the binary aspects

also we do not need the patient_id for these 

In [26]:
# list(mut_raw)

In [27]:
# isSmoker/NonSmoker comparison

In [28]:
metadata_final = pd.read_csv('data/metadata_final.csv')
metadata_final

Unnamed: 0,patient_id,age_at_initial_pathologic_diagnosis,gender,tumor_stage,is_smoker,OS,OS.time
0,TCGA-05-4244,70.0,MALE,Late,True,0,0.0
1,TCGA-05-4249,67.0,MALE,Early,True,0,1523.0
2,TCGA-05-4250,79.0,FEMALE,Late,True,1,121.0
3,TCGA-05-4382,68.0,MALE,Early,True,0,607.0
4,TCGA-05-4384,66.0,MALE,Late,True,0,426.0
...,...,...,...,...,...,...,...
507,TCGA-NJ-A55O,56.0,FEMALE,Early,True,0,13.0
508,TCGA-NJ-A55R,67.0,MALE,Early,False,0,603.0
509,TCGA-NJ-A7XG,49.0,MALE,Late,False,0,617.0
510,TCGA-O1-A52J,74.0,FEMALE,Early,True,1,1798.0


In [29]:
mergedSmoker = pd.merge(labels_raw, metadata_final, on='patient_id')
mergedSmoker = mergedSmoker.rename(columns={'Unnamed: 0':'index'})

In [30]:
mergedSmoker

Unnamed: 0,patient_id,label,age_at_initial_pathologic_diagnosis,gender,tumor_stage,is_smoker,OS,OS.time
0,TCGA-05-4244,0,70.0,MALE,Late,True,0,0.0
1,TCGA-05-4249,1,67.0,MALE,Early,True,0,1523.0
2,TCGA-05-4382,0,68.0,MALE,Early,True,0,607.0
3,TCGA-05-4384,0,66.0,MALE,Late,True,0,426.0
4,TCGA-05-4389,1,70.0,MALE,Early,True,0,1369.0
...,...,...,...,...,...,...,...,...
402,TCGA-NJ-A55O,0,56.0,FEMALE,Early,True,0,13.0
403,TCGA-NJ-A55R,0,67.0,MALE,Early,False,0,603.0
404,TCGA-NJ-A7XG,0,49.0,MALE,Late,False,0,617.0
405,TCGA-O1-A52J,1,74.0,FEMALE,Early,True,1,1798.0


In [31]:
# Getting the features 
X = mergedSmoker[['is_smoker', 'gender']]

# labels
y = mergedSmoker['label']

In [32]:
smoker = mergedSmoker.copy()

In [33]:
# changing the gender to binary male=0, female=1
mergedSmoker['gender'] = mergedSmoker['gender'].replace({'MALE': 0, 'FEMALE': 1})
mergedSmoker

Unnamed: 0,patient_id,label,age_at_initial_pathologic_diagnosis,gender,tumor_stage,is_smoker,OS,OS.time
0,TCGA-05-4244,0,70.0,0,Late,True,0,0.0
1,TCGA-05-4249,1,67.0,0,Early,True,0,1523.0
2,TCGA-05-4382,0,68.0,0,Early,True,0,607.0
3,TCGA-05-4384,0,66.0,0,Late,True,0,426.0
4,TCGA-05-4389,1,70.0,0,Early,True,0,1369.0
...,...,...,...,...,...,...,...,...
402,TCGA-NJ-A55O,0,56.0,1,Early,True,0,13.0
403,TCGA-NJ-A55R,0,67.0,0,Early,False,0,603.0
404,TCGA-NJ-A7XG,0,49.0,0,Late,False,0,617.0
405,TCGA-O1-A52J,1,74.0,1,Early,True,1,1798.0


In [34]:
# getting the features for the model
X = mergedSmoker[['age_at_initial_pathologic_diagnosis', 'gender', 'is_smoker']]

# filling any nans with 0
X['age_at_initial_pathologic_diagnosis'] = X['age_at_initial_pathologic_diagnosis'].fillna(0)

# getting the labels for the model
y = mergedSmoker['label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age_at_initial_pathologic_diagnosis'] = X['age_at_initial_pathologic_diagnosis'].fillna(0)


In [35]:
# checking if there are any nans
X.isna().any()

age_at_initial_pathologic_diagnosis    False
gender                                 False
is_smoker                              False
dtype: bool

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
print(len(X_train))
print(len(X_test))

325
82


In [38]:
classifier = svm.SVC(kernel='linear')
classi = classifier.fit(X_train, y_train)

In [39]:
# getting training accuracy
train_accuracy = classi.score(X_train, y_train)
print("Training Accuracy:", train_accuracy)

Training Accuracy: 0.56


In [40]:
y_pred = classifier.predict(X_test)

In [41]:
# getting testing accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5121951219512195


### ^^ This is not that meaningful because we need more features for the model to predict whether this person will live or not 

make the other features 0 and 1 for the binary aspects

also we do not need the patient_id for these 

get rid of os time because it is giving the answers essentially

## ----------------------------------------------------------------------------------------------------------------------

# Comparing the gender, is smoker, gene pivot table,and age

In [43]:
meta_final_merged = mergedSmoker.copy()
mergedMut_raw = pd.read_csv('data/merged_mutations.csv')

mergedMetaMut = pd.merge(meta_final_merged, mergedMut_raw, on='patient_id')

In [44]:
# getting rid of the OS and os.time columns
# mergedMetaMut = mergedMetaMut.drop(columns=['OS', 'OS.time'])
mergedMetaMut

Unnamed: 0,patient_id,label,age_at_initial_pathologic_diagnosis,gender,tumor_stage,is_smoker,OS,OS.time,A1BG,A1CF,...,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,hsa-mir-490,hsa-mir-6080,snoU13
0,TCGA-05-4244,0,70.0,0,Late,True,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TCGA-05-4249,1,67.0,0,Early,True,0,1523.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TCGA-05-4382,0,68.0,0,Early,True,0,607.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TCGA-05-4384,0,66.0,0,Late,True,0,426.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TCGA-05-4389,1,70.0,0,Early,True,0,1369.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,TCGA-NJ-A55O,0,56.0,1,Early,True,0,13.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
403,TCGA-NJ-A55R,0,67.0,0,Early,False,0,603.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404,TCGA-NJ-A7XG,0,49.0,0,Late,False,0,617.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
405,TCGA-O1-A52J,1,74.0,1,Early,True,1,1798.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
list(mergedMetaMut['tumor_stage'].unique())

['Late', 'Early', nan]

In [46]:
# changing the tumor stage to binary early=0 late=1
mergedMetaMut['tumor_stage'] = mergedMetaMut['tumor_stage'].replace({'Early': 0, 'Late': 1})

# filling the nans
mergedMetaMut['age_at_initial_pathologic_diagnosis'] = mergedMetaMut['age_at_initial_pathologic_diagnosis'].fillna(0)
mergedMetaMut['tumor_stage'] = mergedMetaMut['tumor_stage'].fillna(0)

In [47]:
print(mergedMetaMut['age_at_initial_pathologic_diagnosis'].isna().any())
print(mergedMetaMut['tumor_stage'].isna().any())

False
False


In [48]:
mergedMetaMut = mergedMetaMut.drop(columns=['tumor_stage'])

In [49]:
mergedMetaMut.iloc[:,2:]

Unnamed: 0,age_at_initial_pathologic_diagnosis,gender,is_smoker,OS,OS.time,A1BG,A1CF,A2M,A2ML1,A3GALT2,...,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,hsa-mir-490,hsa-mir-6080,snoU13
0,70.0,0,True,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,67.0,0,True,0,1523.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,68.0,0,True,0,607.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,66.0,0,True,0,426.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,70.0,0,True,0,1369.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,56.0,1,True,0,13.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
403,67.0,0,False,0,603.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404,49.0,0,False,0,617.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
405,74.0,1,True,1,1798.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# getting the features for the model
X = mergedMetaMut.iloc[:, 2:]

# getting the labels for the model
y = mergedMetaMut['label']

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('X_train: ', len(X_train))
print('X_test:  ', len(X_test))

X_train:  325
X_test:   82


In [52]:
clf2 = svm.SVC(kernel='linear')
clf2 = classifier.fit(X_train, y_train)

# getting training accuracy
train_accuracy = classi.score(X_train, y_train)
print("Training Accuracy:", train_accuracy)

Training Accuracy: 1.0


In [53]:
y_pred = classifier.predict(X_test)

# getting testing accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


# Running More Tests

In [70]:
y = labels_raw
expression = pd.read_csv('data/Gene_expression.csv')
meta_file = y.merge(expression)
meta_file = meta_file.drop(['sample_type'], axis=1)
meta_file.columns = meta_file.columns.str.replace(r'\|.*', '')

metadata_final = pd.read_csv('data/metadata_final_no_os.csv')

metadata_final['gender'] = metadata_final['gender'].replace({'MALE': 0, 'FEMALE': 1})

metadata_final['tumor_stage'] = metadata_final['tumor_stage'].replace({'Early': 0, 'Late': 1})

metadata_final['is_smoker'] = metadata_final['is_smoker'].replace({bool(False): 0, bool(True): 1})

meta_file = meta_file.merge(metadata_final, how='left', on='patient_id')

meta_file = meta_file.drop(['patient_id'], axis=1)

  meta_file.columns = meta_file.columns.str.replace(r'\|.*', '')


In [71]:
meta_file

Unnamed: 0,label,?,?.1,?.2,?.3,?.4,?.5,?.6,?.7,?.8,...,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,age_at_initial_pathologic_diagnosis,gender,tumor_stage,is_smoker
0,0,0.115255,0.097686,-1.940808,-0.033268,1.057022,1.530721,0.136966,0.684342,1.622239,...,-0.233740,-1.752495,0.980841,-0.505237,-0.279620,-0.447237,70.0,0.0,1.0,1.0
1,1,-0.233267,0.219007,-0.289184,0.202607,-0.069052,0.029955,0.027188,-0.067842,0.106420,...,0.422735,-0.120232,-0.112752,-0.890109,0.314648,0.096801,67.0,0.0,0.0,1.0
2,0,0.248854,-0.384813,-0.533564,-0.173685,0.583903,0.123771,-0.210492,-0.304392,2.015219,...,-0.275621,0.608923,0.369427,1.273709,0.374660,-0.019820,,,,
3,0,-1.015130,-1.252003,-0.898805,-1.181021,0.757503,-0.013923,0.260217,0.220563,0.982440,...,0.782634,-0.680088,-0.150073,-0.171071,1.255729,0.236221,66.0,0.0,1.0,1.0
4,1,-0.816478,0.346711,-1.143054,1.242887,-1.570895,-0.166251,-0.193914,1.189426,-1.087442,...,-0.065455,0.673791,-1.420624,-0.711757,-0.833433,0.356558,70.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,0,0.593944,0.996343,-0.009038,0.047639,0.668383,-1.208640,-0.104800,0.017354,0.015833,...,0.222568,-0.210608,-0.386059,0.289729,0.667480,-0.396243,56.0,1.0,0.0,1.0
402,0,-0.069018,-0.011506,-0.817153,-0.316691,1.148248,2.908490,-1.628148,0.723387,1.853790,...,1.156270,0.476298,-0.242561,-0.339250,1.557140,-0.263652,67.0,0.0,0.0,0.0
403,0,1.749735,2.648401,-0.450322,-0.660086,1.016852,0.941106,-1.506687,-1.028552,-0.080372,...,0.543399,-0.541973,-1.441193,-1.226014,0.813458,-1.338024,49.0,0.0,1.0,0.0
404,1,0.486836,1.062102,-1.368185,0.780825,0.141787,0.123525,-0.071799,0.739805,1.237751,...,0.724190,-1.841894,-0.135198,0.430325,0.959205,-1.000029,74.0,1.0,0.0,1.0


In [77]:
prog_gene = pd.read_csv('data/prognosis_genes.txt')
meta_file.columns = [header.split('|')[0].strip() for header in meta_file.columns]

additional_columns = np.concatenate((metadata_final.columns[1:], [meta_file.columns[0]]))

valid_genes = [col for col in prog_gene['GENE'] if col in meta_file.columns]

columns_to_include = np.concatenate((valid_genes, additional_columns))

prog_meta_file = meta_file[columns_to_include]
prog_meta_file = prog_meta_file.fillna(0)

In [78]:
prog_meta_file

Unnamed: 0,BAG1,CASP4,FADD,ITGA2,KRT18,KRT19,KRT7,LAMB1,BMP2,CDC6,...,RPS6KB1,TMF1,FEZ2,KIAA0020,KIAA0317,age_at_initial_pathologic_diagnosis,gender,tumor_stage,is_smoker,label
0,-0.300973,-1.609040,1.144728,1.466476,-0.045449,0.565879,-0.120513,-1.195601,0.775067,0.715580,...,2.714577,1.047157,1.207741,-1.151396,0.546405,70.0,0.0,1.0,1.0,0
1,-0.326736,0.284783,-0.829652,0.872741,-0.429994,-0.556406,-1.453648,-1.133745,-1.459483,-1.149056,...,0.700554,0.840149,1.429372,-0.572703,0.891123,67.0,0.0,0.0,1.0,1
2,-0.809002,0.666188,-0.071178,-0.373269,-0.579386,0.511680,-0.276158,0.927946,-0.569311,0.377111,...,-0.231853,-0.649095,0.706858,-0.922223,0.445354,0.0,0.0,0.0,0.0,0
3,2.153975,-0.408216,-0.112112,-1.143161,0.413813,-0.054186,0.437825,-0.290499,0.525574,-1.870488,...,-0.382774,0.286660,0.277540,-0.979793,0.910829,66.0,0.0,1.0,1.0,0
4,0.946434,-0.220682,1.436611,-0.219743,-0.393587,-0.553767,-1.028831,-2.264036,-0.594682,0.430613,...,-0.721583,-1.758530,-0.384056,-0.535291,0.846636,70.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,1.720983,-0.294207,0.186656,-0.090690,-0.574602,0.432138,-0.153641,-0.108077,0.679123,-0.771427,...,-1.533007,0.313823,-0.575478,0.216629,0.329030,56.0,1.0,0.0,1.0,0
402,0.643838,-1.274854,-0.487768,-1.514758,-0.723718,-2.239129,-0.585106,0.278553,0.358615,-0.845496,...,-2.097207,-0.732777,-1.599209,-0.367065,0.397518,67.0,0.0,0.0,0.0,0
403,0.351929,0.644262,-0.065726,-0.313786,0.209452,0.176847,0.057328,-1.890316,-0.258741,-1.481856,...,-0.004379,-0.687599,-0.616669,-1.207571,-1.565932,49.0,0.0,1.0,0.0,0
404,-1.156108,0.114911,-0.363290,0.628345,-1.067190,-0.835419,0.185254,-1.021971,0.494188,-1.134124,...,-1.358910,-1.883722,-0.607404,-1.195800,-0.097168,74.0,1.0,0.0,1.0,1


## Cross validation classification

In [81]:
X = prog_meta_file.iloc[:, 0:-1]
y = prog_meta_file.iloc[:,-1]

X, y = shuffle(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [82]:
print(len(X_train))
print(len(X_test))

324
82


In [87]:
model = LogisticRegression(C=1, penalty='none')
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [91]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('accuracy: ', accuracy)

cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ')
print(cm)

accuracy:  0.5487804878048781
Confusion Matrix: 
[[20 18]
 [19 25]]


## CV with SVM

In [96]:
cv = RepeatedStratifiedKFold(n_splits=5, random_state=42)
model = svm.SVC(kernel='linear')

scores = cross_val_score(model, X, y, scoring='accuracy',  cv=cv, n_jobs=-1)
recall = cross_val_score(model, X, y, scoring='recall', cv=cv,  n_jobs=-1)
precision = cross_val_score(model, X, y, scoring='precision', cv=cv, n_jobs=-1)

In [97]:
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
print('Mean Recall: %.3f (%.3f)' % (np.mean(recall), np.std(recall)))
print('Mean Precision: %.3f (%.3f)' % (np.mean(precision), np.std(precision)))

Mean Accuracy: 0.515 (0.044)
Mean Recall: 0.557 (0.064)
Mean Precision: 0.541 (0.041)


In [98]:
cv = RepeatedStratifiedKFold(n_splits=5, random_state=42)
model = svm.SVC(kernel='rbf')

scores = cross_val_score(model, X, y, scoring='accuracy',  cv=cv, n_jobs=-1)
recall = cross_val_score(model, X, y, scoring='recall', cv=cv,  n_jobs=-1)
precision = cross_val_score(model, X, y, scoring='precision', cv=cv, n_jobs=-1)

In [99]:
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
print('Mean Recall: %.3f (%.3f)' % (np.mean(recall), np.std(recall)))
print('Mean Precision: %.3f (%.3f)' % (np.mean(precision), np.std(precision)))

Mean Accuracy: 0.523 (0.020)
Mean Recall: 0.951 (0.117)
Mean Precision: 0.527 (0.011)
