### Data Analysis on MCI and ADD patient Data

In [35]:
%matplotlib inline
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import table
from scipy import interp

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    StratifiedShuffleSplit,
    cross_val_score,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

np.random.seed(45)


1. Helper Functions that will come in Handy

In [46]:
# Create table for missing data analysis
def draw_missing_data_table(df):
    """Summarise the missing values in each column of ``df``.

    Returns a DataFrame indexed by column name with two columns:
    'Total missing Values' (number of null cells) and '% of Missing Values'
    (that number as a percentage of the row count). Both series are sorted
    in descending order before being combined.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    # The mask itself contains no NaN, so count() yields the row count per column.
    row_counts = null_mask.count()

    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / row_counts).sort_values(ascending=False) * 100

    return pd.concat(
        [total, percent],
        axis=1,
        keys=['Total missing Values', '% of Missing Values'],
    )

def drop_null_values(data, limit_percentage):
    """Drop, in place, the columns of ``data`` that have too few non-null values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place (other cells in this notebook
        rely on that) and the same object is also returned.
    limit_percentage : float
        Minimum percentage (0-100) of rows that must be non-null for a column
        to be kept. ``100`` keeps only fully populated columns.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, after the sparse columns have been removed.
    """
    # dropna's ``thresh`` is the minimum number of non-null values a column
    # needs in order to be kept.
    thresh = len(data) * (limit_percentage/100)
    print(thresh)
    # dropna(inplace=True) returns None, so return the mutated frame explicitly;
    # otherwise `cleaned = drop_null_values(...)` silently binds None.
    data.dropna(thresh = thresh, axis = 1, inplace = True)
    return data

In [37]:
def load_data(file_path):
    """Read the Excel workbook at ``file_path`` and return it as a DataFrame."""
    workbook = pd.read_excel(file_path)
    return workbook

1. Loading dataset as is

In [38]:
data_mci = load_data("data/AD_PAGN_GN_C01_00121_MCI.xlsx")
data_controls = load_data("data/AD_PAGN_GN_C01_00141_control.xlsx")
data_add         = load_data("data/AD_PAGN_GN_C01_00240_AD.xlsx")

In [39]:
data_mci.head(5)

Unnamed: 0,Recording timestamp,Project name,Export date,Participant name,Gender,Glasses,Age,Recording name,Recording date,Recording start time,...,Client area position X (DACSpx),Client area position Y (DACSpx),Viewport position X,Viewport position Y,Viewport width,Viewport height,Full page width,Full page height,Mouse position X,Mouse position Y
0,0,AD_PAGN,03/05/2020 09:28:24,C01_121,Female,Yes,≥71,GN_C01_121,07/08/2019 13:22:56,13:22:56.310,...,,,,,,,,,,
1,82,AD_PAGN,03/05/2020 09:28:24,C01_121,Female,Yes,≥71,GN_C01_121,07/08/2019 13:22:56,13:22:56.310,...,,,,,,,,,,
2,170,AD_PAGN,03/05/2020 09:28:24,C01_121,Female,Yes,≥71,GN_C01_121,07/08/2019 13:22:56,13:22:56.310,...,,,,,,,,,,
3,173,AD_PAGN,03/05/2020 09:28:24,C01_121,Female,Yes,≥71,GN_C01_121,07/08/2019 13:22:56,13:22:56.310,...,,,,,,,,,,
4,177,AD_PAGN,03/05/2020 09:28:24,C01_121,Female,Yes,≥71,GN_C01_121,07/08/2019 13:22:56,13:22:56.310,...,,,,,,,,,,


In [40]:
data_mci.describe()

Unnamed: 0,Recording timestamp,Recording duration,Recording resolution height,Recording resolution width,Recording monitor latency,Average calibration accuracy (mm),Average calibration precision SD (mm),Average calibration precision RMS (mm),Average calibration accuracy (degrees),Average calibration precision SD (degrees),...,Client area position X (DACSpx),Client area position Y (DACSpx),Viewport position X,Viewport position Y,Viewport width,Viewport height,Full page width,Full page height,Mouse position X,Mouse position Y
count,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,184.0,184.0
mean,161905.446026,323753.0,1080.0,1920.0,10.0,4.0,2.7,2.0,0.34,0.22,...,,,,,,,,,-392.994565,877.668478
std,93365.948931,0.0,0.0,0.0,0.0,0.0,4.849035e-12,0.0,7.810459e-14,6.103482e-14,...,,,,,,,,,167.356251,75.988667
min,0.0,323753.0,1080.0,1920.0,10.0,4.0,2.7,2.0,0.34,0.22,...,,,,,,,,,-742.0,643.0
25%,81051.0,323753.0,1080.0,1920.0,10.0,4.0,2.7,2.0,0.34,0.22,...,,,,,,,,,-493.0,821.75
50%,161904.0,323753.0,1080.0,1920.0,10.0,4.0,2.7,2.0,0.34,0.22,...,,,,,,,,,-330.0,866.0
75%,242760.0,323753.0,1080.0,1920.0,10.0,4.0,2.7,2.0,0.34,0.22,...,,,,,,,,,-297.0,934.25
max,323753.0,323753.0,1080.0,1920.0,10.0,4.0,2.7,2.0,0.34,0.22,...,,,,,,,,,-96.0,1003.0


# Understanding the Dataset

We first establish the extent of missing values in each dataset. As a starting point, we drop the features that have 100% missing values.

1. Dealing with Mild Cognitive Impairment(MCI) data
We have dropped the following features from our dataset since they have 100% missing values:
AOI hit [TA_Lt - TA_Lt_L]
AOI hit [TA_Lt - TA_Lt_C] 	
AOI hit [CueP - CueP_C] 	
AOI hit [TP_Rt - TP_Rt_R] 
AOI hit [TP_Rt - TP_Rt_L] 	
AOI hit [TP_Rt - TP_Rt_C] 
AOI hit [TP_Lt - TP_Lt_R] 
AOI hit [TA_Rt - TA_Rt_C] 	
AOI hit [TA_Rt - TA_Rt_L] 	
AOI hit [TA_Rt - TA_Rt_R] 	
AOI hit [CueA - CueA_C] 	
AOI hit [CueA - CueA_L] 	
AOI hit [CueA - CueA_R] 	
AOI hit [FixP - FixP] 	
AOI hit [CueP - CueP_L] 	
AOI hit [CueP - CueP_R] 	
AOI hit [TA_Lt - TA_Lt_R] 	
AOI hit [FixA - FixA] 	
AOI hit [TP_Lt - TP_Lt_L] 	
AOI hit [TP_Lt - TP_Lt_C] 	
Client area position X (DACSpx) 	
Client area position Y (DACSpx) 	
Viewport position X 	
Viewport position Y 	
Viewport width 
Viewport height 	
Full page width 	
Full page height 	

In [41]:
draw_missing_data_table(data_mci)

Unnamed: 0,Total missing Values,% of Missing Values
AOI hit [TA_Lt - TA_Lt_L],97889,100.000000
AOI hit [TA_Lt - TA_Lt_C],97889,100.000000
AOI hit [CueP - CueP_C],97889,100.000000
AOI hit [TP_Rt - TP_Rt_R],97889,100.000000
AOI hit [TP_Rt - TP_Rt_L],97889,100.000000
AOI hit [TP_Rt - TP_Rt_C],97889,100.000000
AOI hit [TP_Lt - TP_Lt_R],97889,100.000000
AOI hit [TA_Rt - TA_Rt_C],97889,100.000000
AOI hit [TA_Rt - TA_Rt_L],97889,100.000000
AOI hit [TA_Rt - TA_Rt_R],97889,100.000000


In [47]:
# Drop every column that is not fully populated: a limit of 100 means a column
# must be 100% non-null to be kept. (To drop only the columns with more than
# 80% NULL values, pass 20 instead.)
# drop_null_values mutates data_mci in place, so bind the cleaned name to the
# frame itself rather than relying on the call's return value.
drop_null_values(data_mci, 100)
data_mci_clean = data_mci

97889.0


In [53]:
data_mci.describe()

Unnamed: 0,Recording timestamp,Recording duration,Recording resolution height,Recording resolution width,Recording monitor latency,Average calibration accuracy (mm),Average calibration precision SD (mm),Average calibration precision RMS (mm),Average calibration accuracy (degrees),Average calibration precision SD (degrees),Average calibration precision RMS (degrees),Average calibration accuracy (pixels),Average calibration precision SD (pixels),Average calibration precision RMS (pixels),AOI hit [Target:Correct],AOI hit [Target:Uncorrect]
count,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0,97889.0
mean,161905.446026,323753.0,1080.0,1920.0,10.0,4.0,2.7,2.0,0.34,0.22,0.16,15.0,10.0,7.0,0.268896,0.073542
std,93365.948931,0.0,0.0,0.0,0.0,0.0,4.849035e-12,0.0,7.810459e-14,6.103482e-14,1.218198e-13,0.0,0.0,0.0,0.443388,0.261026
min,0.0,323753.0,1080.0,1920.0,10.0,4.0,2.7,2.0,0.34,0.22,0.16,15.0,10.0,7.0,0.0,0.0
25%,81051.0,323753.0,1080.0,1920.0,10.0,4.0,2.7,2.0,0.34,0.22,0.16,15.0,10.0,7.0,0.0,0.0
50%,161904.0,323753.0,1080.0,1920.0,10.0,4.0,2.7,2.0,0.34,0.22,0.16,15.0,10.0,7.0,0.0,0.0
75%,242760.0,323753.0,1080.0,1920.0,10.0,4.0,2.7,2.0,0.34,0.22,0.16,15.0,10.0,7.0,1.0,0.0
max,323753.0,323753.0,1080.0,1920.0,10.0,4.0,2.7,2.0,0.34,0.22,0.16,15.0,10.0,7.0,1.0,1.0


In [44]:
# NOTE(review): the recorded run of this cell raised "IndexError: positional
# indexers are out-of-bounds". Positions 41 and 42 presumably refer to the
# column layout before drop_null_values() removed columns from data_mci in
# place; selecting by column name would avoid this -- TODO confirm which two
# columns were intended.
data_mci_features= data_mci.iloc[:, [41,42]].dropna()

IndexError: positional indexers are out-of-bounds

In [None]:
data_mci_features #Lets drop the null values

In [None]:
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
fmri = sns.load_dataset("fmri")
ax = sns.lineplot(x="timepoint", y="signal", data=fmri,hue="event")


In [None]:
fmri.head(5)

In [None]:
# Analyse missing data
data = draw_missing_data_table(df)
#generate_image_for_table(data,924,'missing_data_table')

In [None]:
#df = df[df['Settlement'].notnull()]
#df.head()
#draw_missing_data_table(filtered_df)

In [None]:
# Drop the columns that contain NaN values. The eight positions below are
# presumably 'ID', 'Been_to_prison', 'Smoker', 'Contact_with_TB',
# 'Contact_with_MDRTB', 'Alcoholic', 'Income' and 'Occupational_Status'
# -- TODO confirm against df.columns.
# NOTE(review): this mutates df in place and selects by position, so running
# the cell a second time drops a different set of columns (or fails).
df.drop(df.columns[[0, 3,6,9,10,11,12,13]], axis=1, inplace=True)

In [None]:
df.head(10)
#df[df['Has_MDR']==0]


In [None]:
df.info()

In [None]:
df.describe()

The count, mean, min and max rows are self-explanatory. The std shows standard deviation. The 25%, 50% and 75% rows show the corresponding percentiles.

To get a feel of what type of the data we are dealing with, we plot a histogram for each numeric attribute.

In [None]:
# Independent variables: the first 8 columns of df, all rows.
features= df.iloc[:, 0:8]
# Dependent (outcome) variable: the last column of df, all rows.
labels= df.iloc[:,-1]
# Hold out 25% of the rows for testing; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=1)

In [None]:
features=features.filter(["TB_Episodes","Treatment_Reason","HIV_Status","TB_History","Closeness_to_facility","Settlement","Age"])

In [None]:
svc_param_selection(features,labels,10)

In [None]:
import time

# Hyper-parameter grid for the logistic-regression search.
dual=[True,False]
max_iter=[100,110,120,130,140]
C = [1.0,1.5,2.0,2.5]
param_grid = dict(dual=dual,max_iter=max_iter,C=C)

# The dual formulation is only implemented by the liblinear solver (with an l2
# penalty); with the default solver every dual=True candidate fails to fit.
lr = LogisticRegression(penalty='l2', solver='liblinear')
grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv = 10, n_jobs=-1)

start_time = time.time()
grid_result = grid.fit(features,labels)
# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# time.time() measures seconds, so the elapsed time is reported in seconds.
print("Execution time: " + str((time.time() - start_time)) + ' s')

In [None]:
lg = LogisticRegression()
dt = DecisionTreeClassifier()
sv= SVC(probability=True)
rf=  RandomForestClassifier()

In [None]:
sv.fit(X_train, y_train)
y_predict = sv.predict(X_test)
cm = confusion_matrix(y_test, y_predict)

In [None]:
#performing validation using K fold cross validation
scores= cross_val_score(lg, features, labels, scoring='accuracy',cv=10)
print(scores)

In [None]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# Recorded results per model. The second figure is presumably the +/- value
# printed above (two standard deviations of the fold scores) -- TODO confirm.
# SVM: accuracy 79%, +/- 51%
# LR:  accuracy 77%, +/- 51%
# DT:  accuracy 69%, +/- 38%
# RF:  accuracy 69%, +/- 34%

# Performing Model Evaluation

In [None]:
# Render the confusion matrix `cm` as an annotated heat map.
classNames = ['Negative','Positive']
s = [['TN','FP'], ['FN', 'TP']]
tick_marks = np.arange(len(classNames))

plt.clf()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
plt.title('MDR-TB or Not MDR-TB Confusion Matrix - SVM')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)

# Label every cell with its quadrant name and count (x is the column, y the row).
for i, row_labels in enumerate(s):
    for j, quadrant in enumerate(row_labels):
        plt.text(j, i, "%s = %s" % (quadrant, cm[i][j]))
plt.show()

In [None]:
plot_ROC(X_test, y_test,lg)

In [None]:
# Univariate feature selection on the dataset's features.
# Keep the 7 features with the highest chi-squared scores
# (f_classif is an alternative score function).
selector = SelectKBest(score_func=chi2, k = 7)
# fit_transform returns the data reduced to the selected features, for later use in a classifier;
# fit() alone is enough when only the feature names and their scores are needed.

X_new = selector.fit_transform(features, labels)
# get_support() is a boolean mask over the input columns marking the selected ones.
names = features.columns.values[selector.get_support()]
scores = selector.scores_[selector.get_support()]

names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(data = names_scores, columns=['Feature', 'Chi-squared Scores'])
# Sort by score (then feature name), both descending, for easier reading.
ns_df_sorted = ns_df.sort_values(['Chi-squared Scores', 'Feature'], ascending = [False, False])
print(ns_df_sorted)

# NOTE(review): generate_image_for_table is not defined in the visible part of
# this notebook -- confirm it exists before running this cell.
generate_image_for_table(ns_df_sorted,614,'feature_scores_table')

In [None]:
features= features.filter(["TB_Episodes","Treatment_Reason","HIV_Status","Closeness_to_facility","TB_History","Settlement"])
compare_classifiers(features,labels,"Classifier Accuracy Comparison (with 6 Best Features)")

In [None]:
# Grid-search the regularisation strength and penalty type for logistic
# regression. GridSearchCV is already imported in the notebook's import cell.
# The 'l1' penalty is not supported by the default solver, so use liblinear,
# which handles both 'l1' and 'l2'.
clf = LogisticRegression(solver='liblinear')

param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}

gridsearch = GridSearchCV(clf, param_grid)

gridsearch.fit(features, labels);
gridsearch.best_params_

In [None]:
X.head(1)

In [None]:
X= features.filter(["TB_Episodes","Treatment_Reason","HIV_Status","Closeness_to_facility","TB_History"])

In [None]:
X.head(1)

In [None]:
#Pick only feaures with a higher F-Score
#X= X.filter(["TB_Episodes", "TB_History","Treatment_Reason","HIV_Status"])


In [None]:
lg = LogisticRegression()
dt = DecisionTreeClassifier()
sv= SVC(probability=True)
rf=  RandomForestClassifier()

In [None]:
#x = df.loc[:, df.columns != 'Has_MDR']
x= features.filter(["TB_Episodes","Treatment_Reason","HIV_Status","Closeness_to_facility","TB_History"])
y = df.loc[:,'Has_MDR']

In [None]:
#plot_ROC(y_train_true, y_train_prob, y_test_true, y_test_prob)
# NOTE(review): plotKFoldROC is not defined in the visible part of this
# notebook -- confirm it exists before running this cell.
plotKFoldROC(rf,features,labels)

In [None]:
# Plot learning curves
title = "Learning Curves (Logistic Regression)"
svm_title = "Learning Curves (SVM)"
dt_title = "Learning Curves (Decision Tree)"
cv = 10
plot_learning_curve(logit_classifer, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=1);

In [None]:
data = pd.DataFrame(data = knn, columns=['Experiment','Accuracy'])
plt.xlabel('Accuracy')
plt.title("K-Nearest Neighbors Accuracy in the experiments")
sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Experiment', data=data, color="b")
plt.savefig("knn_experiment_accuracy_barplot.png")
    

In [None]:
generate_image_for_table(data,614,'knn_experiment_accuracy_table')

In [None]:
#Create a histogram
sns.countplot(x='HIV_Status',data=df, palette='hls')
plt.savefig('hist_HIVStatus.png')
plt.show()

In [None]:
#sns.distplot(df['Gender']);

# Bar plot of Has_MDR for each HIV_Status category.
# x and y are passed by keyword: seaborn 0.12 and later accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.barplot(x=df['HIV_Status'], y=df['Has_MDR']);
#plt.title('Influence of Gender on MDR-TB')
plt.savefig('HIVStatus_mdrtb.png')


In [None]:
df.groupby(['HIV_Status']).count()

In [None]:
# x, y and hue are passed by keyword: seaborn 0.12 and later accept only
# `data` positionally, so the positional form raises a TypeError there.
sns.barplot(x=df['Treatment_Reason'], y=df['Gender'], hue=df['Has_MDR']);

In [None]:
# Colours and legend labels for the two histogram series plotted below
# (HIV_Status first, Has_MDR second).
# NOTE(review): the labels 'Negative'/'Positive' and the output file name
# 'gender.png' do not obviously match those two series -- confirm intent.
colors = ['#E69F00', '#56B4E9']
names = ['Negative', 'Positive']
         
# Make the histogram using a list of lists
plt.hist([df['HIV_Status'],df['Has_MDR']],
         color = colors, label=names)

# Plot formatting
plt.legend()
plt.xlabel('MDR-TB')
plt.ylabel('Number of Patients')
plt.title('MDR-TB Vs HIV Status')
plt.savefig('gender.png')

In [None]:
# x, y and hue are passed by keyword: seaborn 0.12 and later accept only
# `data` positionally, so the positional form raises a TypeError there.
sns.barplot(x=df['Has_MDR'], y=df['Treatment_Reason'], hue=df['Gender']);

In [None]:
# x, y and hue are passed by keyword: seaborn 0.12 and later accept only
# `data` positionally, so the positional form raises a TypeError there.
sns.barplot(x=df['Has_MDR'], y=df['Treatment_Reason'], hue=df['Gender']);

In [None]:
# x, y and hue are passed by keyword: seaborn 0.12 and later accept only
# `data` positionally, so the positional form raises a TypeError there.
sns.barplot(x=df['Has_MDR'], y=df['TB_Episodes'], hue=df['HIV_Status']);

In [None]:
df.groupby(['HIV_Status']).count()

In [None]:
# Debug
print('Inputs: \n', X_train.head())
print('Outputs: \n', y_train.head())

In [None]:
# Confusion matrix for the logistic-regression classifier
y_pred = logit_classifer.predict(X_test)
print("Accuracy Score for Logistic Regression: ")
print(accuracy_score(y_test,y_pred))
# Store the result under its own name: rebinding `confusion_matrix` would
# shadow the sklearn function and make every later call to it fail.
logit_confusion_matrix = confusion_matrix(y_test, y_pred)
print("1. Logistic Regression Confusion Matrix\n")
print(logit_confusion_matrix)

In [None]:
# Confusion matrix for the SVM classifier
y_pred = svm_classifer.predict(X_test)
print("Accuracy Score for SVM Regression: ")
print(accuracy_score(y_test,y_pred))
# Store the result under its own name: rebinding `confusion_matrix` would
# shadow the sklearn function and make every later call to it fail.
svm_confusion_matrix = confusion_matrix(y_test, y_pred)
print("2. SVM Confusion Matrix\n")
print(svm_confusion_matrix)

In [None]:
# Confusion matrix for the decision-tree classifier
y_pred = tree_classifer.predict(X_test)
print("Accuracy Score for Decision Tree Classifier: ")
print(accuracy_score(y_test,y_pred))
# Store the result under its own name: rebinding `confusion_matrix` would
# shadow the sklearn function and make every later call to it fail.
dtree_confusion_matrix = confusion_matrix(y_test, y_pred)
# Third matrix in the series (logistic regression is 1, SVM is 2).
print("3. Decison tree classifer Confusion Matrix\n")
print(dtree_confusion_matrix)

In [None]:
# Model performance
cv =10
scores = cross_val_score(logit_classifer, X_train, y_train, cv=cv)
svm_scores = cross_val_score(svm_classifer, X_train, y_train, cv=cv)
dt_scores = cross_val_score(tree_classifer, X_train, y_train, cv=cv)
print('Logit CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
print('SVM CV accuracy: %.3f +/- %.3f' % (np.mean(svm_scores), np.std(svm_scores)))
print('Tree CV accuracy: %.3f +/- %.3f' % (np.mean(dt_scores), np.std(dt_scores)))

In [None]:

#plot_learning_curve(svm_classifer, svm_title, X_train, y_train, ylim=(0.7, 1.01), cv=cv, n_jobs=1);
#plot_learning_curve(tree_classifer, dt_title, X_train, y_train, ylim=(0.7, 1.01), cv=cv, n_jobs=1);

In [None]:
# Plot validation curve
title = 'Validation Curve (Logistic Regression)'
param_name = 'C'
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0] 
cv = 10
plot_validation_curve(estimator=logit_classifer, title=title, X=X_train, y=y_train, param_name=param_name,
                      ylim=(0.5, 1.01), param_range=param_range);

In [None]:
y_svm_pred = svm_classifer.predict(X_test)
from sklearn.metrics import confusion_matrix
svm_confusion_matrix = confusion_matrix(y_test, y_svm_pred)
print("2. SVM Confusion Matrix\n")
print(svm_confusion_matrix)

In [None]:
y_dtree_pred = tree_classifer.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score

dtree_confusion_matrix = confusion_matrix(y_test, y_dtree_pred)
print("3. Decision Trees Confusion Matrix\n")
print(dtree_confusion_matrix)

In [None]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logit_classifer.score(X_test, y_test)))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(svm_classifer.score(X_test, y_test)))
print('Accuracy of  Decision Tree classifier on test set: {:.2f}'.format(tree_classifer.score(X_test, y_test)))


Performing Exploratory data analysis on the Data


In [None]:
sns.countplot(x="TB_Episodes", data=df)
plt.show()

As we can see, each of the genders is equally affected by the MDR-TB, that is 60%.

In [None]:
# Age thresholds separating the three shaded bands.
limit_1 = 12
limit_2 = 50

# Number of distinct ages below each threshold, used as the x position of the
# band boundaries (the bar plot presumably draws one bar per distinct age).
x_limit_1 = np.size(df[df['Age'] < limit_1]['Age'].unique())
x_limit_2 = np.size(df[df['Age'] < limit_2]['Age'].unique())

plt.figure(figsize=(25,10))
# x and y are passed by keyword: seaborn 0.12 and later accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.barplot(x=df['Age'], y=df['Has_MDR'], ci=None)

# Shade the three age bands.
plt.axvspan(-1, x_limit_1, alpha=0.25, color='green')
plt.axvspan(x_limit_1, x_limit_2, alpha=0.25, color='red')
plt.axvspan(x_limit_2, 100, alpha=0.25, color='yellow')

plt.xticks(rotation=90);



In [None]:
# Bin data
df['AgeRange'] = pd.cut(df['Age'], bins=[0, 12, 50, 200], labels=['Child','Adult','Elder'])
df['AgeRange'].head()

In [None]:
# Plot
# NOTE(review): this plots the raw 'Age' column although the figure is saved
# as 'ageRange_mdrtb.png'; presumably 'AgeRange' was intended -- confirm.
# x and y are passed by keyword: seaborn 0.12 and later accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.barplot(x=df['Age'], y=df['Has_MDR']);
plt.savefig('ageRange_mdrtb.png')

In [None]:
sns.countplot(x='Age',data=df, palette='hls')
plt.savefig('hist_ageRange.png')
plt.show()

In [None]:
df.groupby(['Age']).count()

In [None]:
## Get score using original model

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

X_train_transformed = X_train
X_test_transformed = X_test

# Rescale data; the scaler is fit on the training split only.
scaler = MinMaxScaler()
X_train_transformed_scaled = scaler.fit_transform(X_train_transformed)
X_test_transformed_scaled = scaler.transform(X_test_transformed)

# Get polynomial features. The expansion is fit on the same scaled array it
# transforms, rather than on the unscaled input.
poly = PolynomialFeatures(degree=2).fit(X_train_transformed_scaled)
X_train_poly = poly.transform(X_train_transformed_scaled)
X_test_poly = poly.transform(X_test_transformed_scaled)

# Baseline: 10-fold CV accuracy on the original features. cross_val_score fits
# clones of the estimator, so no separate logreg.fit() call is needed.
logreg = LogisticRegression(C=1)
scores = cross_val_score(logreg, X_train, y_train, cv=10)
print('CV accuracy (original): %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
baseline_score = np.mean(scores)

## Get score using models with feature selection
# Track the best polynomial subset separately from the baseline so that
# k_features_highest_score and std are always defined after the loop, even
# when no subset reaches the baseline score.
best_poly_score = -np.inf
for i in range(1, X_train_poly.shape[1]+1, 1):
    # Select i features
    select = SelectKBest(score_func=chi2, k=i)
    select.fit(X_train_poly, y_train)
    X_train_poly_selected = select.transform(X_train_poly)

    # Model with i features selected
    scores = cross_val_score(logreg, X_train_poly_selected, y_train, cv=10)
    print('CV accuracy (number of features = %i): %.3f +/- %.3f' % (i,np.mean(scores),np.std(scores)))

    # Save results if best score so far; on a tie the larger feature count wins.
    mean_score = np.mean(scores)
    if mean_score >= best_poly_score:
        best_poly_score = mean_score
        std = np.std(scores)
        k_features_highest_score = i

highest_score = max(baseline_score, best_poly_score)
if best_poly_score < baseline_score:
    print('Note: no polynomial feature subset reached the baseline score of %.3f' % baseline_score)

# Print the number of features
print('Number of features when highest score: %i' % k_features_highest_score)


In [None]:
# Select the best features, using the feature count chosen by the search over
# polynomial feature subsets (k_features_highest_score).
select = SelectKBest(score_func=chi2, k=k_features_highest_score)
select.fit(X_train_poly, y_train)
X_train_poly_selected = select.transform(X_train_poly)

# Fit model on the selected polynomial features
logreg = LogisticRegression(C=1)
logreg.fit(X_train_poly_selected, y_train)

# Model performance: 10-fold cross-validated accuracy.
# NOTE(review): the selector is fit on all of X_train_poly before
# cross-validation, so each fold's held-out rows already influenced feature
# selection; the CV estimate may therefore be optimistic.
scores = cross_val_score(logreg, X_train_poly_selected, y_train, cv=10)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))