In [6]:
import pandas as pd
import numpy as np

admissions = pd.read_csv("./data/data_lor-kfcv_admissions.csv")
admissions["actual_label"] = admissions["admit"]
admissions = admissions.drop("admit", axis=1)

shuffled_index = np.random.permutation(admissions.index)
shuffled_admissions = admissions.loc[shuffled_index]
admissions = shuffled_admissions.reset_index()
# .ix is deprecated. Please use (.loc for label based indexing) or (.iloc for positional indexing)
admissions.loc[  0:128, "fold"] = 1
admissions.loc[129:257, "fold"] = 2
admissions.loc[258:386, "fold"] = 3
admissions.loc[387:514, "fold"] = 4
admissions.loc[515:644, "fold"] = 5
# Ensure the column is set to integer type.
admissions["fold"] = admissions["fold"].astype('int')

print(admissions.head())
print(admissions.tail())

   index       gpa         gre  actual_label  fold
0    338  2.995996  555.261041             0     1
1    569  3.591271  673.552753             1     1
2    146  3.445072  577.154385             0     1
3    488  3.470824  647.706576             1     1
4    545  3.277949  720.062434             1     1
     index       gpa         gre  actual_label  fold
639    145  2.804895  544.943178             0     5
640    133  3.474318  661.969399             0     5
641    161  3.218202  684.068073             0     5
642    364  3.348772  605.968197             0     5
643    332  3.287633  496.568852             0     5


In [7]:
from sklearn.linear_model import LogisticRegression
# Training
model = LogisticRegression()
train_iteration_one = admissions[admissions["fold"] != 1]
test_iteration_one = admissions[admissions["fold"] == 1]
model.fit(train_iteration_one[["gpa"]], train_iteration_one["actual_label"])

# Predicting
labels = model.predict(test_iteration_one[["gpa"]])
test_iteration_one["predicted_label"] = labels
#print(test_iteration_one)

matches = test_iteration_one["predicted_label"] == test_iteration_one["actual_label"]
correct_predictions = test_iteration_one[matches]
iteration_one_accuracy = len(correct_predictions) / float(len(test_iteration_one))
print(iteration_one_accuracy)

0.6976744186046512


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [8]:
import numpy as np
fold_ids = [1,2,3,4,5]
def train_and_test(df, folds):
    fold_accuracies = []
    for fold in folds:
        model = LogisticRegression()
        train = admissions[admissions["fold"] != fold]
        test = admissions[admissions["fold"] == fold]
        model.fit(train[["gpa"]], train["actual_label"])
        labels = model.predict(test[["gpa"]])
        test["predicted_label"] = labels

        matches = test["predicted_label"] == test["actual_label"]
        correct_predictions = test[matches]
        fold_accuracies.append(len(correct_predictions) / float(len(test)))
    return(fold_accuracies)

accuracies = train_and_test(admissions, fold_ids)
print(accuracies)
average_accuracy = np.mean(accuracies)
print(average_accuracy)

[0.6976744186046512, 0.6046511627906976, 0.6046511627906976, 0.6328125, 0.6201550387596899]
0.6319888565891473


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


<img src="./figure/fig_kfold_parameter.png" alt="FAO" width="600" align="left">

In [9]:
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score

admissions = pd.read_csv("./data/data_lor-kfcv_admissions.csv")
admissions["actual_label"] = admissions["admit"]
admissions = admissions.drop("admit", axis=1)

kf = KFold(len(admissions), 5, shuffle=True, random_state=8)
lr = LogisticRegression()
#roc_auc 
accuracies = cross_val_score(lr,admissions[["gpa"]], admissions["actual_label"], scoring="accuracy", cv=kf)
# scoring = "accuracy" or "roc_auc"
average_accuracy = sum(accuracies) / len(accuracies)

print(accuracies)
print(average_accuracy)

[0.6124031  0.65891473 0.64341085 0.6744186  0.6328125 ]
0.6443919573643411
