# Load the Data

In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

In [2]:
!pip install psycopg2 --quiet

In [3]:
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor

In [5]:
pwd

'/home/jovyan/ipynb'

### Load UCI Madelon Datasets
https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/

In [9]:
# Locations of the raw UCI Madelon files, relative to the notebook directory.
# Training inputs (features and labels) first, then the held-out splits.
madelon_train = '../assets/madelon_train.data'
madelon_train_labels = '../assets/madelon_train.labels'
madelon_valid = '../assets/madelon_valid.data'
madelon_test = '../assets/madelon_test.data'

In [11]:
# Test features: space-delimited, no header row. The preview shows an empty
# trailing column (500) -- presumably from a trailing delimiter on each line.
madelon_test_df = pd.read_csv(madelon_test, delimiter=' ', header=None)
# Only the last expression of a cell is displayed, so a bare `.shape` here
# would be evaluated and thrown away; print it explicitly.
print(madelon_test_df.shape)
madelon_test_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,500
0,483,460,559,477,462,491,499,475,495,479,...,480,546,630,434,497,479,503,465,494,
1,484,509,538,473,548,497,551,477,498,471,...,482,493,442,440,587,475,483,509,455,
2,476,529,558,477,505,486,545,477,473,480,...,482,465,389,460,459,484,476,564,521,
3,487,475,480,494,477,472,512,477,486,480,...,476,536,622,523,445,481,490,481,548,
4,486,490,480,471,520,481,437,476,484,482,...,473,474,491,516,515,475,497,453,497,


In [12]:
# Training features: space-separated values with no header row.
madelon_train_df = pd.read_csv(madelon_train, sep=' ', header=None)
# Display (rows, columns) as the cell output.
madelon_train_df.shape

(2000, 501)

In [13]:
# Training labels: one value per line, loaded into a single column named 'target'.
madelon_train_labels_df = pd.read_csv(
    madelon_train_labels,
    sep=' ',
    header=None,
    names=['target'],
)
# Display (rows, columns) as the cell output.
madelon_train_labels_df.shape

(2000, 1)

Concatenate the training labels onto the training features before sampling, so that every sampled row keeps its own label (-1 or 1). This lets us check later whether each sampled row was classified correctly.

##### Get 3 samples of 10% of each of the Madelon and Cook `make_classification` datasets

In [14]:
# Attach the labels to the features column-wise so each sampled row carries
# its own target value.
madelon_train_withlabels = pd.concat([madelon_train_df, madelon_train_labels_df], axis=1)
# First of three 200-row samples. A fixed seed makes the draw reproducible
# when the notebook is restarted and re-run.
madelon_train_sample200_1 = madelon_train_withlabels.sample(n=200, random_state=1)
madelon_train_sample200_1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,492,493,494,495,496,497,498,499,500,target
1239,492,526,464,476,500,479,445,475,488,467,...,489,456,487,486,484,467,507,469,,1
1252,479,488,615,478,476,485,471,476,496,475,...,472,287,479,493,473,465,496,484,,-1
988,474,505,502,469,494,477,409,478,500,485,...,497,482,482,521,482,482,469,456,,-1
457,493,499,534,491,489,480,530,479,487,473,...,496,383,477,486,491,472,549,499,,1
886,476,518,504,486,502,478,460,474,486,480,...,475,682,506,489,486,468,505,501,,1


In [15]:
# Second of three 200-row samples. `madelon_train_withlabels` was already
# built in the first sampling cell and is not modified in between, so it is
# reused here instead of being concatenated again.
# A fixed seed (different from the other samples) makes the draw reproducible.
madelon_train_sample200_2 = madelon_train_withlabels.sample(n=200, random_state=2)
madelon_train_sample200_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,492,493,494,495,496,497,498,499,500,target
1562,488,481,512,481,452,470,588,476,477,473,...,471,584,499,521,471,483,518,516,,-1
1723,482,484,496,495,502,474,406,477,496,471,...,493,491,462,520,487,482,492,495,,-1
881,488,435,488,488,604,493,504,479,494,480,...,444,249,468,502,478,490,532,488,,-1
1735,482,510,485,489,458,476,470,476,469,477,...,482,563,480,505,476,463,455,533,,-1
802,480,507,371,480,470,480,444,476,486,488,...,425,575,539,608,476,490,579,501,,1


In [16]:
# Third of three 200-row samples. `madelon_train_withlabels` was already
# built in the first sampling cell and is not modified in between, so it is
# reused here instead of being concatenated again.
# A fixed seed (different from the other samples) makes the draw reproducible.
madelon_train_sample200_3 = madelon_train_withlabels.sample(n=200, random_state=3)
madelon_train_sample200_3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,492,493,494,495,496,497,498,499,500,target
1742,492,430,459,490,551,479,503,478,487,467,...,507,339,493,588,491,486,460,495,,1
976,486,483,539,482,501,479,512,478,494,488,...,467,501,454,532,482,435,457,515,,-1
349,480,488,498,481,481,476,445,477,465,472,...,496,517,494,513,478,468,440,509,,1
1409,488,480,524,479,547,479,443,475,499,488,...,482,554,561,551,481,471,531,490,,1
1076,486,411,510,477,507,480,527,477,493,483,...,499,639,465,522,482,508,554,507,,-1


In [17]:
# Stack the three 200-row training samples row-wise into one frame.
# The samples were drawn independently from the same frame, so a given
# training row can appear more than once in the result.
madelon_data_to_concat = [
    madelon_train_sample200_1,
    madelon_train_sample200_2,
    madelon_train_sample200_3,
]
madelon_total_samples = pd.concat(objs=madelon_data_to_concat, axis=0)

In [18]:
# Validation features: space-delimited, no header row. The preview shows an
# empty trailing column (500) -- presumably from a trailing delimiter on each line.
madelon_valid_df = pd.read_csv(madelon_valid, delimiter=' ', header=None)
# Only the last expression of a cell is displayed, so a bare `.shape` here
# would be evaluated and thrown away; print it explicitly.
print(madelon_valid_df.shape)
madelon_valid_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,500
0,483,454,513,495,523,469,453,477,506,479,...,480,543,259,413,520,485,498,523,510,
1,485,508,493,487,478,472,504,476,479,475,...,480,535,534,514,452,484,495,548,477,
2,483,521,507,475,493,486,421,475,496,483,...,476,498,495,508,528,486,465,508,503,
3,474,504,576,480,553,483,524,478,483,483,...,475,470,463,509,525,479,467,552,517,
4,495,474,523,479,495,488,485,476,497,478,...,471,522,343,509,520,475,493,506,491,


In [19]:
# Fixed seeds keep the persisted samples reproducible across Restart & Run All.
# NOTE(review): 180 and 60 rows are presumably 10% of the test and validation
# sets respectively -- confirm against the full frame shapes.
madelon_sample_test = madelon_test_df.sample(n=180, random_state=42)

# Split the pooled training sample into features and labels. Column 500 is
# the trailing column that appears empty in the previews.
madelon_sample_train = madelon_total_samples.drop([500, 'target'], axis=1)
madelon_sample_train_labels = madelon_total_samples['target']

# NOTE(review): column 500 is removed from the training features only; the
# test and validation samples still contain it. Confirm that downstream
# notebooks drop it before fitting/predicting, otherwise the feature sets
# will not line up.
madelon_sample_valid = madelon_valid_df.sample(n=60, random_state=42)

In [21]:
# Persist each Madelon sample to its own pickle file, in the same order as before:
# test sample, training features, training labels, validation sample.
madelon_pickle_targets = [
    (madelon_sample_test, '../assets/pickled_samples/madelon_test_10.p'),
    (madelon_sample_train, '../assets/pickled_samples/madelon_sample_train.p'),
    (madelon_sample_train_labels, '../assets/pickled_samples/madelon_sample_train_labels.p'),
    (madelon_sample_valid, '../assets/pickled_samples/madelon_valid10.p'),
]
for sample_obj, pickle_path in madelon_pickle_targets:
    sample_obj.to_pickle(pickle_path)

### Load `make_classification` Dataset from Josh Cook's Database

In [22]:
# Pull a random 2200-row sample from the remote `madelon` table.
# `ORDER BY RANDOM()` means every execution returns a different sample.
# NOTE(review): host and user are hard-coded and no password is passed;
# consider moving connection settings to configuration/environment variables.
con = pg2.connect(host='34.211.227.227', dbname='postgres', user='postgres')
try:
    # RealDictCursor returns each row as a dict keyed by column name.
    # The `with` block closes the cursor once the rows have been fetched.
    with con.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute('SELECT * FROM madelon ORDER BY RANDOM() LIMIT 2200;')
        results1 = cur.fetchall()
finally:
    # Always release the server connection, even if the query fails.
    con.close()

In [23]:
# Build a frame from the fetched rows; the rows are dict-like (RealDictCursor),
# so the column names come straight from the query result.
cook_sample1 = pd.DataFrame(data=results1)
cook_sample1.head(n=5)

Unnamed: 0,_id,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_991,feat_992,feat_993,feat_994,feat_995,feat_996,feat_997,feat_998,feat_999,target
0,35870,0.371429,1.557665,0.088738,-0.568426,0.279411,0.553009,-2.263165,0.322735,-0.770665,...,-0.006892,0.50468,-1.534529,0.120796,1.48537,0.48177,0.499728,-2.03225,-0.245348,1
1,92973,-0.04449,-1.042972,0.24599,-0.187297,1.355108,-0.086559,-0.409865,-0.156749,-0.388948,...,-0.363377,-0.313348,-0.518927,-0.515884,0.345514,-0.140779,-0.113291,1.16173,-0.616847,0
2,56479,0.043489,0.440636,0.047697,-1.933592,1.219459,0.182951,-2.266318,-0.721327,0.510775,...,-1.811937,0.56398,1.152395,-0.060437,-0.723646,0.529993,1.599007,-0.053172,0.581611,0
3,119182,0.567497,0.313351,0.74473,-1.350541,-0.068053,0.58899,0.29012,0.229521,0.848661,...,0.308947,-0.31562,-0.703337,0.499579,1.128113,-0.067667,0.086741,-0.861531,0.361497,1
4,34782,-0.111258,-0.682838,-0.561106,-0.38156,0.54049,-1.115611,-1.473109,0.728398,-0.267222,...,1.522104,-0.325736,0.967692,0.919213,2.192081,-0.672942,-0.325374,-0.782994,0.662257,1


In [24]:
# Pull a second, independent random 2200-row sample from the remote `madelon` table.
# `ORDER BY RANDOM()` means every execution returns a different sample.
# NOTE(review): host and user are hard-coded and no password is passed;
# consider moving connection settings to configuration/environment variables.
con = pg2.connect(host='34.211.227.227', dbname='postgres', user='postgres')
try:
    # RealDictCursor returns each row as a dict keyed by column name.
    # The `with` block closes the cursor once the rows have been fetched.
    with con.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute('SELECT * FROM madelon ORDER BY RANDOM() LIMIT 2200;')
        results2 = cur.fetchall()
finally:
    # Always release the server connection, even if the query fails.
    con.close()

In [25]:
# Build a frame from the fetched rows; the rows are dict-like (RealDictCursor),
# so the column names come straight from the query result.
cook_sample2 = pd.DataFrame(data=results2)
cook_sample2.head(n=5)

Unnamed: 0,_id,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_991,feat_992,feat_993,feat_994,feat_995,feat_996,feat_997,feat_998,feat_999,target
0,188089,-0.853696,-0.736593,0.411037,1.798703,0.86677,0.835296,1.313711,-0.879634,-1.027186,...,-0.120383,0.988193,0.715386,-1.154979,0.824831,0.547006,1.855144,1.590095,-0.821332,0
1,85615,-0.687035,-1.647364,-0.101136,0.683398,-1.196019,1.15481,-0.900697,1.190874,1.2496,...,0.168756,-0.20582,-0.26222,-0.078427,0.554429,-2.013667,-0.941555,1.39322,-0.691762,1
2,82789,-0.966237,-0.409867,-1.135432,0.673216,-0.840924,0.995096,-0.042792,-0.307372,0.17518,...,-0.880003,1.195717,-0.332533,0.013611,0.889108,-0.785001,-0.048285,-2.592333,-1.072066,1
3,665,0.696994,-0.709434,2.359885,-1.918594,0.947035,1.273983,-0.922619,0.485947,-0.890566,...,1.803296,0.387747,-1.337901,1.223919,-0.767111,-1.060457,0.589672,0.87236,0.139395,1
4,83328,-1.087548,-0.499606,1.500752,0.42234,0.41161,0.750327,0.007554,-1.087686,-1.342041,...,0.288257,0.934479,-0.484844,1.713115,0.547932,-1.349371,2.424314,3.1078,0.881678,0


In [26]:
# Pull a third, independent random 2200-row sample from the remote `madelon` table.
# `ORDER BY RANDOM()` means every execution returns a different sample.
# NOTE(review): host and user are hard-coded and no password is passed;
# consider moving connection settings to configuration/environment variables.
con = pg2.connect(host='34.211.227.227', dbname='postgres', user='postgres')
try:
    # RealDictCursor returns each row as a dict keyed by column name.
    # The `with` block closes the cursor once the rows have been fetched.
    with con.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute('SELECT * FROM madelon ORDER BY RANDOM() LIMIT 2200;')
        results3 = cur.fetchall()
finally:
    # Always release the server connection, even if the query fails.
    con.close()

In [27]:
# Build a frame from the fetched rows; the rows are dict-like (RealDictCursor),
# so the column names come straight from the query result.
cook_sample3 = pd.DataFrame(data=results3)
cook_sample3.head(n=5)

Unnamed: 0,_id,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_991,feat_992,feat_993,feat_994,feat_995,feat_996,feat_997,feat_998,feat_999,target
0,69225,-0.695779,-0.444988,0.357418,-0.406618,1.09639,0.976347,1.007848,0.068685,0.792589,...,0.520603,1.975369,-0.646134,2.062182,0.330405,-1.266178,0.520335,0.526839,-0.349781,1
1,146372,-0.41643,-1.543888,-0.33889,0.070952,-0.352189,-1.310727,-0.393519,1.740216,-0.809619,...,-0.313796,0.522306,0.155355,0.642186,0.29321,1.530777,-0.870335,-1.338096,-0.36237,0
2,111034,-0.09003,-1.317241,-0.598129,-1.665228,-0.310629,0.419445,-0.730926,0.410461,-0.315639,...,0.395633,1.576831,0.883465,0.525049,-0.684231,0.02752,0.463564,-0.052154,1.069773,0
3,81624,0.423259,0.713292,1.339567,0.616029,-0.237246,-1.550371,0.951752,-0.93876,0.653387,...,-1.696794,-0.700529,-0.568613,0.266188,0.379904,-0.267126,0.995271,0.587576,0.350833,0
4,19683,0.941942,-1.060337,1.597402,1.938867,0.312038,0.107562,1.512232,0.477188,0.745874,...,1.437815,0.112347,-0.704987,0.696009,-0.586341,-0.109626,-0.712362,-0.602433,-0.652868,1


In [28]:
# Stack the three database samples row-wise into one frame.
samples_to_concat = [cook_sample1, cook_sample2, cook_sample3]
# Each sample frame carries its own default 0-based index, so a plain concat
# would repeat every index label three times. `ignore_index=True` gives the
# combined frame a fresh, unique index.
# NOTE(review): the samples were drawn by independent queries, so the same
# `_id` may appear in more than one of them -- confirm whether duplicates matter.
cook_total_samples = pd.concat(samples_to_concat, ignore_index=True)

In [30]:
# Persist the combined database sample to a pickle file.
cook_pickle_path = '../assets/pickled_samples/cook_total_samples.p'
cook_total_samples.to_pickle(cook_pickle_path)