# Load the Data

In [2]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

In [3]:
!pip install psycopg2 --quiet

In [4]:
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor

In [5]:
from os import chdir

# Work from the project directory: the relative './assets/...' paths used by
# the later cells are resolved against the current working directory.
PROJECT_ROOT = '/home/jovyan/madelon/'
chdir(PROJECT_ROOT)

In [6]:
# Show the current working directory to confirm the chdir took effect
# (a bare `pwd` is IPython automagic for `%pwd`).
pwd

'/home/jovyan/madelon'

In [7]:
import functions.db_helper as db

### Load UCI Madelon Datasets
https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/

In [8]:
# Locations of the raw UCI Madelon files, relative to the working directory.
# Training split: feature matrix and its label file.
madelon_train, madelon_train_labels = (
    './assets/madelon_train.data',
    './assets/madelon_train.labels',
)
# Validation and test splits: feature matrices only.
madelon_valid, madelon_test = (
    './assets/madelon_valid.data',
    './assets/madelon_test.data',
)

In [14]:
# Read the Madelon test features as a space-delimited table without a header
# row, so the columns are numbered 0..N.
# NOTE(review): the preview shows column 500 empty - presumably each line of
# the raw file ends with a trailing space, which `delimiter=' '` parses as one
# extra all-NaN column. Confirm before using column 500 as a feature.
madelon_test_df = pd.read_csv(madelon_test, delimiter=' ', header=None)
# Only the last expression of a cell is displayed, so a bare `.shape` here
# would be silently discarded; print it explicitly.
print(madelon_test_df.shape)
madelon_test_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,500
0,483,460,559,477,462,491,499,475,495,479,...,480,546,630,434,497,479,503,465,494,
1,484,509,538,473,548,497,551,477,498,471,...,482,493,442,440,587,475,483,509,455,
2,476,529,558,477,505,486,545,477,473,480,...,482,465,389,460,459,484,476,564,521,
3,487,475,480,494,477,472,512,477,486,480,...,476,536,622,523,445,481,490,481,548,
4,486,490,480,471,520,481,437,476,484,482,...,473,474,491,516,515,475,497,453,497,


In [10]:
# Read the Madelon training features: space-delimited, no header row.
madelon_train_df = pd.read_csv(
    madelon_train,
    header=None,
    delimiter=' ',
)
# Cell result: (number of rows, number of columns).
madelon_train_df.shape

(2000, 501)

In [11]:
# Read the training labels into a single column named 'target'.
madelon_train_labels_df = pd.read_csv(
    madelon_train_labels,
    names=['target'],
    header=None,
    delimiter=' ',
)
# Cell result: (number of rows, number of columns).
madelon_train_labels_df.shape

(2000, 1)

Concatenate the training labels with the training features *before* sampling, so that every sampled row keeps its own label (-1 or 1) and we can later check whether it was classified correctly.

In [12]:
# Attach the labels column-wise before sampling so that every sampled row
# carries its own 'target' value.
# concat(axis=1) aligns on the index, so a row-count mismatch would be padded
# with NaN instead of raising; check it up front.
assert len(madelon_train_df) == len(madelon_train_labels_df), \
    'training features and labels have different row counts'
madelon_train_withlabels = pd.concat([madelon_train_df, madelon_train_labels_df], axis=1)
# Fixed seed so the same 200 rows are drawn on every run (Restart & Run All
# reproduces the sample and everything derived from it).
madelon_train_sample200 = madelon_train_withlabels.sample(200, random_state=42)
madelon_train_sample200.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,492,493,494,495,496,497,498,499,500,target
570,484,434,502,474,526,475,527,478,498,481,...,523,602,537,510,481,478,520,503,,-1
878,471,507,519,478,485,486,447,477,520,478,...,479,398,498,603,474,479,500,488,,1
941,486,505,491,489,485,474,486,476,494,476,...,494,429,469,489,478,490,525,463,,-1
1952,484,428,531,498,516,481,482,478,480,488,...,500,452,550,473,474,483,535,508,,1
800,483,464,452,472,492,476,543,477,478,488,...,476,421,590,529,474,492,460,481,,1


In [15]:
# Read the Madelon validation features as a space-delimited table without a
# header row, so the columns are numbered 0..N.
# NOTE(review): the preview shows column 500 empty - presumably each line of
# the raw file ends with a trailing space, which `delimiter=' '` parses as one
# extra all-NaN column. Confirm before using column 500 as a feature.
madelon_valid_df = pd.read_csv(madelon_valid, delimiter=' ', header=None)
# Only the last expression of a cell is displayed, so a bare `.shape` here
# would be silently discarded; print it explicitly.
print(madelon_valid_df.shape)
madelon_valid_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,500
0,483,454,513,495,523,469,453,477,506,479,...,480,543,259,413,520,485,498,523,510,
1,485,508,493,487,478,472,504,476,479,475,...,480,535,534,514,452,484,495,548,477,
2,483,521,507,475,493,486,421,475,496,483,...,476,498,495,508,528,486,465,508,503,
3,474,504,576,480,553,483,524,478,483,483,...,475,470,463,509,525,479,467,552,517,
4,495,474,523,479,495,488,485,476,497,478,...,471,522,343,509,520,475,493,506,491,


In [None]:
# Build the reduced working sets that are pickled in the next cell.
# A fixed random_state makes the draws reproducible, so re-running the
# notebook does not overwrite the pickles with different rows.
test10 = madelon_test_df.sample(180, random_state=42)
# Column 500 is the empty trailing column from parsing the raw file and
# 'target' is the label; neither belongs in the feature matrix.
train10 = madelon_train_sample200.drop([500, 'target'], axis=1)
train_label10 = madelon_train_sample200['target']
valid10 = madelon_valid_df.sample(60, random_state=42)
# NOTE(review): test10 and valid10 still contain the empty column 500 that
# train10 drops. Left as-is so the pickled schema is unchanged - confirm that
# downstream code removes it before fitting/predicting.

In [None]:
# Persist each sampled object to its own pickle under ./assets.
# NOTE: pickles should only ever be loaded back from trusted locations.
for frame, pickle_path in (
    (test10, './assets/madelon_test_10.p'),
    (train10, './assets/madelon_train_10.p'),
    (train_label10, './assets/madelon_train_label10.p'),
    (valid10, './assets/madelon_valid10.p'),
):
    frame.to_pickle(pickle_path)

### Load `make_classification` Dataset from Josh Cook's Database

In [None]:
# Fetch 440 rows of the `madelon` table from the remote PostgreSQL server.
con = pg2.connect(host='34.211.227.227', dbname='postgres', user='postgres')
try:
    # RealDictCursor returns each row as a dict keyed by column name.
    cur = con.cursor(cursor_factory=RealDictCursor)
    # NOTE(review): there is no ORDER BY, so which 440 rows come back is up to
    # the server and is not guaranteed to be stable between runs.
    cur.execute('SELECT * FROM madelon LIMIT 440;')
    results = cur.fetchall()
finally:
    # Always release the connection, even if the query or fetch raises.
    con.close()

In [None]:
# Run the same 440-row query again on a fresh connection, keeping the rows
# in `results2`.
con = pg2.connect(host='34.211.227.227', dbname='postgres', user='postgres')
try:
    # RealDictCursor returns each row as a dict keyed by column name.
    cur = con.cursor(cursor_factory=RealDictCursor)
    # NOTE(review): there is no ORDER BY, so this fetch is not guaranteed to
    # return the same rows as any other run of the query.
    cur.execute('SELECT * FROM madelon LIMIT 440;')
    results2 = cur.fetchall()
finally:
    # Always release the connection, even if the query or fetch raises.
    con.close()

In [16]:
# The rows were fetched through a RealDictCursor, i.e. one dict per row keyed
# by column name, so the column names become the DataFrame columns.
cook_sample = pd.DataFrame(data=results)
# Preview the first five rows.
cook_sample.head(n=5)

Unnamed: 0,_id,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_991,feat_992,feat_993,feat_994,feat_995,feat_996,feat_997,feat_998,feat_999,target
0,130288,-1.558938,0.559621,0.934527,0.192429,0.130179,-0.125496,0.133544,0.196655,-0.090318,...,-0.660494,-0.820007,-0.242569,-0.459709,-0.063038,0.621087,0.551693,0.124573,1.623202,0
1,130289,-0.368301,0.842603,-0.334257,0.397181,1.552486,0.405636,0.646928,0.603076,0.199934,...,1.263449,-1.323903,1.215217,1.482974,1.071561,1.08985,-0.658328,-0.258123,-0.021302,0
2,130290,-1.250626,0.585656,1.212785,-0.068488,-0.841297,-1.296333,1.240742,-0.164829,-0.782004,...,-0.035905,-0.435833,-2.916269,1.37032,0.032917,0.369725,-1.793411,0.82209,0.545751,1
3,130291,-0.396239,0.795522,-0.827189,-0.93658,-0.323504,-0.70788,0.525449,-0.761152,-0.73507,...,1.227725,0.086881,-0.668349,1.896989,0.138205,1.990027,-0.026086,-0.181649,-0.127934,0
4,130292,-0.010238,-0.656922,0.387079,-1.201123,-1.597871,-0.077609,0.358021,-0.906993,0.571607,...,-1.9317,-0.843277,-0.819254,-2.191828,-1.119522,-0.194287,0.572806,0.173104,-0.140039,1


In [17]:
# Sanity check: (rows, columns) of the fetched sample; the query was limited to 440 rows.
cook_sample.shape

(440, 1002)

In [18]:
# Cache the fetched sample locally under ./assets as a pickle.
cook_sample.to_pickle('./assets/cook_sample.p')