# Load the Data

In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

In [2]:
!pip install psycopg2 --quiet

In [3]:
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor

In [4]:
from os import chdir
# Switch the working directory to the project root so that the relative
# './assets/...' paths used further down resolve.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this directory exists; confirm before reusing in another environment.
chdir('/home/jovyan/madelon/')

In [5]:
# Show the current working directory to confirm the chdir call took effect
# (a bare `pwd` relies on IPython automagic).
pwd

'/home/jovyan/madelon'

In [6]:
import functions.db_helper as db

### Load UCI Madelon Datasets
https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/

In [7]:
# Locations of the raw UCI Madelon files, relative to the working directory.
# Only the training split has a labels file listed here.
madelon_train, madelon_train_labels = (
    './assets/madelon_train.data',
    './assets/madelon_train.labels',
)
madelon_valid, madelon_test = (
    './assets/madelon_valid.data',
    './assets/madelon_test.data',
)

In [8]:
# Read the space-delimited, header-less test features.
# NOTE(review): the preview shows an empty trailing column (500) — presumably
# every line ends with a delimiter, so pandas adds one all-NaN column; confirm.
madelon_test_df = pd.read_csv(madelon_test, delimiter=' ', header=None)
# A bare expression in the middle of a cell is evaluated and discarded, so the
# shape has to be printed explicitly to show up in the output.
print(madelon_test_df.shape)
madelon_test_df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,500
153,473,469,555,480,516,471,477,476,486,489,...,480,500,392,511,531,481,494,502,477,
80,468,532,547,495,533,477,468,478,500,487,...,482,455,630,564,469,472,492,464,472,
521,487,457,485,478,513,478,455,477,479,473,...,480,416,661,507,520,469,517,500,498,
1107,478,460,583,512,568,465,479,476,479,473,...,479,443,398,494,486,481,485,497,497,
1652,482,522,486,487,431,479,452,476,482,483,...,479,482,415,521,502,473,473,441,476,


In [9]:
# Read the space-delimited, header-less training features.
# NOTE(review): the preview shows an empty trailing column (500) — presumably
# every line ends with a delimiter, so pandas adds one all-NaN column; confirm.
madelon_train_df = pd.read_csv(madelon_train, delimiter=' ', header=None)
# A bare expression in the middle of a cell is evaluated and discarded, so the
# shape has to be printed explicitly to show up in the output.
print(madelon_train_df.shape)
madelon_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,500
0,485,477,537,479,452,471,491,476,475,473,...,481,477,485,511,485,481,479,475,496,
1,483,458,460,487,587,475,526,479,485,469,...,478,487,338,513,486,483,492,510,517,
2,487,542,499,468,448,471,442,478,480,477,...,481,492,650,506,501,480,489,499,498,
3,480,491,510,485,495,472,417,474,502,476,...,480,474,572,454,469,475,482,494,461,
4,484,502,528,489,466,481,402,478,487,468,...,479,452,435,486,508,481,504,495,511,


In [10]:
# Read the training labels into a single column named 'target'
# (the preview shows values of -1 and 1).
madelon_train_labels_df = pd.read_csv(madelon_train_labels, delimiter=' ', header=None, names=['target'])
# A bare expression in the middle of a cell is evaluated and discarded, so the
# shape has to be printed explicitly to show up in the output.
print(madelon_train_labels_df.shape)
madelon_train_labels_df.head()

Unnamed: 0,target
0,-1
1,-1
2,-1
3,1
4,1


Concatenate the training labels onto the training features *before* sampling, so that every sampled row keeps its own target (-1 or 1) and we can later check whether it was classified correctly.

In [11]:
# Attach the labels to the features before sampling so each sampled row keeps
# its own target. concat(axis=1) aligns on the index and would silently pad
# with NaN if the two frames differed in length, so check that first.
assert len(madelon_train_df) == len(madelon_train_labels_df), \
    'train features and labels have different row counts'
madelon_train_withlabels = pd.concat([madelon_train_df, madelon_train_labels_df], axis=1)
# Fixed seed: the same 200 rows are drawn on every Restart & Run All, so the
# artifacts derived from this sample are reproducible.
madelon_train_sample200 = madelon_train_withlabels.sample(200, random_state=42)
madelon_train_sample200.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,492,493,494,495,496,497,498,499,500,target
1373,475,515,528,488,511,475,492,477,478,475,...,506,468,477,494,484,450,453,476,,-1
1151,479,469,532,467,536,476,455,475,473,472,...,440,657,500,533,477,484,504,480,,1
553,474,489,495,472,567,475,486,478,493,483,...,535,616,484,487,485,481,427,510,,1
1999,474,493,469,486,521,475,494,479,481,473,...,508,449,463,533,481,489,516,516,,1
846,485,471,483,483,576,477,512,478,512,472,...,485,474,475,485,480,486,472,465,,-1


In [12]:
# Read the space-delimited, header-less validation features.
# NOTE(review): the preview shows an empty trailing column (500) — presumably
# every line ends with a delimiter, so pandas adds one all-NaN column; confirm.
madelon_valid_df = pd.read_csv(madelon_valid, delimiter=' ', header=None)
# A bare expression in the middle of a cell is evaluated and discarded, so the
# shape has to be printed explicitly to show up in the output.
print(madelon_valid_df.shape)
madelon_valid_df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,500
303,481,509,522,461,470,475,507,479,469,471,...,482,445,737,466,541,492,474,503,442,
363,494,447,545,482,452,488,444,478,470,471,...,480,427,593,493,516,483,474,485,459,
385,491,489,515,477,460,476,451,476,457,477,...,474,474,374,578,530,468,480,511,563,
292,474,476,482,479,544,483,470,476,480,478,...,475,474,605,491,489,471,510,546,494,
131,489,420,446,482,447,498,494,477,491,472,...,480,470,662,494,527,480,451,544,506,


In [13]:
# Draw fixed-size subsets of each split. The seed makes the pickled samples
# reproducible across kernel restarts.
test10 = madelon_test_df.sample(180, random_state=42)
# Column 500 is the empty trailing column; 'target' is split off as the label.
# sample(200) on the 200-row frame only shuffles the row order.
train10 = madelon_train_sample200.drop([500, 'target'], axis=1).sample(200, random_state=42)
# Take the labels in the shuffled row order of train10. Reading the column
# straight from madelon_train_sample200 kept the pre-shuffle order, so row i of
# the labels did not describe row i of the features for positional consumers.
train_label10 = madelon_train_sample200.loc[train10.index, 'target']
valid10 = madelon_valid_df.sample(60, random_state=42)
# NOTE(review): test10 and valid10 still carry column 500, which train10 drops;
# confirm that downstream code removes it before fitting/predicting.

In [14]:
# Persist the four sampled objects as pickles under ./assets/ for later use.
_pickle_targets = [
    (test10, './assets/madelon_test_10.p'),
    (train10, './assets/madelon_train_10.p'),
    (train_label10, './assets/madelon_train_label10.p'),
    (valid10, './assets/madelon_valid10.p'),
]
for _obj, _path in _pickle_targets:
    _obj.to_pickle(_path)

### Load `make_classification` Dataset from Josh Cook's Database

In [15]:
# Fetch up to 440 rows of the `madelon` table from the remote Postgres server.
# NOTE(review): the query has no ORDER BY, so which rows come back is up to the
# server; the host address is hard-coded here.
con = pg2.connect(host='34.211.227.227', dbname='postgres', user='postgres')
try:
    # RealDictCursor yields each row as a dict keyed by column name.
    cur = con.cursor(cursor_factory=RealDictCursor)
    cur.execute('SELECT * FROM madelon LIMIT 440;')
    results = cur.fetchall()
finally:
    # Always release the connection, even when the query or fetch raises.
    con.close()

In [16]:
# Build a DataFrame from the fetched rows and preview the first five.
cook_sample = pd.DataFrame(data=results)
cook_sample.iloc[:5]

Unnamed: 0,_id,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_991,feat_992,feat_993,feat_994,feat_995,feat_996,feat_997,feat_998,feat_999,target
0,114800,-0.074348,-1.252552,0.630221,-1.12269,1.139518,0.189537,-0.678695,0.051757,-1.174615,...,-1.282579,-0.458595,0.567019,0.187964,0.046298,-1.950661,-0.995714,0.61775,-0.385676,0
1,114801,0.239157,-1.341087,-1.212999,-0.249455,-0.161141,0.304798,1.433846,1.709565,1.758284,...,-0.681432,-1.108736,1.148655,0.016397,-0.214147,-1.593588,-0.725209,-0.933076,-0.254205,0
2,114802,-0.839076,-0.040565,-1.430879,-0.731206,0.79556,-1.08302,-0.963314,0.8541,-0.720725,...,0.262701,1.850818,0.01763,-0.005701,-0.71804,0.235951,0.476766,0.173116,0.579315,0
3,114803,-1.037762,1.487247,1.957065,-0.276006,1.214798,-0.623143,-0.862611,1.121597,-0.18289,...,-0.184377,-0.521978,0.613497,-0.700089,0.493539,1.083439,-0.678583,0.886047,0.29332,1
4,114804,0.91226,-1.225089,-1.188008,0.82687,0.207841,-1.109184,0.622309,0.664484,1.502021,...,-0.604963,0.265592,2.337011,-1.40473,-0.843114,-0.843866,-0.596798,1.288104,0.110843,0


In [17]:
# (rows, columns) of the fetched sample; the row count is bounded by the
# LIMIT 440 in the query.
cook_sample.shape

(440, 1002)

In [18]:
# Save the fetched sample under ./assets/ as a pickle.
cook_sample.to_pickle('./assets/cook_sample.p')