# Demo PGMs

This notebook is an example of how to use the Bayesian models and the discretization processes. The dataset is a part of the AVA dataset and the features are from the phog.

## A bit of set up

We need numpy and pandas for data. Pickle and gzip for read the extracted features. Our folder with the code of our functions.

In [1]:
# set up Python environment: numpy for numerical routines
import numpy as np
import pandas as pd

# for store the results
from six.moves import cPickle as pickle
import gzip

# first, we add the folder with the code to the pythonpath
import sys
sys.path.append('../pycode/')

# we can import a file from the pycode
import utilsData
# or go inside a folder with from
from preprocess import utilities

# even we can select only the functions we need from a file
from preprocess.mdl import MDL_method
from preprocess.unsupervised import Unsupervised_method
from models.nb import Naive_Bayes
from models.aode_fast import AODE_fast

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

## Now the data

In [2]:
# First we load the features
features_file = '../features/AVA/PHOG/10_bins20_levels1_angle360_redux.arff'
features = utilsData.readARFF(features_file)
features['id'] = features['id'].astype(int)

# we take the name of the features and delete de ID
features_names = np.array(features.columns)
index = np.argwhere(features_names=='id')
features_names = np.delete(features_names, index)

# And now the class and other information
data = pickle.load(gzip.open('../packages/AVA_info.pklz','rb',2))

# and merge all the information by the id
data=data.merge(features, on='id', copy=False)
num_images = data.shape[0]

# to free space
del features

In [3]:
# we only need the features and the class
data_aux = data[np.append(features_names,['Class'])]

# and change the numerical class to categorical
data_aux['Class'] = pd.Categorical(data_aux['Class'],[0,1])
data_aux['Class'].cat.categories=['Snapshot','Professional Shot']

# to free space
del data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
data_aux

Unnamed: 0,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10,...,var92,var93,var94,var95,var96,var97,var98,var99,var100,Class
0,0.024859,0.019595,0.025479,0.019152,0.024879,0.044237,0.014605,0.021239,0.021766,0.044068,...,0.002261,0.002495,0.003200,0.006480,0.005550,0.003260,0.002899,0.003092,0.002915,Snapshot
1,0.017954,0.017298,0.019015,0.011463,0.012214,0.078352,0.015601,0.027636,0.024860,0.021917,...,0.006609,0.006342,0.004547,0.003864,0.003626,0.004761,0.006235,0.005235,0.024966,Snapshot
2,0.010668,0.012657,0.015845,0.019103,0.030681,0.044511,0.030432,0.024963,0.018578,0.014926,...,0.003201,0.005084,0.007924,0.016981,0.028977,0.017653,0.009368,0.006688,0.004389,Snapshot
3,0.019313,0.018857,0.017418,0.015704,0.011385,0.014184,0.012291,0.049031,0.075157,0.042130,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Professional Shot
4,0.058041,0.021004,0.025309,0.015239,0.023751,0.015093,0.016500,0.018322,0.019268,0.033029,...,0.007309,0.007081,0.003782,0.003287,0.002867,0.003312,0.003499,0.002475,0.025802,Professional Shot
5,0.074201,0.013060,0.011389,0.013209,0.016832,0.020902,0.017412,0.015080,0.017821,0.033396,...,0.004451,0.003772,0.004422,0.006955,0.007336,0.008979,0.005932,0.005805,0.016598,Professional Shot
6,0.027557,0.031744,0.037061,0.024356,0.020297,0.020910,0.021408,0.028417,0.026356,0.027753,...,0.002496,0.002764,0.001797,0.001513,0.001188,0.001903,0.002172,0.002150,0.003209,Professional Shot
7,0.021934,0.021994,0.021960,0.017221,0.024247,0.031919,0.025094,0.029589,0.029577,0.034109,...,0.002369,0.003085,0.002209,0.004121,0.005949,0.008426,0.005515,0.003856,0.005331,Professional Shot
8,0.021607,0.023446,0.024681,0.022594,0.020822,0.023880,0.033742,0.036366,0.028725,0.023992,...,0.004596,0.006244,0.006018,0.007221,0.008162,0.008775,0.011224,0.009084,0.008106,Professional Shot
9,0.025701,0.025206,0.026513,0.020220,0.020968,0.028469,0.026803,0.030503,0.026362,0.029458,...,0.005880,0.005066,0.006673,0.008562,0.010024,0.012716,0.011055,0.009596,0.019143,Professional Shot


## Discretization and model training. Example 1.
First, Naive Bayes with frequency discretization.

In [5]:
# we create a train and a test partition.
np.random.seed(1000)
num_folds = 2
folds = np.random.choice(range(0,num_images),replace=False,size=(num_folds,int(num_images/num_folds)))

train_indexes = folds[0]
test_indexes = folds[1]

### Train

In [6]:
data_aux.loc[train_indexes]

Unnamed: 0,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10,...,var92,var93,var94,var95,var96,var97,var98,var99,var100,Class
94,0.022409,0.024159,0.021843,0.026759,0.026157,0.026169,0.017016,0.018121,0.021250,0.034788,...,0.003019,0.003600,0.002648,0.009703,0.008623,0.005445,0.004732,0.005653,0.007829,Snapshot
185,0.019551,0.013689,0.018541,0.025747,0.038417,0.041971,0.028114,0.020738,0.018258,0.018981,...,0.002034,0.004788,0.004949,0.006400,0.006728,0.007268,0.004522,0.003362,0.004115,Snapshot
137,0.006819,0.007831,0.012187,0.022136,0.052152,0.077821,0.045110,0.024225,0.015012,0.011856,...,0.003789,0.004827,0.009645,0.030005,0.038927,0.016245,0.007100,0.006193,0.004881,Snapshot
114,0.019617,0.021057,0.028046,0.025630,0.024039,0.028691,0.025611,0.029819,0.024222,0.024338,...,0.007905,0.007930,0.007113,0.007707,0.008780,0.014712,0.018593,0.012903,0.010321,Professional Shot
65,0.031341,0.025241,0.029597,0.020404,0.018675,0.046740,0.023837,0.030169,0.022020,0.034552,...,0.000788,0.000615,0.000757,0.001400,0.003137,0.005309,0.009826,0.005748,0.016049,Professional Shot
206,0.028199,0.037143,0.037110,0.023698,0.028026,0.039400,0.010250,0.013455,0.015945,0.033643,...,0.000548,0.001015,0.001474,0.001396,0.000888,0.000455,0.000293,0.000442,0.013202,Professional Shot
297,0.022042,0.020248,0.033946,0.029960,0.031155,0.033730,0.024064,0.034239,0.023474,0.022281,...,0.002881,0.003553,0.003548,0.004419,0.005511,0.006234,0.009462,0.006354,0.008368,Professional Shot
32,0.016225,0.023175,0.021835,0.018803,0.017374,0.026112,0.027640,0.030784,0.029800,0.027276,...,0.013111,0.009375,0.004100,0.004204,0.002761,0.002731,0.003904,0.004137,0.004934,Professional Shot
117,0.027240,0.018284,0.017561,0.014363,0.013584,0.015796,0.018915,0.031167,0.025040,0.042970,...,0.006298,0.006707,0.007289,0.007668,0.007808,0.007124,0.008000,0.014588,0.028697,Professional Shot
266,0.021741,0.018668,0.023024,0.022296,0.023999,0.030828,0.017469,0.031586,0.027131,0.028539,...,0.002463,0.002605,0.003491,0.004232,0.004305,0.001181,0.001163,0.001624,0.002466,Professional Shot


### Test

In [None]:
data_aux.loc[test_indexes]

Unnamed: 0,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10,...,var92,var93,var94,var95,var96,var97,var98,var99,var100,Class
106,0.016836,0.017975,0.025265,0.024042,0.023136,0.028257,0.021234,0.019216,0.016720,0.017054,...,0.007609,0.010205,0.008213,0.010880,0.018100,0.013258,0.017306,0.005858,0.006388,Snapshot
161,0.020446,0.009912,0.013934,0.015126,0.036286,0.040411,0.021400,0.032260,0.010673,0.071236,...,0.003013,0.002748,0.001928,0.001996,0.001640,0.002171,0.002496,0.001425,0.008663,Professional Shot
98,0.016691,0.018639,0.021730,0.023579,0.023552,0.029275,0.027382,0.026985,0.022448,0.021689,...,0.007182,0.008185,0.011209,0.012250,0.011979,0.012812,0.011017,0.007915,0.006410,Professional Shot
232,0.007374,0.008974,0.022290,0.066117,0.004749,0.003487,0.003748,0.028567,0.085603,0.010885,...,0.006324,0.009736,0.014465,0.002068,0.001317,0.001941,0.009275,0.034722,0.002699,Snapshot
229,0.026782,0.022453,0.018238,0.017479,0.018979,0.018238,0.022472,0.029364,0.034349,0.040112,...,0.004388,0.003358,0.004562,0.009543,0.009433,0.006743,0.013934,0.020625,0.009039,Professional Shot
227,0.046599,0.009328,0.014800,0.016375,0.019489,0.023608,0.023000,0.022659,0.012597,0.044902,...,0.002459,0.003442,0.008758,0.008977,0.008104,0.008037,0.009431,0.004246,0.012721,Snapshot
252,0.031492,0.023189,0.026866,0.023691,0.027600,0.028513,0.021443,0.025613,0.023106,0.032710,...,0.003683,0.004015,0.004073,0.005860,0.004918,0.004214,0.004509,0.005245,0.005880,Snapshot
300,0.024037,0.017274,0.018690,0.020508,0.034531,0.033698,0.022858,0.024216,0.023805,0.039024,...,0.001129,0.001488,0.001929,0.003951,0.003622,0.002000,0.002696,0.002821,0.002456,Snapshot
86,0.020565,0.028704,0.035469,0.025625,0.020148,0.021145,0.025106,0.028031,0.022065,0.021084,...,0.005779,0.006759,0.006264,0.006857,0.005741,0.006866,0.009331,0.006521,0.006763,Professional Shot
169,0.017723,0.017082,0.018403,0.017705,0.018277,0.019907,0.024424,0.035481,0.030549,0.027630,...,0.003506,0.004823,0.004152,0.003349,0.003482,0.004963,0.005628,0.004261,0.003310,Professional Shot


In [None]:
# we define and configure the discretization method
discretization = Unsupervised_method()
discretization.frequency = True
discretization.bins = 5

# and use the train indexes for learn the cuts
discretization.train(data_aux.loc[train_indexes])

# then we applied the cuts to all the dataset
data_discrete = discretization.process(data_aux)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data[to_change] = k


In [None]:
data_discrete

In [None]:
# we can check the new 'var1' categories
data_discrete['var1'].cat.categories

In [None]:
# and the codes or values
data_discrete['var1'].cat.codes

In [None]:
# it is the turn of the Naive Bayes
# first we create the object
model = Naive_Bayes()
# and then, we train the model with train indexes
model.fit(data_discrete.loc[train_indexes])

In [None]:
# Finally, we can obtain the probabilities of the test
model.predict_probs(data_discrete.loc[test_indexes])

In [None]:
# Or the class
model.predict_class(data_discrete.loc[test_indexes])

## Discretization and model training. Example 2.
In this case, the discretization process is Fayyad_Irani and the model is a fast implementation of AODE for python.

In [None]:
# we use the same train and a test partition.
np.random.seed(1000)
num_folds = 2
folds = np.random.choice(range(0,num_images),replace=False,size=(num_folds,int(num_images/num_folds)))

train_indexes = folds[0]
test_indexes = folds[1]

In [None]:
# we define the discretization method (this method does not work, when I solved it, I update the notebook)
#discretization = MDL_method()
discretization = Unsupervised_method()
discretization.frequency = True
discretization.bins = 5

# and use the train indexes for learn the cuts
discretization.train(data_aux.loc[train_indexes])

# then we applied the cuts to all the dataset
data_discrete = discretization.process(data_aux)

In [None]:
# it is the turn of the AODE
# first we create the object
model = AODE_fast()
# and then, we train the model with train indexes
model.fit(data_discrete.loc[train_indexes])

In [None]:
# Finally, we can obtain the probabilities of the test
model.predict_probs(data_discrete.loc[test_indexes])

In [None]:
# Or the class
model.predict_class(data_discrete.loc[test_indexes])