# Demo PGMs

This notebook is an example of how to use the Bayesian models and the discretization processes. The dataset is a part of the AVA dataset and the features are from the phog.

## A bit of set up

We need numpy and pandas for data. Pickle and gzip for read the extracted features. Our folder with the code of our functions.

In [1]:
# set up Python environment: numpy for numerical routines
import numpy as np
import pandas as pd

# for store the results
from six.moves import cPickle as pickle
import gzip

# first, we add the folder with the code to the pythonpath
import sys
sys.path.append('../pycode/')

# we can import a file from the pycode
import utilsData
# or go inside a folder with from
from preprocess import utilities

# even we can select only the functions we need from a file
from preprocess.mdl import MDL_method
from preprocess.unsupervised import Unsupervised_method
from models.nb import Naive_Bayes
from models.aode_fast import AODE_fast

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

## Now the data

In [2]:
# First we load the features
features_file = '../features/AVA/PHOG/10_bins20_levels1_angle360_redux.arff'
features = utilsData.readARFF(features_file)
features['id'] = features['id'].astype(int)

# we take the name of the features and delete de ID
features_names = np.array(features.columns)
index = np.argwhere(features_names=='id')
features_names = np.delete(features_names, index)

# And now the class and other information
data = pickle.load(gzip.open('../packages/AVA_info.pklz','rb',2))

# and merge all the information by the id
data=data.merge(features, on='id', copy=False)
num_images = data.shape[0]

# to free space
del features

In [3]:
# we only need the features and the class
data_aux = data[np.append(features_names,['Class'])]

# and change the numerical class to categorical
data_aux['Class'] = pd.Categorical(data_aux['Class'],[0,1])
data_aux['Class'].cat.categories=['Snapshot','Professional Shot']

# to free space
del data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
data_aux.iloc[0:10,-10:]

Unnamed: 0,var92,var93,var94,var95,var96,var97,var98,var99,var100,Class
0,0.002261,0.002495,0.0032,0.00648,0.00555,0.00326,0.002899,0.003092,0.002915,Snapshot
1,0.006609,0.006342,0.004547,0.003864,0.003626,0.004761,0.006235,0.005235,0.024966,Snapshot
2,0.003201,0.005084,0.007924,0.016981,0.028977,0.017653,0.009368,0.006688,0.004389,Snapshot
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Professional Shot
4,0.007309,0.007081,0.003782,0.003287,0.002867,0.003312,0.003499,0.002475,0.025802,Professional Shot
5,0.004451,0.003772,0.004422,0.006955,0.007336,0.008979,0.005932,0.005805,0.016598,Professional Shot
6,0.002496,0.002764,0.001797,0.001513,0.001188,0.001903,0.002172,0.00215,0.003209,Professional Shot
7,0.002369,0.003085,0.002209,0.004121,0.005949,0.008426,0.005515,0.003856,0.005331,Professional Shot
8,0.004596,0.006244,0.006018,0.007221,0.008162,0.008775,0.011224,0.009084,0.008106,Professional Shot
9,0.00588,0.005066,0.006673,0.008562,0.010024,0.012716,0.011055,0.009596,0.019143,Professional Shot


## Discretization and model training. Example 1.
First, Naive Bayes with frequency discretization.

In [5]:
# we create a train and a test partition.
np.random.seed(1000)
num_folds = 2
folds = np.random.choice(range(0,num_images),replace=False,size=(num_folds,int(num_images/num_folds)))

train_indexes = folds[0]
test_indexes = folds[1]

### Train

In [6]:
data_aux.loc[train_indexes].iloc[0:10,-10:]

Unnamed: 0,var92,var93,var94,var95,var96,var97,var98,var99,var100,Class
94,0.003019,0.0036,0.002648,0.009703,0.008623,0.005445,0.004732,0.005653,0.007829,Snapshot
185,0.002034,0.004788,0.004949,0.0064,0.006728,0.007268,0.004522,0.003362,0.004115,Snapshot
137,0.003789,0.004827,0.009645,0.030005,0.038927,0.016245,0.0071,0.006193,0.004881,Snapshot
114,0.007905,0.00793,0.007113,0.007707,0.00878,0.014712,0.018593,0.012903,0.010321,Professional Shot
65,0.000788,0.000615,0.000757,0.0014,0.003137,0.005309,0.009826,0.005748,0.016049,Professional Shot
206,0.000548,0.001015,0.001474,0.001396,0.000888,0.000455,0.000293,0.000442,0.013202,Professional Shot
297,0.002881,0.003553,0.003548,0.004419,0.005511,0.006234,0.009462,0.006354,0.008368,Professional Shot
32,0.013111,0.009375,0.0041,0.004204,0.002761,0.002731,0.003904,0.004137,0.004934,Professional Shot
117,0.006298,0.006707,0.007289,0.007668,0.007808,0.007124,0.008,0.014588,0.028697,Professional Shot
266,0.002463,0.002605,0.003491,0.004232,0.004305,0.001181,0.001163,0.001624,0.002466,Professional Shot


### Test

In [7]:
data_aux.loc[test_indexes].iloc[0:10,-10:]

Unnamed: 0,var92,var93,var94,var95,var96,var97,var98,var99,var100,Class
106,0.007609,0.010205,0.008213,0.01088,0.0181,0.013258,0.017306,0.005858,0.006388,Snapshot
161,0.003013,0.002748,0.001928,0.001996,0.00164,0.002171,0.002496,0.001425,0.008663,Professional Shot
98,0.007182,0.008185,0.011209,0.01225,0.011979,0.012812,0.011017,0.007915,0.00641,Professional Shot
232,0.006324,0.009736,0.014465,0.002068,0.001317,0.001941,0.009275,0.034722,0.002699,Snapshot
229,0.004388,0.003358,0.004562,0.009543,0.009433,0.006743,0.013934,0.020625,0.009039,Professional Shot
227,0.002459,0.003442,0.008758,0.008977,0.008104,0.008037,0.009431,0.004246,0.012721,Snapshot
252,0.003683,0.004015,0.004073,0.00586,0.004918,0.004214,0.004509,0.005245,0.00588,Snapshot
300,0.001129,0.001488,0.001929,0.003951,0.003622,0.002,0.002696,0.002821,0.002456,Snapshot
86,0.005779,0.006759,0.006264,0.006857,0.005741,0.006866,0.009331,0.006521,0.006763,Professional Shot
169,0.003506,0.004823,0.004152,0.003349,0.003482,0.004963,0.005628,0.004261,0.00331,Professional Shot


In [8]:
# we define and configure the discretization method
discretization = Unsupervised_method()
discretization.frequency = True
discretization.bins = 5

# and use the train indexes for learn the cuts
discretization.train(data_aux.loc[train_indexes])

# then we applied the cuts to all the dataset
data_discrete = discretization.process(data_aux)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data[to_change] = k


In [9]:
data_discrete.iloc[0:10,-10:]

Unnamed: 0,var92,var93,var94,var95,var96,var97,var98,var99,var100,Class
0,"(0.0022583, 0.00353009]","(0.00215135, 0.00397196]","(0.00239402, 0.00384519]","(0.00630699, 0.00863901]","(0.00545248, 0.00752475]","(0.00305624, 0.00461917]","(-inf, 0.00375605]","(-inf, 0.00309874]","(-inf, 0.00354598]",Snapshot
1,"(0.00654613, inf)","(0.00521838, 0.00723802]","(0.00384519, 0.00507929]","(-inf, 0.00420356]","(0.00318197, 0.00545248]","(0.00461917, 0.00651079]","(0.00571873, 0.00769487]","(0.00453852, 0.00598779]","(0.012754, inf)",Snapshot
2,"(0.0022583, 0.00353009]","(0.00397196, 0.00521838]","(0.00704153, inf)","(0.0133972, inf)","(0.0107253, inf)","(0.00984114, inf)","(0.00769487, 0.0121628]","(0.00598779, 0.00914716]","(0.00354598, 0.00561724]",Snapshot
3,"(-inf, 0.0022583]","(-inf, 0.00215135]","(-inf, 0.00239402]","(-inf, 0.00420356]","(-inf, 0.00318197]","(-inf, 0.00305624]","(-inf, 0.00375605]","(-inf, 0.00309874]","(-inf, 0.00354598]",Professional Shot
4,"(0.00654613, inf)","(0.00521838, 0.00723802]","(0.00239402, 0.00384519]","(-inf, 0.00420356]","(-inf, 0.00318197]","(0.00305624, 0.00461917]","(-inf, 0.00375605]","(-inf, 0.00309874]","(0.012754, inf)",Professional Shot
5,"(0.00353009, 0.00454702]","(0.00215135, 0.00397196]","(0.00384519, 0.00507929]","(0.00630699, 0.00863901]","(0.00545248, 0.00752475]","(0.00651079, 0.00984114]","(0.00571873, 0.00769487]","(0.00453852, 0.00598779]","(0.012754, inf)",Professional Shot
6,"(0.0022583, 0.00353009]","(0.00215135, 0.00397196]","(-inf, 0.00239402]","(-inf, 0.00420356]","(-inf, 0.00318197]","(-inf, 0.00305624]","(-inf, 0.00375605]","(-inf, 0.00309874]","(-inf, 0.00354598]",Professional Shot
7,"(0.0022583, 0.00353009]","(0.00215135, 0.00397196]","(-inf, 0.00239402]","(-inf, 0.00420356]","(0.00545248, 0.00752475]","(0.00651079, 0.00984114]","(0.00375605, 0.00571873]","(0.00309874, 0.00453852]","(0.00354598, 0.00561724]",Professional Shot
8,"(0.00454702, 0.00654613]","(0.00521838, 0.00723802]","(0.00507929, 0.00704153]","(0.00630699, 0.00863901]","(0.00752475, 0.0107253]","(0.00651079, 0.00984114]","(0.00769487, 0.0121628]","(0.00598779, 0.00914716]","(0.00561724, 0.00836835]",Professional Shot
9,"(0.00454702, 0.00654613]","(0.00397196, 0.00521838]","(0.00507929, 0.00704153]","(0.00630699, 0.00863901]","(0.00752475, 0.0107253]","(0.00984114, inf)","(0.00769487, 0.0121628]","(0.00914716, inf)","(0.012754, inf)",Professional Shot


In [10]:
# we can check the new 'var1' categories
data_discrete['var1'].cat.categories

Index(['(-inf, 0.0158605]', '(0.0158605, 0.0196166]', '(0.0196166, 0.023885]',
       '(0.023885, 0.0281986]', '(0.0281986, inf)'],
      dtype='object')

In [11]:
# and the codes or values
data_discrete['var1'].cat.codes[0:10]

0    3
1    1
2    0
3    1
4    4
5    4
6    3
7    2
8    2
9    3
dtype: int8

In [12]:
# it is the turn of the Naive Bayes
# first we create the object
model = Naive_Bayes()
# and then, we train the model with train indexes
model.fit(data_discrete.loc[train_indexes])



In [13]:
# Finally, we can obtain the probabilities of the test
model.predict_probs(data_discrete.loc[test_indexes])[0:10]

array([[  9.99971227e-01,   2.87730055e-05],
       [  2.30123317e-02,   9.76987668e-01],
       [  9.99873126e-01,   1.26873511e-04],
       [  1.17086698e-02,   9.88291330e-01],
       [  5.89392507e-02,   9.41060749e-01],
       [  9.54469093e-01,   4.55309066e-02],
       [  9.99075251e-01,   9.24749013e-04],
       [  8.58678006e-04,   9.99141322e-01],
       [  9.96408736e-01,   3.59126414e-03],
       [  6.52961670e-01,   3.47038330e-01]])

In [14]:
# Or the class
model.predict_class(data_discrete.loc[test_indexes])[0:10]

[Snapshot, Professional Shot, Snapshot, Professional Shot, Professional Shot, Snapshot, Snapshot, Professional Shot, Snapshot, Snapshot]
Categories (2, object): [Snapshot, Professional Shot]

## Discretization and model training. Example 2.
In this case, the discretization process is Fayyad_Irani and the model is a fast implementation of AODE for python.

In [None]:
# we use the same train and a test partition.
np.random.seed(1000)
num_folds = 2
folds = np.random.choice(range(0,num_images),replace=False,size=(num_folds,int(num_images/num_folds)))

train_indexes = folds[0]
test_indexes = folds[1]

In [None]:
# we define the discretization method (this method does not work, when I solved it, I update the notebook)
#discretization = MDL_method()
discretization = Unsupervised_method()
discretization.frequency = True
discretization.bins = 5

# and use the train indexes for learn the cuts
discretization.train(data_aux.loc[train_indexes])

# then we applied the cuts to all the dataset
data_discrete = discretization.process(data_aux)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data[to_change] = k


In [None]:
# it is the turn of the AODE
# first we create the object
model = AODE_fast()
# and then, we train the model with train indexes
model.fit(data_discrete.loc[train_indexes])

In [None]:
# Finally, we can obtain the probabilities of the test
model.predict_probs(data_discrete.loc[test_indexes])[0:10]

In [None]:
# Or the class
model.predict_class(data_discrete.loc[test_indexes])[0:10]