In [17]:
import libpgm
from libpgm.nodedata import NodeData
from libpgm.lgbayesiannetwork import LGBayesianNetwork
from libpgm.graphskeleton import GraphSkeleton
import json

from libpgm.pgmlearner import PGMLearner

In [73]:
num_sources = 3
skeleton_file = 'skeleton.txt'
data_file = 'data.txt'

In [123]:
num_sources = 4

bins = ["d" + str(i) for i in range(16)]
sources = ["source" + str(i) for i in range(num_sources)]
factors = ["humidity", "temp", "wind_dir","wind_speed","time"]
nodes = bins + sources + factors

In [129]:
edges = []
for factor in factors:
    for source in sources:
        edges.append([factor, source])
for source in sources:
    for b in bins:
        edges.append([source, b])

In [130]:
frame = {"V": nodes, "E": edges}
with open(skeleton_file, 'w') as f:
    json.dump(frame, f)

skeleton = GraphSkeleton()
skeleton.load(skeleton_file)

In [131]:
model = LGBayesianNetwork(skeleton)

In [132]:
frame

{'E': [['humidity', 'source0'],
  ['humidity', 'source1'],
  ['humidity', 'source2'],
  ['humidity', 'source3'],
  ['temp', 'source0'],
  ['temp', 'source1'],
  ['temp', 'source2'],
  ['temp', 'source3'],
  ['wind_dir', 'source0'],
  ['wind_dir', 'source1'],
  ['wind_dir', 'source2'],
  ['wind_dir', 'source3'],
  ['wind_speed', 'source0'],
  ['wind_speed', 'source1'],
  ['wind_speed', 'source2'],
  ['wind_speed', 'source3'],
  ['time', 'source0'],
  ['time', 'source1'],
  ['time', 'source2'],
  ['time', 'source3'],
  ['source0', 'd0'],
  ['source0', 'd1'],
  ['source0', 'd2'],
  ['source0', 'd3'],
  ['source0', 'd4'],
  ['source0', 'd5'],
  ['source0', 'd6'],
  ['source0', 'd7'],
  ['source0', 'd8'],
  ['source0', 'd9'],
  ['source0', 'd10'],
  ['source0', 'd11'],
  ['source0', 'd12'],
  ['source0', 'd13'],
  ['source0', 'd14'],
  ['source0', 'd15'],
  ['source1', 'd0'],
  ['source1', 'd1'],
  ['source1', 'd2'],
  ['source1', 'd3'],
  ['source1', 'd4'],
  ['source1', 'd5'],
  ['source1

### Prepare Data

In [7]:
import pandas as pd
import numpy as np

In [89]:
p = "pickles/alliance_sub_w_lables.p"
df = pd.read_pickle(p)

In [90]:
df.columns.values

array(['Unnamed: 0', 'd', 't', 'date_x', 'no2', 'o3', 'no', 'so2', 'temp',
       'humidity', 'pm1', 'pm25', 'pm10', 'bin0', 'bin1', 'bin2', 'bin3',
       'bin4', 'bin5', 'bin6', 'bin7', 'bin8', 'bin9', 'bin10', 'bin11',
       'bin12', 'bin13', 'bin14', 'bin15', 'd0', 'd1', 'd2', 'd3', 'd4',
       'd5', 'd6', 'd7', 'd8', 'd9', 'd10', 'd11', 'd12', 'd13', 'd14',
       'd15', 'dt', 'date_y', 'nsec', 'altm_(hpa)', 'temp (degree_c)',
       'dew (degree_c)', 'rh_(percentage)', 'dir_(degrees)', 'spd_(m/s)',
       'vis (km)', 'clouds (====)', 'labels'], dtype=object)

In [91]:
nodes = bins + ['temp','humidity',"dir_(degrees)","spd_(m/s)","t","labels"]

In [92]:
df = df[nodes]
df.rename(columns={'dir_(degrees)': 'wind_dir', 'spd_(m/s)': 'wind_speed', 't':'time'}, inplace=True)

In [93]:
df['time'] = pd.to_datetime(df['time'])
df['time'] = df['time'].dt.hour

In [72]:
df.head()

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d11,d12,d13,d14,d15,temp,humidity,wind_dir,wind_speed,time
0,2444.13604,1221.033005,695.053277,193.078399,77.625433,118.544298,6.455696,8.003923,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,16.64,85.37,320,3,22
1,2286.872595,1127.107389,602.379507,201.85469,0.0,50.804699,12.911392,8.003923,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,16.64,84.88,350,2,22
2,2057.530071,976.826404,630.181638,114.091781,44.35739,50.804699,6.455696,24.011768,20.637702,0.0,...,0.0,0.0,0.0,0.0,0.0,16.7,84.57,30,3,23
3,1933.029844,932.99445,871.13344,236.959853,99.804128,67.739599,25.822785,8.003923,10.318851,8.776291,...,0.0,0.0,0.0,0.0,0.0,16.62,85.15,10,3,23
4,1821.634904,1102.060558,750.657539,298.393889,33.268043,42.337249,38.734177,8.003923,10.318851,8.776291,...,0.0,0.0,0.0,0.0,0.0,16.74,85.37,20,2,0


In [94]:
df.columns.values

array(['d0', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'd10',
       'd11', 'd12', 'd13', 'd14', 'd15', 'temp', 'humidity', 'wind_dir',
       'wind_speed', 'time', 'labels'], dtype=object)

In [97]:
label = pd.get_dummies( df['labels'] )

In [114]:
label.rename(columns={0: 'source0', 1: 'source1', 2:'source2',3:'source3'}, inplace=True)

In [116]:
df_w_labels = pd.concat([df,label], axis = 1)
df_w_labels.drop('labels',axis=1,inplace=True)

In [117]:
df_w_labels.columns.values

array(['d0', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'd10',
       'd11', 'd12', 'd13', 'd14', 'd15', 'temp', 'humidity', 'wind_dir',
       'wind_speed', 'time', 'source0', 'source1', 'source2', 'source3'], dtype=object)

In [118]:
data = df_w_labels.to_dict(orient='records')

In [119]:
with open(data_file,'w') as f:
    json.dump(data,f)

In [120]:
isinstance(data[1], dict)

True

In [121]:
data_u = json.load(open(data_file))

### Learning

In [133]:
# instantiate my learner 
learner = PGMLearner()
# estimate parameters
result = learner.lg_mle_estimateparams(skeleton, data_u)

# output
print json.dumps(result.Vdata, indent=2)

{
  "d8": {
    "mean_base": 3880.0, 
    "parents": [
      "source0", 
      "source1", 
      "source2", 
      "source3"
    ], 
    "children": [], 
    "mean_scal": [
      -3865.458370352459, 
      -3866.754907467015, 
      -3865.4210726740366, 
      -3866.6074484951914
    ], 
    "variance": 958.4270722270012
  }, 
  "d9": {
    "mean_base": 712.0, 
    "parents": [
      "source0", 
      "source1", 
      "source2", 
      "source3"
    ], 
    "children": [], 
    "mean_scal": [
      -702.7562588160027, 
      -703.7476668145672, 
      -703.465258440789, 
      -704.6055081362962
    ], 
    "variance": 1216.0906851149193
  }, 
  "d6": {
    "mean_base": -2192.0, 
    "parents": [
      "source0", 
      "source1", 
      "source2", 
      "source3"
    ], 
    "children": [], 
    "mean_scal": [
      2229.3666564798114, 
      2227.36179878406, 
      2229.0166068555413, 
      2228.7232967147097
    ], 
    "variance": 1029.2530869357288
  }, 
  "d7": {
    "mean_ba

### ---- Code below is useless (at least for now) ----

In [39]:
len(data)

7056

In [46]:
#%%timeit
result = learner.lg_estimatebn(data)

In [35]:
result

<libpgm.lgbayesiannetwork.LGBayesianNetwork at 0x110f11810>

In [42]:
%%timeit
result_constraint = learner.lg_constraint_estimatestruct(data[:100])

1 loop, best of 3: 2.74 s per loop


In [44]:
result_constraint = learner.lg_constraint_estimatestruct(data[:100])

In [48]:
print json.dumps(result.E, indent=2)
print json.dumps(result.Vdata, indent=2)

[
  [
    "d10", 
    "d8"
  ], 
  [
    "d2", 
    "d1"
  ], 
  [
    "wind_speed", 
    "wind_dir"
  ], 
  [
    "wind_dir", 
    "time"
  ], 
  [
    "d6", 
    "time"
  ], 
  [
    "d8", 
    "d6"
  ], 
  [
    "wind_dir", 
    "d6"
  ], 
  [
    "wind_dir", 
    "temp"
  ]
]
{
  "d14": {
    "mean_base": 1.0972865072590714, 
    "parents": [], 
    "children": [], 
    "mean_scal": [], 
    "variance": 22.52140812946028
  }, 
  "d15": {
    "mean_base": 1.922755763732985, 
    "parents": [], 
    "children": [], 
    "mean_scal": [], 
    "variance": 59.93095201795428
  }, 
  "d10": {
    "mean_base": 14.732362519755707, 
    "parents": [], 
    "children": [
      "d8"
    ], 
    "mean_scal": [], 
    "variance": 396.4314151574552
  }, 
  "d11": {
    "mean_base": 7.948264316417576, 
    "parents": [], 
    "children": [], 
    "mean_scal": [], 
    "variance": 164.4655307441802
  }, 
  "d12": {
    "mean_base": 3.994967587148667, 
    "parents": [], 
    "children": [], 
    "m

In [47]:
print json.dumps(result_constraint.E, indent=2)
print json.dumps(result_constraint.Vdata, indent=2)

[
  [
    "humidity", 
    "time"
  ], 
  [
    "d2", 
    "d1"
  ]
]


AttributeError: 'GraphSkeleton' object has no attribute 'Vdata'

In [53]:
from sklearn.covariance import EmpiricalCovariance

In [54]:
p = "pickles/alliance_sub.p"
df = pd.read_pickle(p)
features = ['no2','o3','no','so2','temp', 'humidity', 'd0', 'd1', 'd2', 'd3', 'd4','d5', 'd6', 'd7',\
            'd8', 'd9', 'd10', 'd11', 'd12', 'd13', 'd14', 'd15',"dir_(degrees)","spd_(m/s)"]
df = df[features]

In [57]:
cov = EmpiricalCovariance()
cov_fitted = cov.fit(df)

In [58]:
cov_fitted.covariance_

array([[  6.56288197e+01,  -4.90020863e+01,  -1.78670807e+01,
         -5.39272313e+00,   5.04957743e-01,  -9.65461845e+00,
          6.88321516e+02,   5.45507407e+02,   5.29709697e+02,
          1.56243187e+02,   4.15368448e+01,   2.90246376e+01,
          2.72483639e+01,   2.01407106e+01,   1.25499170e+01,
          6.96212804e+00,   4.92975405e+00,   1.48912927e+00,
          6.15867598e-02,  -1.80872747e+00,  -1.59677802e+00,
         -5.77529002e+00,  -1.57269311e+02,   6.63980729e+00],
       [ -4.90020863e+01,   4.23775326e+01,   1.90034767e+01,
          7.49531833e-01,  -3.38959951e+00,   1.77263515e+01,
         -3.78497613e+02,  -3.49274901e+02,  -3.75348154e+02,
         -1.09749726e+02,  -2.66145950e+01,  -1.65256296e+01,
         -2.04342839e+01,  -1.66773342e+01,  -1.01632924e+01,
         -4.96423706e+00,  -2.93501333e+00,  -3.45607458e-02,
          8.76009091e-01,   2.42718830e+00,   1.96717160e+00,
          7.29559545e+00,   1.20511321e+02,  -5.54036003e+00],
      