In [1]:
import libpgm
from libpgm.nodedata import NodeData
from libpgm.lgbayesiannetwork import LGBayesianNetwork
from libpgm.graphskeleton import GraphSkeleton
import json

from libpgm.pgmlearner import PGMLearner

In [2]:
# Notebook-wide configuration.
# The label column has four classes (0-3) and the skeleton is built with four
# source nodes, so the source count is 4 here as well (it was 3, which
# disagreed with the value the next cell assigns).
num_sources = 4
# File the graph skeleton (nodes + edges) is written to as JSON.
skeleton_file = 'skeleton.txt'
# NOTE(review): data_file is not referenced in the visible cells — confirm it is still needed.
data_file = 'data.txt'

In [3]:
num_sources = 4

# Node names of the network, in three groups:
#   bins    - d0 .. d15
#   sources - source0 .. source<num_sources-1>
#   factors - weather variables plus an "offset" column ("time" is left out)
bins = ["d%d" % i for i in range(16)]
sources = ["source%d" % i for i in range(num_sources)]
factors = [
    "humidity",
    "temp",
    "wind_dir",
    "wind_speed",
    "offset",
]
nodes = bins + sources + factors

In [4]:
# Directed edges as [parent, child] pairs:
# every factor feeds every source, and every source feeds every bin.
edges = [[factor, source] for factor in factors for source in sources]
edges += [[source, b] for source in sources for b in bins]

In [5]:
# Write the graph (vertices under "V", edges under "E") to disk as JSON,
# then read that same file back into a GraphSkeleton.
frame = {"V": nodes, "E": edges}
with open(skeleton_file, 'w') as skeleton_out:
    skeleton_out.write(json.dumps(frame))

skeleton = GraphSkeleton()
skeleton.load(skeleton_file)

In [6]:
# Build a linear-Gaussian Bayesian network object from the loaded skeleton.
# NOTE(review): only the skeleton is passed (no node data), and `model` is not
# referenced again in the visible cells — confirm it is still needed.
model = LGBayesianNetwork(skeleton)

In [7]:
# Display the skeleton dict ("V" nodes, "E" edges) to check the generated graph.
frame

{'E': [['humidity', 'source0'],
  ['humidity', 'source1'],
  ['humidity', 'source2'],
  ['humidity', 'source3'],
  ['temp', 'source0'],
  ['temp', 'source1'],
  ['temp', 'source2'],
  ['temp', 'source3'],
  ['wind_dir', 'source0'],
  ['wind_dir', 'source1'],
  ['wind_dir', 'source2'],
  ['wind_dir', 'source3'],
  ['wind_speed', 'source0'],
  ['wind_speed', 'source1'],
  ['wind_speed', 'source2'],
  ['wind_speed', 'source3'],
  ['offset', 'source0'],
  ['offset', 'source1'],
  ['offset', 'source2'],
  ['offset', 'source3'],
  ['source0', 'd0'],
  ['source0', 'd1'],
  ['source0', 'd2'],
  ['source0', 'd3'],
  ['source0', 'd4'],
  ['source0', 'd5'],
  ['source0', 'd6'],
  ['source0', 'd7'],
  ['source0', 'd8'],
  ['source0', 'd9'],
  ['source0', 'd10'],
  ['source0', 'd11'],
  ['source0', 'd12'],
  ['source0', 'd13'],
  ['source0', 'd14'],
  ['source0', 'd15'],
  ['source1', 'd0'],
  ['source1', 'd1'],
  ['source1', 'd2'],
  ['source1', 'd3'],
  ['source1', 'd4'],
  ['source1', 'd5'],
  [

### Prepare Data

In [8]:
import pandas as pd
import numpy as np

In [9]:
# Load the input dataframe from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
# NOTE(review): the commented-out path below is presumably the real dataset — confirm.
#p = "pickles/alliance_sub_w_lables.p"
p = "something.p"
df = pd.read_pickle(p)

In [10]:
# List the raw column names to see what is available.
df.columns.values

array(['Unnamed: 0', 'd', 't', 'date_x', 'no2', 'o3', 'no', 'so2', 'temp',
       'humidity', 'pm1', 'pm25', 'pm10', 'bin0', 'bin1', 'bin2', 'bin3',
       'bin4', 'bin5', 'bin6', 'bin7', 'bin8', 'bin9', 'bin10', 'bin11',
       'bin12', 'bin13', 'bin14', 'bin15', 'd0', 'd1', 'd2', 'd3', 'd4',
       'd5', 'd6', 'd7', 'd8', 'd9', 'd10', 'd11', 'd12', 'd13', 'd14',
       'd15', 'dt', 'date_y', 'nsec', 'altm_(hpa)', 'temp (degree_c)',
       'dew (degree_c)', 'rh_(percentage)', 'dir_(degrees)', 'spd_(m/s)',
       'vis (km)', 'clouds (====)', 'labels'], dtype=object)

In [11]:
# Count rows per value of the 'labels' column.
df['labels'].value_counts()

2    4102
0    1001
3     742
1     425
Name: labels, dtype: int64

In [12]:
# Rename the wind direction/speed columns to the names used for the
# corresponding factor nodes in the graph.
df = df.rename(columns={'dir_(degrees)': 'wind_dir', 'spd_(m/s)': 'wind_speed'})

In [13]:
# Non-bin columns kept from the dataframe: weather variables plus the 'labels' column.
features = ['temp','humidity',"wind_dir","wind_speed","labels"]
# NOTE: this rebinds `nodes`, which earlier held the graph node list
# (bins + sources + factors); from here on it is the list of dataframe columns to keep.
nodes = bins + features

In [14]:
# Keep only the bin and feature columns (overwrites `df`).
df = df[nodes]

In [15]:
#df['time'] = pd.to_datetime(df['time'])
#df['time'] = df['time'].dt.hour

In [16]:
def _log_or_zero(value):
    """Return the natural log of `value`, or 0 when `value` is not positive."""
    if value <= 0:
        return 0
    return np.log(value)


# Log-transform the bin columns element-wise; feature columns are passed through
# unchanged and re-attached side by side.
maybe = df[bins].applymap(_log_or_zero)
feat = df[features]
logged = pd.concat([maybe, feat], axis=1)

In [17]:
# From here on `df` refers to the log-transformed frame.
# NOTE: re-running the log-transform cell after this one would apply the log a second time.
df = logged

In [18]:
# Preview the first rows of the log-transformed frame.
df.head()

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d11,d12,d13,d14,d15,temp,humidity,wind_dir,wind_speed,labels
0,7.801447,7.107453,6.543989,5.263096,4.351895,4.775287,1.864963,2.079932,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,16.64,85.37,320,3,3
1,7.73494,7.02741,6.400888,5.307548,0.0,3.927989,2.55811,2.079932,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,16.64,84.88,350,2,2
2,7.629262,6.884309,6.446008,4.737003,3.792279,3.927989,1.864963,3.178544,3.02712,0.0,...,0.0,0.0,0.0,0.0,0.0,16.7,84.57,30,3,0
3,7.566844,6.838399,6.769795,5.467891,4.60321,4.215671,3.251257,2.079932,2.333972,2.172054,...,0.0,0.0,0.0,0.0,0.0,16.62,85.15,10,3,2
4,7.50749,7.004937,6.62095,5.698414,3.504597,3.745667,3.656722,2.079932,2.333972,2.172054,...,0.0,0.0,0.0,0.0,0.0,16.74,85.37,20,2,2


In [19]:
# Confirm the remaining columns: bins followed by the features.
df.columns.values

array(['d0', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'd10',
       'd11', 'd12', 'd13', 'd14', 'd15', 'temp', 'humidity', 'wind_dir',
       'wind_speed', 'labels'], dtype=object)

In [20]:
# One-hot encode the 'labels' column: one indicator column per distinct label value.
label = pd.get_dummies( df['labels'] )

In [21]:
# Rename the indicator columns (label values 0, 1, 2, ...) to the source node
# names. The mapping is derived from `sources` so it stays in sync with
# num_sources instead of hard-coding four entries.
label = label.rename(columns=dict(enumerate(sources)))

In [22]:
# Append the one-hot source columns and remove the original integer 'labels' column.
df_w_labels = pd.concat([df, label], axis=1).drop('labels', axis=1)

In [35]:
#df_w_labels['offset'] = pd.Series([1]*len(df_w_labels))
# Add an 'offset' column drawn from a normal distribution (mean 1, std 0.01).
# The generator is seeded so the jitter, and everything fitted from it, is
# reproducible across kernel restarts.
# NOTE(review): the jitter presumably replaces the constant 1 above to avoid a
# singular matrix in the learner — confirm.
offset_rng = np.random.RandomState(0)
df_w_labels['offset'] = offset_rng.normal(1, 0.01, len(df_w_labels))

In [24]:
# Check the final column set: bins, features, source indicators and offset.
df_w_labels.columns.values

array(['d0', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'd10',
       'd11', 'd12', 'd13', 'd14', 'd15', 'temp', 'humidity', 'wind_dir',
       'wind_speed', 'source0', 'source1', 'source2', 'source3', 'offset'], dtype=object)

In [36]:
# Preview the first rows of the frame that will be exported.
df_w_labels.head()

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d15,temp,humidity,wind_dir,wind_speed,source0,source1,source2,source3,offset
0,7.801447,7.107453,6.543989,5.263096,4.351895,4.775287,1.864963,2.079932,0.0,0.0,...,0.0,16.64,85.37,320,3,0,0,0,1,0.995781
1,7.73494,7.02741,6.400888,5.307548,0.0,3.927989,2.55811,2.079932,0.0,0.0,...,0.0,16.64,84.88,350,2,0,0,1,0,0.993525
2,7.629262,6.884309,6.446008,4.737003,3.792279,3.927989,1.864963,3.178544,3.02712,0.0,...,0.0,16.7,84.57,30,3,1,0,0,0,1.011963
3,7.566844,6.838399,6.769795,5.467891,4.60321,4.215671,3.251257,2.079932,2.333972,2.172054,...,0.0,16.62,85.15,10,3,0,0,1,0,0.986913
4,7.50749,7.004937,6.62095,5.698414,3.504597,3.745667,3.656722,2.079932,2.333972,2.172054,...,0.0,16.74,85.37,20,2,0,0,1,0,0.99767


In [37]:
# Convert the frame to a list of dicts, one per row, keyed by column name.
data = df_w_labels.to_dict(orient='records')

In [38]:
# Output path for the JSON-serialised records.
# NOTE(review): `data_file` defined near the top is not used for this — confirm which name is intended.
new_data_file = "data5.txt"

In [39]:
# Persist the records as a single JSON document.
with open(new_data_file, 'w') as records_out:
    records_out.write(json.dumps(data))

In [40]:
# Sanity check: each record should be a plain dict.
isinstance(data[1], dict)

True

In [41]:
# Read the records back from the JSON file. The context manager closes the
# file handle, which the bare open() call previously left open.
with open(new_data_file) as f:
    data_u = json.load(f)

### Learning

In [31]:
# instantiate my learner 
#learner2 = PGMLearner()
# estimate parameters
#structure = learner.lg_estimatebn(data_u)
#print structure

In [32]:
#structure.Vdata

In [42]:
# LOG DATA
# Fit the network parameters for the fixed skeleton from the reloaded records.
learner = PGMLearner()
# estimate parameters
result = learner.lg_mle_estimateparams(skeleton, data_u)

# output
# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
# NOTE(review): in the recorded output mean_base and mean_scal are large and
# nearly cancel, which suggests the one-hot source columns are collinear with
# the intercept — confirm before interpreting the coefficients.
print(json.dumps(result.Vdata, indent=2))

{
  "d8": {
    "mean_base": 1926.0, 
    "parents": [
      "source0", 
      "source1", 
      "source2", 
      "source3"
    ], 
    "children": [], 
    "mean_scal": [
      -1924.2364176619863, 
      -1924.0245803556911, 
      -1924.137338714772, 
      -1924.197151506957
    ], 
    "variance": 2.100732118822634
  }, 
  "d9": {
    "mean_base": 214.0, 
    "parents": [
      "source0", 
      "source1", 
      "source2", 
      "source3"
    ], 
    "children": [], 
    "mean_scal": [
      -212.76196946272768, 
      -212.50818023311317, 
      -212.6694267542241, 
      -212.74524370420343
    ], 
    "variance": 1.7888902031181715
  }, 
  "d6": {
    "mean_base": -1897.0, 
    "parents": [
      "source0", 
      "source1", 
      "source2", 
      "source3"
    ], 
    "children": [], 
    "mean_scal": [
      1900.1667878616524, 
      1900.5002355526283, 
      1900.2828485097357, 
      1900.2003840778675
    ], 
    "variance": 0.9658190230838954
  }, 
  "d7": {
    "m

In [34]:
#not log
# NOTE(review): this cell runs the same call on the same `data_u` as the
# "LOG DATA" cell; the "not log" run apparently relied on different kernel
# state and is recorded as raising "LinAlgError: Singular matrix" — confirm
# how the un-logged data was supplied.
# instantiate my learner
learner = PGMLearner()
# estimate parameters
result = learner.lg_mle_estimateparams(skeleton, data_u)

# output
# print() with a single argument behaves the same on Python 2 and Python 3.
print(json.dumps(result.Vdata, indent=2))

LinAlgError: Singular matrix

### ---- Experimental code below: not used by the analysis above (at least for now) ----

In [None]:
# Number of records (one per dataframe row).
len(data)

In [None]:
#%%timeit
# NOTE(review): presumably learns a linear-Gaussian network from the
# records alone — confirm against the libpgm docs.
# This overwrites `result` from the parameter-estimation cell above.
result = learner.lg_estimatebn(data)

In [None]:
# Display the object returned by lg_estimatebn.
result

In [None]:
%%timeit
# Time the constraint-based structure estimate on the first 100 records.
# NOTE(review): names assigned under %%timeit are not expected to persist in
# the notebook namespace, hence the untimed repeat in the next cell — confirm.
result_constraint = learner.lg_constraint_estimatestruct(data[:100])

In [None]:
# Run the constraint-based structure estimate on the first 100 records and keep the result.
result_constraint = learner.lg_constraint_estimatestruct(data[:100])

In [None]:
# Dump the edges and per-node data of `result`.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(json.dumps(result.E, indent=2))
print(json.dumps(result.Vdata, indent=2))

In [None]:
# Dump the edges and per-node data of `result_constraint`.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(json.dumps(result_constraint.E, indent=2))
print(json.dumps(result_constraint.Vdata, indent=2))

In [None]:
from sklearn.covariance import EmpiricalCovariance

In [None]:
# Reload a dataset and keep only the pollutant, weather, bin and wind columns.
# This rebinds `p`, `df` and `features` used earlier in the notebook.
p = "pickles/alliance_sub.p"
features = [
    'no2', 'o3', 'no', 'so2',
    'temp', 'humidity',
    'd0', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7',
    'd8', 'd9', 'd10', 'd11', 'd12', 'd13', 'd14', 'd15',
    "dir_(degrees)", "spd_(m/s)",
]
df = pd.read_pickle(p)[features]

In [None]:
# Fit an empirical covariance estimator on the selected columns.
cov = EmpiricalCovariance()
cov_fitted = cov.fit(df)

In [None]:
# Display the fitted covariance matrix.
cov_fitted.covariance_