In [1]:
import pandas as pd
import numpy as np
from pgmpy.estimators import ExhaustiveSearch, HillClimbSearch, BdeuScore, K2Score, BicScore, MaximumLikelihoodEstimator, BayesianEstimator
from pgmpy.models import BayesianModel
from scipy import stats
import networkx as nx

import matplotlib.pyplot as plt
%matplotlib inline

# Default size (width, height) applied to every matplotlib figure in this notebook.
plt.rcParams['figure.figsize'] = (10.0, 8.0)


In [2]:
# Load the processed bus-network table.
data = pd.read_csv('data/processed/bus_network_data.csv')

# Candidate columns that are currently excluded from the model:
#   date, day, hour, minute, trip_time, avg_trip_time, std_trip_time,
#   delay_time, Humidity, PrecipitationIn, TemperatureF, VisibilityMPH,
#   Wind SpeedMPH, prev_trip_ratio
features = [
    u'bus_line',
    u'direction',
    u'month',
    u'day_of_week',
    u'time_period',
    u'delay',              # this is our Y
    u'Conditions',
    u'totalInjuries',      # discretized with pd.cut in a later cell
    u'pavementScore',      # discretized with pd.cut in a later cell
    u'potholeCount',       # discretized with pd.cut in a later cell
    u'ntwk_delay_lag1hr',
]

# Keep only the selected columns and discard every row with a missing value.
data = data.loc[:, features].dropna()

In [3]:
# Work on a 10% random subsample (without replacement) to keep fitting times manageable.
# random_state pins the draw so that a fresh "Restart & Run All" reproduces the same
# rows and therefore the same downstream estimates.
# NOTE: this cell rebinds `data`; re-running it without reloading shrinks the frame again.
data = data.sample(frac=0.1, replace=False, random_state=2015)
data.shape

(78545, 11)

In [4]:
# Discretize each numeric road-condition column into 3 bins; labels=False
# stores the integer bin index instead of an interval label.
for col in ['totalInjuries', 'pavementScore', 'potholeCount']:
    data[col] = pd.cut(data[col], 3, labels=False)
#data.prev_trip_ratio = pd.cut(data.prev_trip_ratio,5,labels=False)


In [None]:
# Preview the first rows of `data`.
data.head()

# Naive Bayes

In [None]:
# Convert the data into a purely numeric form for the scikit-learn Naive Bayes models.
# Work on a copy so `data` itself is left untouched.
dataNB = data.copy()

# Factor variables: add an integer-coded '<column>Cat' companion for each
# categorical column. Categories are taken from .unique(), so codes follow the
# order in which values first appear in the column.
for col in ['bus_line', 'day_of_week', 'time_period', 'Conditions']:
    dataNB[col + 'Cat'] = pd.Categorical(values=dataNB[col],
                                         categories=dataNB[col].unique(),
                                         ordered=False).codes

# Binary target: 1 when delay > 0, else 0.
# .astype(int) replaces map(int, ...): on Python 3 `map` returns a lazy iterator
# that cannot be assigned as a column, whereas astype works on both Python 2 and 3.
dataNB['delayBinom'] = (dataNB.delay > 0).astype(int)
dataNB['direction'] = dataNB.direction.astype(int)
dataNB['ntwk_delay_lag1hr'] = dataNB.ntwk_delay_lag1hr.astype(int)

dataNB.head()


In [None]:
# Inspect the column dtypes of the encoded frame.
dataNB.dtypes

In [None]:
# Split the dataset at random into a training and a test part.
# Each row goes to the training set with probability 0.5, i.e. roughly a 50/50
# split (the earlier "60% training and 40% test" comment did not match p = 0.5).
np.random.seed(2015)  # fixed seed so the split is reproducible
variables = ['bus_lineCat', u'direction',
             u'month', u'totalInjuries',
             u'pavementScore', u'potholeCount', u'ntwk_delay_lag1hr',
             'day_of_weekCat', 'time_periodCat', 'ConditionsCat']

# One Bernoulli draw per row of the frame being indexed: 1 -> training, 0 -> test.
ind = stats.bernoulli.rvs(p=0.5, size=len(dataNB.index))
is_train = ind == 1
is_test = ind == 0

X_train = dataNB.loc[is_train, variables].values
X_test = dataNB.loc[is_test, variables].values

# Targets are selected as 1-D arrays of shape (n_samples,), the form the
# scikit-learn estimators expect for `y` (a column vector triggers a
# DataConversionWarning on fit).
Y_train = dataNB.loc[is_train, 'delay'].values
Y_trainBinom = dataNB.loc[is_train, 'delayBinom'].values

# Matching targets for the held-out rows, so predictions can be scored.
Y_test = dataNB.loc[is_test, 'delay'].values
Y_testBinom = dataNB.loc[is_test, 'delayBinom'].values


In [None]:
#It's predicting all 0s

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes fitted on the binary target (delayBinom).
gnb = GaussianNB()
modelBinom = gnb.fit(X_train, Y_trainBinom)

# Sum of the predicted 0/1 labels over the test rows, i.e. how many rows
# the model predicts as delayed.
binom_test_pred = modelBinom.predict(X_test)
binom_test_pred.sum()


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes fitted on Y_train.
# NOTE(review): Y_train holds the raw `delay` column rather than `delayBinom`;
# confirm that this is the intended target.
clf = MultinomialNB()
clf.fit(X_train, Y_train)

# Sum of the predicted labels over the test rows.
multinom_test_pred = clf.predict(X_test)
multinom_test_pred.sum()

# Bayesian network

In [6]:
# Collapse day_of_week into a 0/1 weekend flag.
# Fix: the previous condition compared against 'Saturday' twice, so Sundays were
# never flagged as weekend.
# NOTE(review): assumes Sunday is spelled 'Sunday', like 'Saturday'; confirm against the data.
data['weekend'] = data.day_of_week.isin(['Saturday', 'Sunday']).astype(int)

# Collapse Conditions into a 0/1 bad-weather flag.
# The original label list contained 'Heavy Snow ' with a trailing space; both
# spellings are accepted here so the flag is set whichever form the data uses.
bad_condition_labels = ['Heavy Rain', 'Snow', 'Heavy Snow', 'Heavy Snow ']
data['badConditions'] = data.Conditions.isin(bad_condition_labels).astype(int)

# The raw columns are replaced by the two binary flags above.
# NOTE: re-running this cell on its own fails, because the columns are gone after the first run.
data = data.drop(['day_of_week', 'Conditions'], axis=1)

In [8]:
# Edge list for a network in which every column other than 'delay'
# has a single directed edge into 'delay'.
bayNet = [(column, 'delay') for column in data.columns if column != 'delay']
bayNet

[(u'bus_line', 'delay'),
 (u'direction', 'delay'),
 (u'month', 'delay'),
 (u'time_period', 'delay'),
 (u'totalInjuries', 'delay'),
 (u'pavementScore', 'delay'),
 (u'potholeCount', 'delay'),
 (u'ntwk_delay_lag1hr', 'delay'),
 ('weekend', 'delay'),
 ('badConditions', 'delay')]

In [9]:
# Build the Bayesian network from the edge list; each edge in bayNet runs
# from a feature column to 'delay'.
model = BayesianModel(bayNet)
model


<pgmpy.models.BayesianModel.BayesianModel at 0x7f1c10f1e0d0>

In [11]:
# Learning CPDs using Maximum Likelihood Estimators
# (conditional probability tables are estimated from the rows of `data`).
model.fit(data, estimator=MaximumLikelihoodEstimator)


In [12]:
# Print every learned conditional probability distribution, then the
# independence statements of the fitted network.
for cpd in model.get_cpds():
    print("CPD of {variable}:".format(variable=cpd.variable))
    print(cpd)
# Function-call form of print: consistent with the calls above and valid on
# both Python 2 (single argument) and Python 3, unlike the bare print statement.
print(model.get_independencies())

CPD of badConditions:
+------------------+------------+
| badConditions(0) | 0.995531   |
+------------------+------------+
| badConditions(1) | 0.00446878 |
+------------------+------------+CPD of potholeCount:
+-----------------+----------+
| potholeCount(0) | 0.336597 |
+-----------------+----------+
| potholeCount(1) | 0.512776 |
+-----------------+----------+
| potholeCount(2) | 0.150627 |
+-----------------+----------+
CPD of time_period:
+---------------------+-----------+
| time_period(MidDay) | 0.329213  |
+---------------------+-----------+
| time_period(Night)  | 0.376358  |
+---------------------+-----------+
| time_period(PeakAM) | 0.0676046 |
+---------------------+-----------+
| time_period(PeakPM) | 0.226825  |
+---------------------+-----------+
CPD of totalInjuries:
+------------------+----------+
| totalInjuries(0) | 0.328691 |
+------------------+----------+
| totalInjuries(1) | 0.460017 |
+------------------+----------+
| totalInjuries(2) | 0.211293 |
+------------

# Learn structure from data

In [None]:
#hc = HillClimbSearch(data, scoring_method=BicScore(data))
#best_model = hc.estimate()
#print(best_model.edges())

In [None]:
# Hard-coded directed edge lists (parent, child), one edge per line.
# NOTE(review): presumably saved output of the structure search that is commented
# out above; they reference 'day_of_week' and 'Conditions', so they relate to the
# data before those columns are dropped. Confirm provenance.
bayNet20 = [
    (u'direction', u'delay'),
    (u'bus_line', u'delay'),
    (u'bus_line', u'pavementScore'),
    (u'bus_line', u'direction'),
    (u'bus_line', u'totalInjuries'),
    (u'day_of_week', u'time_period'),
    (u'day_of_week', u'Conditions'),
    (u'day_of_week', u'ntwk_delay_lag1hr'),
    (u'day_of_week', u'month'),
    (u'month', u'totalInjuries'),
    (u'delay', u'ntwk_delay_lag1hr'),
    (u'potholeCount', u'bus_line'),
    (u'time_period', u'potholeCount'),
    (u'time_period', u'direction'),
    (u'time_period', u'Conditions'),
    (u'time_period', u'ntwk_delay_lag1hr'),
    (u'Conditions', u'month'),
]
bayNet40 = [
    (u'pavementScore', u'bus_line'),
    (u'bus_line', u'potholeCount'),
    (u'bus_line', u'delay'),
    (u'bus_line', u'direction'),
    (u'bus_line', u'ntwk_delay_lag1hr'),
    (u'bus_line', u'totalInjuries'),
    (u'day_of_week', u'time_period'),
    (u'day_of_week', u'Conditions'),
    (u'month', u'time_period'),
    (u'month', u'Conditions'),
    (u'month', u'day_of_week'),
    (u'month', u'totalInjuries'),
    (u'delay', u'direction'),
    (u'delay', u'ntwk_delay_lag1hr'),
    (u'time_period', u'direction'),
    (u'time_period', u'Conditions'),
    (u'ntwk_delay_lag1hr', u'time_period'),
    (u'ntwk_delay_lag1hr', u'day_of_week'),
    (u'ntwk_delay_lag1hr', u'month'),
]


# Plotting

In [None]:
#https://networkx.readthedocs.io/en/stable/tutorial/index.html
# Directed graph built from the bayNet20 edge list; nodes are created
# implicitly from the edge endpoints.
G = nx.DiGraph()
G.add_edges_from(bayNet20)

# Show the node list first, then the edge list.
for listing in (G.nodes(), G.edges()):
    print(list(listing))

In [None]:
# Lay the graph out with the spectral layout, then draw nodes, labels and
# directed edges as separate layers on the same axes.
pos = nx.spectral_layout(G)
nx.draw_networkx_nodes(G, pos=pos, nodelist=G.nodes(),
                       node_size=3000, node_color='blue')
nx.draw_networkx_labels(G, pos=pos)
nx.draw_networkx_edges(G, pos=pos, arrows=True)
# Hide the axis frame and ticks.
plt.axis('off')