In [1]:
# import functions
import pandas as pd
import numpy as np
import codecs
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler                 #normalising features
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import KFold

from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Lasso
from sklearn.svm import SVC, SVR

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sknn import mlp

from scipy.optimize import minimize

import theano
import theano.tensor as T

%matplotlib inline

# Seed NumPy's global generator so the simulated population and every
# downstream estimator that falls back on it are reproducible.
# (Assigning `np.random.seed = 10` would replace the function, not seed.)
np.random.seed(10)


In [2]:
# Generating population
% time
num_pop = 100     #number of population
num_date = 100    #number of days to be simulated
num_time = 24     #time intervals; 24 for hourly data, 96 for 15min data; 1440 for 1min data

# Per-person attributes: drawn once per person and repeated on each of
# that person's rows.
person_income = np.random.normal(loc=25000, scale=5000, size=num_pop)
# 4 types of employment; 0: full-time; 1: part-time; 2: students; 3: unemployed.
# randint's upper bound is exclusive, so (0, 4) yields exactly the codes 0..3.
person_employment = np.random.randint(0, 4, size=num_pop)
# 0 - no children or 1 - have children
person_children = np.random.randint(0, 2, size=num_pop)

obs_per_person = num_date * num_time
num_obs = num_pop * obs_per_person

# One row per (person, date, time), ordered person-major, then date, then time.
hist_data = pd.DataFrame(
    {
        'personID': np.repeat(np.arange(num_pop), obs_per_person),
        'income': np.repeat(person_income, obs_per_person),
        'employment': np.repeat(person_employment, obs_per_person),
        'children': np.repeat(person_children, obs_per_person),
        'date': np.tile(np.repeat(np.arange(num_date), num_time), num_pop),
        'time': np.tile(np.arange(num_time), num_pop * num_date),
        # number of zones, assuming 50; an independent uniform draw per row
        'zone': np.random.randint(0, 50, size=num_obs),
    },
    columns=['personID', 'income', 'employment', 'children', 'date', 'time', 'zone'])




Wall time: 0 ns


In [3]:
# making predictions

# Features are every simulated column except the target; the target is the zone id.
X = hist_data.drop('zone', axis=1)
y = hist_data['zone']

# Pin the shuffle so every model below is trained and scored on the same
# partition each time the notebook is re-run.
trainX, testX, trainY, testY = train_test_split(X, y, random_state=10)



In [4]:
## using KNN
# weights might need to be changed - more weights towards the recent data?
# metrics? canberra, minkowski

clf_knn = KNeighborsClassifier(n_neighbors=1).fit(trainX, trainY)
# predict() returns a zone label per row. predict_proba(...)[:, 1] is only
# the probability of zone 1 and must not be scored as if it were a label.
pred_knn = clf_knn.predict(testX)
print(precision_score(testY, pred_knn, average='micro'))
print(recall_score(testY, pred_knn, average='micro'))
print(1./50)  # chance level for 50 equally likely zones


0.01915
0.01915
0.02


In [26]:
## using Ridge

# Ridge-penalised linear classifier (alpha=1), scored with micro-averaged
# precision and recall on the held-out rows.
clf_rdg = RidgeClassifier(alpha=1)
clf_rdg.fit(trainX, trainY)
pred_rdg = clf_rdg.predict(testX)
print(precision_score(testY, pred_rdg, average='micro'))
print(recall_score(testY, pred_rdg, average='micro'))


0.0193166666667
0.0193166666667


In [None]:
## using Lasso

# Lasso is a regressor: its raw output is a continuous number, which the
# classification metrics below do not accept. Snap each prediction to the
# nearest integer within the range of zone ids seen in training so it can be
# scored on the same footing as the classifiers.
clf_las = Lasso(alpha=1).fit(trainX, trainY)
pred_las = np.clip(np.rint(clf_las.predict(testX)), trainY.min(), trainY.max()).astype(int)
print(precision_score(testY, pred_las, average='micro'))
print(recall_score(testY, pred_las, average='micro'))



In [5]:
## using RF

# Random forest of 100 trees, each capped at depth 10; fit() returns the
# estimator itself, so construction and training are chained.
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=10).fit(trainX, trainY)
pred_rf = clf_rf.predict(testX)
print(precision_score(testY, pred_rf, average='micro'))
print(recall_score(testY, pred_rf, average='micro'))
print(1. / 50)

0.0198666666667
0.0198666666667
0.02


In [6]:
## using GBDT

# Gradient-boosted trees: 100 boosting stages of depth-3 trees; fit()
# returns the estimator itself, so construction and training are chained.
clf_gb = GradientBoostingClassifier(n_estimators=100, max_depth=3).fit(trainX, trainY)
pred_gb = clf_gb.predict(testX)
print(precision_score(testY, pred_gb, average='micro'))
print(recall_score(testY, pred_gb, average='micro'))
print(1. / 50)


0.0187666666667
0.0187666666667
0.02


In [None]:
## using NN

# sknn multilayer perceptron: one 100-unit sigmoid hidden layer feeding a
# softmax output layer, trained for 25 iterations.
clf_nn = mlp.Classifier(
    layers=[
        mlp.Layer("Sigmoid", units=100),
        mlp.Layer("Softmax"),
    ],
    n_iter=25)
clf_nn.fit(trainX, trainY)
pred_nn = clf_nn.predict(testX)
print(precision_score(testY, pred_nn, average='micro'))
print(recall_score(testY, pred_nn, average='micro'))

In [18]:
# neural network
def fit_predict_NN(trainX, trainY, testX, activate_functions, hidden_layers):
    """Train a bias-free feed-forward network with scipy and score ``testX``.

    The network is a stack of dense layers without bias terms followed by a
    single sigmoid output unit. All weights live in one flat parameter vector
    that ``scipy.optimize.minimize`` tunes by minimising the negative
    Bernoulli log-likelihood, using the gradient Theano derives symbolically.
    The starting point is drawn from NumPy's global random generator.

    Parameters
    ----------
    trainX : array-like, shape (n_samples, n_features)
        Training features.
    trainY : array-like, shape (n_samples,)
        Training targets. The loss is a binary cross-entropy, so it is only
        meaningful for 0/1 targets.
    testX : array-like, shape (n_test, n_features)
        Rows to score with the fitted network.
    activate_functions : sequence of callables
        One Theano-compatible activation per hidden layer.
    hidden_layers : sequence of int
        Width of each hidden layer. It is paired with ``activate_functions``
        through ``zip``, so the shorter of the two decides the depth.

    Returns
    -------
    numpy.ndarray
        The sigmoid output of the fitted network for every row of ``testX``.
    """
    # theano.shared only builds tensor variables from ndarrays, so convert
    # pandas objects (or nested lists) up front.
    train_features = np.asarray(trainX, dtype=theano.config.floatX)
    train_targets = np.asarray(trainY, dtype=theano.config.floatX)
    test_features = np.asarray(testX, dtype=theano.config.floatX)

    X_ = theano.shared(train_features, name='X')
    y_ = theano.shared(train_targets, name='y')
    param = T.vector()
    # Input width comes from the data handed to this call, not from a global.
    dim = train_features.shape[1]

    def activation(data_, parameter):
        """Build the forward pass; return (sigmoid output, parameter count)."""
        n_previous = 0
        dim_previous = dim
        h = data_
        for n_hidden, func in zip(hidden_layers, activate_functions):
            # Slice this layer's weights out of the flat vector.
            N = dim_previous * n_hidden
            W_ = parameter[n_previous:n_previous + N].reshape((dim_previous, n_hidden))
            h = func(h.dot(W_))
            dim_previous = n_hidden
            n_previous += N

        # output: whatever is left of the vector weights the last hidden layer
        v_ = parameter[n_previous:]
        output = h.dot(v_)
        n_previous = n_previous + dim_previous

        return T.nnet.sigmoid(output), n_previous

    # Build the training graph once; it also reports how many weights exist.
    p_sig, n_params = activation(X_, param)
    p_bck = 1 - p_sig
    llh_ = y_.dot(T.log(p_sig)) + (1 - y_).dot(T.log(p_bck))
    loss = -llh_

    # optimize
    loss_function = theano.function([param], loss)
    loss_grad = theano.function([param], theano.grad(loss, param))
    result = minimize(loss_function, jac=loss_grad, x0=np.random.normal(size=n_params))
    optimal_params = result['x']

    # predict data
    data = T.matrix()
    compiled_activation = theano.function([data, param], activation(data, param)[0])

    return compiled_activation(test_features, optimal_params)


In [19]:
## define activation function
# Available choices for the hidden layers:
#   sigmoid    - T.nnet.sigmoid (also used for the output unit)
#   leaky ReLU - defined below
#   softplus   - T.nnet.softplus

def LeakyReLU(x, alpha=0.5):
    """Leaky rectifier: ``x`` where ``x > 0``, otherwise ``alpha * x``.

    Parameters
    ----------
    x : Theano tensor
        Pre-activation values.
    alpha : float, optional
        Slope applied to non-positive inputs. Defaults to 0.5, the value
        this notebook has always used.

    Returns
    -------
    Theano tensor
        Element-wise activation with the same shape as ``x``.
    """
    return T.switch(x > 0, x, alpha * x)


In [None]:
# fit_predict_NN ends in a single sigmoid unit, so `pred` holds one
# probability per test row. Threshold it to obtain labels the classification
# metrics accept, and score those (not the GBDT predictions).
# NOTE(review): a single binary output can only ever emit labels 0/1, while
# `zone` has 50 classes - confirm what target this network is meant to learn.
pred = fit_predict_NN(trainX, trainY, testX, [LeakyReLU] * 3, [20, 10, 5])
pred_labels = (pred > 0.5).astype(int)
print(precision_score(testY, pred_labels, average='micro'))
print(recall_score(testY, pred_labels, average='micro'))

In [19]:
# Show which scikit-learn release the kernel is running; the recorded
# output for this notebook is '0.17'.
import sklearn
sklearn.__version__

'0.17'

In [4]:
import networkx as nx

# Smoke test for networkx: one isolated node plus a single edge.
G = nx.Graph()
G.add_nodes_from(["spam"])
G.add_edges_from([(1, 2)])
print(G.nodes())
#[1, 2, 'spam']
print(G.edges())
#[(1, 2)]

[1, 2, 'spam']
[(1, 2)]


In [3]:

# Smoke test for Theano: compile ||A x||^2 and evaluate it on a tiny input.
x = T.vector('x')
A = T.matrix('A')
z = T.dot(A, x)
normAx = theano.function([x, A], T.dot(z, z))
normAx([0, 2], [[1, 1], [1, 1]])

array(8.0)

In [1]:
from mpl_toolkits.basemap import Basemap


