In [1]:
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
import time
from pathlib import Path
import cv2
from pprint import pprint
from collections import OrderedDict




from multiprocessing import Pool, cpu_count, Queue
from scipy.cluster.vq import kmeans2
from sklearn.decomposition import LatentDirichletAllocation 
from sklearn.preprocessing import normalize
pd.options.mode.chained_assignment = None

In [2]:
def poi_count(df, n_regions=159, n_types=16):
    """Count points of interest per region and POI type.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide integer-valued ``region`` and ``ntype`` columns; the
        values are used directly as row / column indices of the count matrix.
    n_regions : int, optional
        Number of region ids, including the two special ids 0 ("on the road")
        and 1 ("out of boundary"). Defaults to 159.
    n_types : int, optional
        Number of POI type ids. Defaults to 16.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_regions - 2, n_types)``. Entry ``[r, t]`` is
        the number of rows of ``df`` with ``region == r + 2`` and
        ``ntype == t``. The rows for the special region ids 0 and 1 are
        dropped from the result.

    Raises
    ------
    IndexError
        If a region or type id falls outside the matrix.
    """
    points_vec = np.zeros((n_regions, n_types))
    # One pass over the (region, ntype) pairs; groupby drops NaN keys, which
    # matches what a per-region value_counts() does.
    pair_counts = df.groupby(['region', 'ntype']).size()
    for (region, ntype), count in pair_counts.items():
        points_vec[region, ntype] = count
    # Rows 0 and 1 are not real regions, so they are excluded from the features.
    return points_vec[2:]

In [3]:
def tfidf_feature(df):
    """Weight the per-region POI counts with a TF-IDF style scheme.

    For POI type ``t`` and region ``r`` the value is::

        count[r, t] / sum_r(count[r, t]) * log10(R / n_t)

    where ``R`` is the number of regions (rows of the count matrix) and
    ``n_t`` is the number of regions containing at least one POI of type ``t``.

    Parameters
    ----------
    df : pandas.DataFrame
        Passed unchanged to ``poi_count``.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as the ``poi_count`` result. Columns for POI
        types that occur in no region are all zeros (rather than NaN from a
        0/0 division).
    """
    points_vec = poi_count(df)
    # R: total number of regions, taken from the matrix instead of hard-coded.
    R = points_vec.shape[0]

    type_totals = points_vec.sum(axis=0)
    regions_with_type = np.count_nonzero(points_vec, axis=0)
    # Counts are non-negative, so a type has a zero total exactly when no
    # region contains it; those columns are left at zero.
    present = regions_with_type > 0

    term_freq = np.divide(points_vec, type_totals,
                          out=np.zeros_like(points_vec), where=present)
    inverse_freq = np.zeros(points_vec.shape[1])
    inverse_freq[present] = np.log10(R / regions_with_type[present])

    feature_vec = term_freq * inverse_freq
    return feature_vec

In [4]:
# Load the POI table and build the per-region POI count matrix.
# poi_count() reads the `region` and `ntype` columns of this frame.
poi_data = pd.read_csv('poi_total.csv')
a=poi_count(poi_data)
print(a)

[[ 85. 889. 383. ...  66.  36.  64.]
 [  2.   2.  10. ...   0.   4.   2.]
 [  0.   0.   0. ...   0.   0.   0.]
 ...
 [ 11. 168.  47. ...  13.   8.   9.]
 [  4.  70.  30. ...   4.   3.   3.]
 [  0.   1.   5. ...   0.   0.   1.]]


In [5]:
# Sanity check on the dimensions of the matrix currently bound to `a`.
a.shape

(157, 16)

In [22]:
# np.savetxt does not create missing directories, so make sure the output
# folder exists before writing the count matrix as integers.
os.makedirs('out', exist_ok=True)
np.savetxt('out/out.txt', a, fmt="%d")

In [23]:
# TF-IDF style weighting of the POI counts.
# The factor 100 only affects what is printed; `aa` itself stays unscaled.
aa=tfidf_feature(poi_data)
print(aa*100)

[[6.05738189e-01 2.41137110e-01 1.43002671e-01 ... 6.72195812e-01
  4.80414612e-01 2.68676667e-01]
 [1.42526633e-02 5.42490687e-04 3.73375121e-03 ... 0.00000000e+00
  5.33794013e-02 8.39614584e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [7.83896480e-02 4.55692177e-02 1.75486307e-02 ... 1.32402205e-01
  1.06758803e-01 3.77826563e-02]
 [2.85053266e-02 1.89871740e-02 1.12012536e-02 ... 4.07391401e-02
  4.00345510e-02 1.25942188e-02]
 [0.00000000e+00 2.71245343e-04 1.86687561e-03 ... 0.00000000e+00
  0.00000000e+00 4.19807292e-03]]


In [26]:
# np.savetxt does not create missing directories, so make sure the output
# folder exists before writing the weighted features with 8 decimals.
os.makedirs('out', exist_ok=True)
np.savetxt('out/out2.txt', aa,fmt="%.8f")

出租车 (Taxi)

In [2]:
def load_hp(src):
    """Load a headerless comma-separated record file into a DataFrame.

    Parameters
    ----------
    src : str or path-like
        Location of the file; converted with ``str()`` before use.

    Returns
    -------
    pandas.DataFrame or None
        Frame with the columns ``id, status, time, latitude, longitude,
        region`` (assigned positionally), or ``None`` when the file is
        missing or cannot be parsed. A message describing the outcome is
        printed in every case.
    """
    src = str(src)
    print("reading from " + src)
    try:
        data = pd.read_csv(
            src,
            names=['id', 'status', 'time', 'latitude', 'longitude', 'region'],
            engine='c',
            float_precision='high',
        )
    except FileNotFoundError:
        print(src + " is not exist.")
        return None
    except Exception as err:
        # Parse / permission / encoding problems used to be reported as a
        # missing file; say what actually went wrong. KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print(src + " could not be loaded: " + repr(err))
        return None
    print(src + " is sucessfully loaded!")
    return data

In [3]:
def extract_hours(arr):
    """Return the hour of a record's timestamp, shifted so that 07:00 maps to 0.

    @param arr: object with a ``time`` string attribute such as
                '2015-04-01 14:14:53' (date, one space, then HH:MM:SS)
    @return: integer hour minus 7, e.g. 7 for the example above
    """
    hour_offset = 7
    # Second space-separated field is the clock part, e.g. '14:14:53'.
    clock = arr.time.split(' ')[1]
    # Everything before the first ':' is the hour.
    hour_text, _, _ = clock.partition(':')
    return int(hour_text) - hour_offset

In [4]:
def transition(c_file):
    """Build origin/destination count matrices from a record file.

    The file is read with ``load_hp`` and scanned in order. Whenever two
    consecutive rows share the same ``id`` and the later row has
    ``status == 1``, the pair is counted as one movement from the earlier
    row's region to the later row's region.

    Parameters
    ----------
    c_file : str or path-like
        File passed to ``load_hp``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(leave_mat, arrive_mat, al_mat)`` with the rows/columns of the
        special region ids 0 and 1 removed:

        * ``leave_mat[o, d, h]``  - movements o -> d whose earlier row falls
          in hour bin ``h``;
        * ``arrive_mat[d, o, h]`` - the same movements indexed destination
          first, binned by the later row's hour;
        * ``al_mat[o, d]``        - all movements o -> d regardless of hour.

        Hour bins come from ``extract_hours`` (bin 0 is 07:00, bin 16 is
        23:00).

    Raises
    ------
    ValueError
        If ``load_hp`` could not load the file.
    """
    n_regions, n_hours = 159, 17
    leave_mat = np.zeros((n_regions, n_regions, n_hours), dtype=int)
    arrive_mat = np.zeros((n_regions, n_regions, n_hours), dtype=int)
    al_mat = np.zeros((n_regions, n_regions), dtype=int)

    df = load_hp(c_file)
    if df is None:
        # load_hp reports failures by returning None; fail with a clear
        # message instead of an AttributeError further down.
        raise ValueError("could not load " + str(c_file))

    it = df.itertuples()
    # Default of None keeps an empty file from raising StopIteration; the
    # loop below then simply does not run.
    last_row = next(it, None)

    for row in it:
        # NOTE(review): the original comment says "0 -> 1: arrive", but only
        # the later row's status is checked - confirm whether the earlier
        # row should be required to have status 0.
        if row.id == last_row.id and row.status == 1:
            region_orig, region_dest = last_row.region, row.region
            leave_time = extract_hours(last_row)
            arrive_time = extract_hours(row)

            al_mat[region_orig, region_dest] += 1
            # Hours before 07:00 give negative bins, which NumPy would
            # silently wrap into the late-evening bins; count only hours
            # inside the binned window.
            if 0 <= leave_time < n_hours:
                leave_mat[region_orig, region_dest, leave_time] += 1
            if 0 <= arrive_time < n_hours:
                arrive_mat[region_dest, region_orig, arrive_time] += 1
        last_row = row

    # Region ids 0 and 1 are special (not real regions) and are sliced away.
    return (leave_mat[2:, 2:, :], arrive_mat[2:, 2:, :], al_mat[2:, 2:])

In [5]:
# Unpack the three matrices returned by transition(): a = leave_mat,
# b = arrive_mat, c = al_mat.
# NOTE(review): this rebinds `a`, which an earlier cell used for the POI count
# matrix; re-running those earlier cells afterwards would pick up this array.
(a,b,c) = transition('data/total2.txt')


reading from data/total2.txt


  


data/total2.txt is sucessfully loaded!


In [14]:
# c is the third matrix returned by transition(): counts indexed
# [origin region, destination region].
print(c)

[[447   0   0 ...   0   5   2]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   4   2   0]
 [  2   0   0 ...   2   9   0]
 [  1   0   0 ...   0   0   2]]


In [15]:
# np.savetxt does not create missing directories, so make sure the output
# folder exists first.
# NOTE(review): c holds integer counts, yet it is written with "%.8f"
# (e.g. 447.00000000); kept as-is in case a downstream reader expects it.
os.makedirs('out', exist_ok=True)
np.savetxt('out/out3.txt', c,fmt="%.8f")

In [19]:
# Number of ordered region pairs (i, j), i != j, with a non-zero count in c.
# The mask is derived from c's own shape instead of a hard-coded 157.
off_diagonal = ~np.eye(*c.shape, dtype=bool)
h = int(np.count_nonzero(c[off_diagonal]))
print(h)

2008


In [22]:
# Directed edge list [origin, destination, count] for every off-diagonal,
# non-zero entry of c, in row-major order.
# Bounds come from c.shape rather than a hard-coded 157, and counts are
# converted to plain ints so the printed list looks the same regardless of
# how the installed NumPy formats its scalar types.
n_orig, n_dest = c.shape
edge = [
    [i, j, int(c[i, j])]
    for i in range(n_orig)
    for j in range(n_dest)
    if i != j and c[i, j] != 0
]
print(edge)

[[0, 4, 1], [0, 11, 1], [0, 14, 3], [0, 17, 1], [0, 21, 1], [0, 32, 7], [0, 34, 4], [0, 38, 1], [0, 40, 1], [0, 43, 1], [0, 45, 4], [0, 47, 1], [0, 48, 6], [0, 54, 1], [0, 55, 3], [0, 59, 9], [0, 62, 9], [0, 67, 1], [0, 69, 2], [0, 72, 3], [0, 73, 5], [0, 74, 24], [0, 76, 4], [0, 77, 20], [0, 81, 2], [0, 85, 2], [0, 86, 1], [0, 87, 1], [0, 88, 1], [0, 91, 97], [0, 93, 4], [0, 95, 58], [0, 96, 625], [0, 98, 1], [0, 103, 1], [0, 106, 7], [0, 112, 14], [0, 114, 1940], [0, 115, 458], [0, 116, 3], [0, 118, 206], [0, 119, 1], [0, 120, 29], [0, 121, 163], [0, 122, 743], [0, 123, 626], [0, 124, 419], [0, 125, 2], [0, 128, 3], [0, 129, 21], [0, 130, 146], [0, 131, 3], [0, 133, 887], [0, 135, 299], [0, 138, 333], [0, 139, 808], [0, 140, 374], [0, 142, 264], [0, 146, 2], [0, 148, 3], [0, 149, 1], [0, 150, 1], [0, 152, 11], [0, 155, 5], [0, 156, 2], [1, 96, 1], [1, 114, 1], [3, 114, 1], [5, 55, 1], [5, 120, 1], [6, 5, 1], [6, 48, 2], [6, 96, 2], [6, 114, 3], [6, 130, 2], [7, 0, 1], [7, 11, 1], [7,