In [1]:
import pandas as pd
from haversine import haversine
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pickle
from scipy import spatial

# Number of leading time steps used to fit the scalers; later steps are only
# transformed with the already-fitted minimum and maximum.
len_train = 80


class MyMinMaxScaler:
    """Min-max scaler that shares one global minimum and maximum.

    The extrema are taken over every element of the fitted array (the input
    is effectively flattened), so all entries are rescaled with the same
    offset and span.

    Attributes:
        xmin: smallest value of the array passed to ``fit_transform``.
        xmax: largest value of the array passed to ``fit_transform``.
    """

    def __init__(self):
        self.xmin, self.xmax = None, None

    def _span(self):
        """Return ``xmax - xmin``, falling back to 1 for constant data.

        A zero span would turn ``transform`` into 0/0 (NaN); with a span of 1
        constant training data maps to 0 and still round-trips through
        ``inverse_transform``.
        """
        span = self.xmax - self.xmin
        return span if span != 0 else 1

    def fit_transform(self, train):
        """Record the global min/max of ``train`` and return it rescaled."""
        self.xmin, self.xmax = np.min(train), np.max(train)
        return self.transform(train)

    def transform(self, x):
        """Rescale ``x`` with the fitted extrema; fitted data lands in [0, 1]."""
        return (x - self.xmin) / self._span()

    def inverse_transform(self, t):
        """Map scaled values ``t`` back to the original value range."""
        return t * self._span() + self.xmin

## 预处理
+ 对齐fips
+ 检查缺失值
+ 检查条数

In [2]:
# Keep only the fips codes that appear in every input table, so that all
# feature arrays built later cover the same set of locations.
# NOTE: each CSV is overwritten in place with its filtered version.
csv_files = ['cases.csv', 'static.csv', 'mobility.csv', 'range.csv']

# Parse every file exactly once and reuse the frames for both passes.
frames = {fn: pd.read_csv(fn) for fn in csv_files}

# Intersection of the fips codes present in all tables.
common = set.intersection(
    *(set(frame['fips'].unique()) for frame in frames.values())
)

for fn, df in frames.items():
    df = df[df.fips.isin(common)]
    print(df.shape)
    # index=False: do not write the row index as an extra column.
    df.to_csv(fn, index=False)

(1694, 4)
(14, 6)
(1680, 8)
(1680, 4)


## Static features
+ (14,14,2) 第1张图表示距离，第2张图表示人口特征相似性

In [3]:
# Static pairwise features, stored as an (n, n, 2) array:
#   channel 0 - haversine distance between two locations, in km
#   channel 1 - cosine similarity of their scaled population features
# Only off-diagonal entries are written; the diagonal keeps its initial 0
# in both channels.
df = pd.read_csv('static.csv')
df = df.sort_values(by=['fips'])
n = df.shape[0]
xs = np.zeros((n, n, 2))
lat, long = df['lat'].values, df['long'].values

# Scale the two population columns before comparing locations.
scaler = MinMaxScaler()
pop = scaler.fit_transform(df[['pop_density', 'tot_pop']].values)
print(pop.shape)

for i in range(n):
    here = (lat[i], long[i])
    for j in range(i + 1, n):
        there = (lat[j], long[j])

        # Both features are symmetric, so fill (i, j) and (j, i) together.
        dist_km = haversine(here, there, 'km')
        xs[i, j, 0] = xs[j, i, 0] = dist_km

        similarity = 1 - spatial.distance.cosine(pop[i, :], pop[j, :])
        xs[i, j, 1] = xs[j, i, 1] = similarity

(14, 2)


## 感染人数相关特征
+ cases比实际天数提前一天，为了确定新增人数
+ (14,120,6) = (state,time_length,features_dim)
+ 累积确诊、累积死亡、新增确诊、新增死亡、新增确诊/累积确诊、新增死亡/累积死亡

In [4]:
# Case-related features, shape (locations, days, 6):
#   0: cumulative confirmed cases    1: cumulative deaths
#   2: new confirmed cases           3: new deaths
#   4: new cases / cumulative cases  5: new deaths / cumulative deaths
df = pd.read_csv('cases.csv')
df.sort_values(by=['fips','date'],inplace=True)
fips = df['fips'].unique()
n,l = len(fips), int(df.shape[0]/len(fips))
xi = np.zeros((n,l,6))

# Cumulative counts, one row per location.
for i,fip in enumerate(fips):
    rows = df[df.fips==fip]
    xi[i,:,0] = rows['cases'].values
    xi[i,:,1] = rows['deaths'].values

# Day-over-day increments; the first day has no predecessor and stays 0.
xi[:,1:,2] = xi[:,1:,0]-xi[:,:-1,0]
xi[:,1:,3] = xi[:,1:,1]-xi[:,:-1,1]

# Growth ratios. Divide only where the cumulative count is non-zero so that
# 0/0 cannot inject NaN/inf, which would poison the global min/max used by
# the scaler below; entries with a zero denominator keep their initial 0.
has_cases = (xi[:,:,0]!=0)
xi[has_cases,4] = xi[has_cases,2] / xi[has_cases,0]

mask = (xi[:,:,1]!=0)
xi[mask,5] = xi[mask,3] / xi[mask,1]

# Drop the first day: it only served as the baseline for the increments.
xi = xi[:,1:,:]

# Scale every feature with statistics from the training window only.
# The scaler of feature 2 (new confirmed cases) is saved to scaler.pickle.
for i in range(6):
    scaler = MyMinMaxScaler()
    xi[:,:len_train,i] = scaler.fit_transform(xi[:,:len_train,i])
    xi[:,len_train:,i] = scaler.transform(xi[:,len_train:,i])
    if i==2:
        # `with` closes the file even if pickling fails.
        with open('scaler.pickle','wb') as f:
            pickle.dump(scaler, f)

## mobility and movement range
+ (14,120,8)
+ 6 mobility+2 range
+ 现在是对同一特征下的所有站点的所有时间点进行归一化，后面考虑针对不同站点归一化

In [5]:
# Mobility features, shape (locations, time steps, 8): six columns from
# mobility.csv followed by two columns from range.csv.
mobility_cols = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6']
range_cols = ['f1', 'f2']

mob = pd.read_csv('mobility.csv').sort_values(by=['fips', 'date'])
move = pd.read_csv('range.csv').sort_values(by=['fips', 'date'])

fips = mob['fips'].unique()
num_loc = len(fips)
steps = int(mob.shape[0] / num_loc)
xm = np.zeros((num_loc, steps, 8))  # (n, T, 8)

# Fill one location at a time; both tables are sorted by fips and date.
for i, fip in enumerate(fips):
    xm[i, :, :6] = mob.loc[mob.fips == fip, mobility_cols].values
    xm[i, :, 6:] = move.loc[move.fips == fip, range_cols].values

# Each feature is scaled across all locations, with the minimum and maximum
# taken from the training window only.
for i in range(8):
    scaler = MyMinMaxScaler()
    train_part = scaler.fit_transform(xm[:, :len_train, i])
    rest_part = scaler.transform(xm[:, len_train:, i])
    xm[:, :len_train, i] = train_part
    xm[:, len_train:, i] = rest_part

## 封装成pickle

In [6]:
# Bundle the static (xs), mobility (xm) and case (xi) feature arrays into one
# pickle file.
features = {'xs':xs,'xm':xm,'xi':xi}
# `with` guarantees the file is closed even if pickling raises.
with open('features.pickle','wb') as f:
    pickle.dump(features, f)