In [139]:
from mongoengine import *
import os
import pandas as pd
import numpy as np
import re
from datetime import datetime

使用** mongoengine **  
[connect](http://docs.mongoengine.org/guide/connecting.html#guide-connecting)进行数据库连接

In [140]:
connect('typhoon')

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, read_preference=Primary())

## 1 读取文件

### 读取方式1：使用pandas

In [141]:
targetfilename=r"CH2017BST.txt"
# mac 16
# targetpath=r"/Users/casablanca/03project/typhoonSearchSys/demo_data"
# mac 15
targetpath=r"/Users/liusihan/Documents/01project/TyphoonSearchSys/demo_data"
fullname=os.path.join(targetpath,targetfilename)

In [142]:
fullname

'/Users/liusihan/Documents/01project/TyphoonSearchSys/demo_data/CH2017BST.txt'

不使用pandas的read_table

In [143]:
with open(fullname,'rb') as f:
    data=pd.read_table(f,sep='\s+',encoding='utf-8',header=None,infer_datetime_format=False)
    print('读取成功')

读取成功


In [144]:
data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,66666,0,25,0,1,0,6.0,(nameless),20180501.0
1,2017041406,0,109,1302,1010,10,,,
2,2017041412,0,107,1290,1010,10,,,


In [145]:
len(data)

857

In [146]:
data.iloc[0]

0          66666
1              0
2             25
3              0
4              1
5              0
6              6
7     (nameless)
8    2.01805e+07
Name: 0, dtype: object

--------

## 2 判断标志位的方式（必须同时满足），并获取标志位所在位置的数组（mark_indexs）

1-第一位是否为66666

In [13]:
data.iloc[0][0]==66666

True

2- 第7位是否不为Nan

In [14]:
data.iloc[0][7]!=np.nan

True

In [15]:
data.iloc[0][0]==66666 and data.iloc[0][7]!=np.nan

True

### 写成方法：

In [147]:
def checkMark(index):
    '''
        判断是否为标志位
    '''
    return data.iloc[index][0]==66666 and data.iloc[index][7]!=np.nan

找到标志位所在的位置

In [148]:
mark_indexs=[]
for i in range(len(data)):
#     print(i)
    if checkMark(i):
        mark_indexs.append(i)

In [149]:
# mark_indexs

In [150]:
mark_indexs[:5]

[0, 26, 54, 76, 98]

--------

## 3 获取观测值并写入mongoDB（暂时先写在list中）

In [20]:
mark_indexs[:3]

[0, 26, 54]

In [21]:
mark_indexs[1:3]

[26, 54]

In [151]:
# 保存起止数组的数组
list_startend=[]
index=0
for val in mark_indexs:
#     list_startend.append(mark_indexs[index:2])
#     print(mark_indexs[index:index+2])
    list_startend.append(mark_indexs[index:index+2])
    index=index+1
#     print(index)

In [23]:
list_startend[-2:]

[[789, 830], [830]]

[[789, 830], [830]]  
注意最后的数组是长度为1的数组，因为最后一次出现标志位后出现的都是观测数据，便结束了

In [24]:
list_startend[0:3]

[[0, 26], [26, 54], [54, 76]]

In [25]:
list_realdata=[]

** 获取第1行-25行，0-6列的数据 **

In [26]:
data.iloc[1:25,0:6].head()

Unnamed: 0,0,1,2,3,4,5
1,2017041406,0,109,1302,1010,10
2,2017041412,0,107,1290,1010,10
3,2017041418,0,107,1280,1010,10
4,2017041500,1,109,1272,1008,13
5,2017041506,1,111,1263,1008,13


In [27]:
data.iloc[1:25,0:6].iloc[1]

0    2017041412
1             0
2           107
3          1290
4          1010
5            10
Name: 2, dtype: int64

In [28]:
list_realdata.append(data.iloc[1:25,0:6])

In [29]:
# list_realdata

----

## 4 使用mongoEngine

In [152]:
from mongoengine import *

In [153]:
connect('typhoon')

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, read_preference=Primary())

In [32]:
# class typh

定义 mongodb中的存储对象（ORM）

In [155]:
class Point(EmbeddedDocument):
    '''
        点（经纬度）
    '''
    lat=FloatField()
    lon=FloatField()
#     class my_metaclass:
#         allow_inheritance=True

class GeoTyphoonRealData(Document):
    '''
        支持geojson的存储至mongodb的model
    '''

    code=StringField(max_length=10)
    date=DateTimeField()
    bp=FloatField()
    wsm=FloatField()
    # 注意此处与django中的类型不同，django的类型为IntegerField，mongoengine为IntField！
    level=IntField()
    # latlon=models.ForeignKey(Point,on_delete=models.CASCADE)
    latlon=EmbeddedDocumentField('Point')
#     meta = {'db_alias': 'Typhoon_geotyphoonrealdata'}
#     meta = {'collection': 'geotyphoonrealdata'}
    meta = {'collection': 'Typhoon_geotyphoonrealdata'}
    # object=MongoDBManager()

In [156]:
import datetime

In [35]:
typhoon_temp=GeoTyphoonRealData(code="",
                                date=datetime.datetime.now(),
                                bp=1010.9,
                                wsm=5.4,
                                latlon=Point(lat=18.2,lon=97.9))

In [36]:
typhoon_temp.save()

<GeoTyphoonRealData: GeoTyphoonRealData object>

** 提示错误如下 **:  
NotUniqueError: Tried to save duplicate unique keys (E11000 duplicate key error collection: typhoon.Typhoon_geotyphoonrealdata index: __primary_key__ dup key: { : null })  
是mongodb中的对应document中可能设置了__primary_key__，我的解决办法是 ** 删除掉 ** 

### S1 根据标志位遍历df

循环遍历list_startend

In [157]:
for val in list_startend:
    if len(val)>1:        
        print(f"起始{val[0]}——终止{val[1]-1}")
    elif len(val)==1:
        print(f"起始{val[0]}")

起始0——终止25
起始26——终止53
起始54——终止75
起始76——终止97
起始98——终止113
起始114——终止196
起始197——终止230
起始231——终止245
起始246——终止280
起始281——终止312
起始313——终止346
起始347——终止377
起始378——终止405
起始406——终止432
起始433——终止454
起始455——终止495
起始496——终止518
起始519——终止535
起始536——终止589
起始590——终止612
起始613——终止626
起始627——终止636
起始637——终止662
起始663——终止698
起始699——终止729
起始730——终止751
起始752——终止775
起始776——终止788
起始789——终止829
起始830


In [38]:
list_startend[0]

[0, 26]

In [39]:
len(list_startend[0])

2

In [135]:
# data.iloc[500:None]

In [158]:
def getHeaderBody(start,end=None):
    '''
        根据df的起止位置截取其中的头部（66666	0	25	0	1	0	6.0	(nameless)	20180501.0），
        以及数据内容body（2017041406	0	109	1302	1010	10）
    '''
    header=data.iloc[start-1]
    body=data.iloc[start:end,0:6]
    return header,body

In [41]:
header,body=getHeaderBody(1,25)

In [42]:
header

0          66666
1              0
2             25
3              0
4              1
5              0
6              6
7     (nameless)
8    2.01805e+07
Name: 0, dtype: object

In [43]:
body.head(3)

Unnamed: 0,0,1,2,3,4,5
1,2017041406,0,109,1302,1010,10
2,2017041412,0,107,1290,1010,10
3,2017041418,0,107,1280,1010,10


In [44]:
len(body)

24

In [159]:
from datetime import datetime

In [160]:
def convert2Typhoon(obj,code):
    '''
        根据传入的series将其转为typhoob Model
    '''
    lat=float(f"{str(obj[2])[:-1]}.{str(obj[2])[-1:]}")
    lon=float(f"{str(obj[3])[:-1]}.{str(obj[3])[-1:]}")
    stamp_str=obj[0]
    stamp=datetime.strptime(str(stamp_str),'%Y%m%d%H%M')
    typhoon_temp=GeoTyphoonRealData(code=code,
                                date=stamp,
                                bp=obj[4],
                                wsm=obj[5],
                                level=obj[1],
                                latlon=Point(lat=lat,lon=lon))
    return typhoon_temp

In [161]:
typhoon_list=[]
for i in range(len(body)):
    typhoon_list.append(convert2Typhoon(body.iloc[i],"nameless"))
    
#     print(body.iloc[i])

In [162]:
typhoon_list[1]

<GeoTyphoonRealData: GeoTyphoonRealData object>

### S2 拼接经纬度
获取经纬度：  
    - 将字符串转成float，并且最后一位为小数点后一位

In [82]:
str(body.iloc[1][2])[:-1]

'10'

In [83]:
str(body.iloc[1][2])[-1:]

'7'

In [84]:
lat=float(f"{str(body.iloc[1][2])[:-1]}.{str(body.iloc[1][2])[-1:]}")

In [85]:
str(body.iloc[1][3])[:-1]

'129'

In [86]:
str(body.iloc[1][3])[-1:]

'0'

In [87]:
lon=float(f"{str(body.iloc[1][3])[:-1]}.{str(body.iloc[1][3])[-1:]}")

In [88]:
[lat,lon]

[10.7, 129.0]

In [49]:
temp=body.iloc[1]

In [50]:
float(f"{str(temp[2])[:-1]}.{str(temp[2])[-1:]}")

10.7

In [51]:
float(f"{str(temp[3])[:-1]}.{str(temp[3])[-1:]}")

129.0

### S3 获取当前时间

In [68]:
stamp_str=body.iloc[1][0]
stamp_str

2017041412

In [73]:
str(stamp_str)

'2017041412'

In [74]:
datetime.strptime(str(stamp_str),'%Y%m%d%H%M')

datetime.datetime(2017, 4, 14, 1, 2)

### S4 写入mongodb

In [91]:
for temp in typhoon_list:
    temp.save()

### S5 遍历起始list，自动获取header以及body

In [105]:
list_startend[2]

[54, 76]

In [99]:
list_startend[0][0]

0

In [100]:
header,body=getHeaderBody(1,25)

In [102]:
header[7]

'(nameless)'

In [125]:
# body
25>=len(body-1)

True

In [131]:
len(list_startend[-1:])

1

In [132]:
len(list_startend)

30

In [163]:
typhoon_list=[]
for start in list_startend:
    start_index=start[0]
    end_index=None
    if len(start)==1:        
        end_index=None
    else:
#         start_index=start[0]
        end_index=start[1]
    # 获取header与body
#     header,body=None
    header,body=getHeaderBody(start_index+1,end_index)
    # 从header中获取name
    typhoon_name=header[7]
    
    print(f"当前body长度{len(body)}")
    for i in range(len(body)):
        if i>=len(body)-1:
            print("跳出本次循环")
            break
        temp_typhoon=body.iloc[i] 
        print(f"当前i:{i}")
        print(f"当前index:{start_index+1+i}")
        print(f"当前body:\n{body.iloc[i+1]}")
        typhoon_list.append(convert2Typhoon(body.iloc[i+1],typhoon_name))
    print('--------')

当前body长度25
当前i:0
当前index:1
当前body:
0    2017041412
1             0
2           107
3          1290
4          1010
5            10
Name: 2, dtype: int64
当前i:1
当前index:2
当前body:
0    2017041418
1             0
2           107
3          1280
4          1010
5            10
Name: 3, dtype: int64
当前i:2
当前index:3
当前body:
0    2017041500
1             1
2           109
3          1272
4          1008
5            13
Name: 4, dtype: int64
当前i:3
当前index:4
当前body:
0    2017041506
1             1
2           111
3          1263
4          1008
5            13
Name: 5, dtype: int64
当前i:4
当前index:5
当前body:
0    2017041512
1             1
2           114
3          1254
4          1008
5            13
Name: 6, dtype: int64
当前i:5
当前index:6
当前body:
0    2017041518
1             1
2           114
3          1242
4          1008
5            13
Name: 7, dtype: int64
当前i:6
当前index:7
当前body:
0    2017041600
1             0
2           114
3          1231
4          1008
5            10
Name: 8, dtype: i

当前index:251
当前body:
0    2017072206
1             1
2           177
3          1120
4          1000
5            15
Name: 252, dtype: int64
当前i:5
当前index:252
当前body:
0    2017072212
1             2
2           177
3          1117
4           998
5            18
Name: 253, dtype: int64
当前i:6
当前index:253
当前body:
0    2017072218
1             2
2           177
3          1115
4           998
5            18
Name: 254, dtype: int64
当前i:7
当前index:254
当前body:
0    2017072300
1             2
2           176
3          1114
4           998
5            18
Name: 255, dtype: int64
当前i:8
当前index:255
当前body:
0    2017072306
1             2
2           175
3          1114
4           998
5            18
Name: 256, dtype: int64
当前i:9
当前index:256
当前body:
0    2017072312
1             2
2           174
3          1114
4           998
5            18
Name: 257, dtype: int64
当前i:10
当前index:257
当前body:
0    2017072318
1             2
2           172
3          1114
4           998
5            18
Name: 2

当前index:646
当前body:
0    2017101312
1             2
2           172
3          1188
4           992
5            20
Name: 647, dtype: int64
当前i:9
当前index:647
当前body:
0    2017101318
1             3
2           172
3          1185
4           985
5            25
Name: 648, dtype: int64
当前i:10
当前index:648
当前body:
0    2017101400
1             3
2           173
3          1183
4           985
5            25
Name: 649, dtype: int64
当前i:11
当前index:649
当前body:
0    2017101406
1             3
2           177
3          1180
4           982
5            28
Name: 650, dtype: int64
当前i:12
当前index:650
当前body:
0    2017101412
1             3
2           186
3          1176
4           980
5            30
Name: 651, dtype: int64
当前i:13
当前index:651
当前body:
0    2017101418
1             4
2           195
3          1166
4           970
5            35
Name: 652, dtype: int64
当前i:14
当前index:652
当前body:
0    2017101421
1             4
2           198
3          1157
4           965
5            38
Nam

In [164]:
# 写入mongodb
for temp in typhoon_list:
    temp.save()