In [1]:
from mongoengine import *
import os
import pandas as pd
import numpy as np
import re

使用** mongoengine **  
[connect](http://docs.mongoengine.org/guide/connecting.html#guide-connecting)进行数据库连接

In [2]:
connect('typhoon')

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, read_preference=Primary())

## 1 读取文件

### 读取方式1：使用pandas

In [3]:
targetfilename=r"CH2017BST.txt"
targetpath=r"/Users/casablanca/03project/typhoonSearchSys/demo_data"
fullname=os.path.join(targetpath,targetfilename)

In [4]:
fullname

'/Users/casablanca/03project/typhoonSearchSys/demo_data/CH2017BST.txt'

不使用pandas的read_table

In [5]:
with open(fullname,'rb') as f:
    data=pd.read_table(f,sep='\s+',encoding='utf-8',header=None,infer_datetime_format=False)
    print('读取成功')

读取成功


In [6]:
data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,66666,0,25,0,1,0,6.0,(nameless),20180501.0
1,2017041406,0,109,1302,1010,10,,,
2,2017041412,0,107,1290,1010,10,,,


In [7]:
len(data)

857

In [8]:
data.iloc[0]

0          66666
1              0
2             25
3              0
4              1
5              0
6              6
7     (nameless)
8    2.01805e+07
Name: 0, dtype: object

--------

## 2 判断标志位的方式（必须同时满足），并获取标志位所在位置的数组（mark_indexs）

1-第一位是否为66666

In [9]:
data.iloc[0][0]==66666

True

2- 第7位是否不为Nan

In [10]:
data.iloc[0][7]!=np.nan

True

In [11]:
data.iloc[0][0]==66666 and data.iloc[0][7]!=np.nan

True

### 写成方法：

In [12]:
def checkMark(index):
    '''
        判断是否为标志位
    '''
    return data.iloc[index][0]==66666 and data.iloc[index][7]!=np.nan

找到标志位所在的位置

In [13]:
mark_indexs=[]
for i in range(len(data)):
#     print(i)
    if checkMark(i):
        mark_indexs.append(i)

In [31]:
# mark_indexs

In [14]:
mark_indexs[:5]

[0, 26, 54, 76, 98]

--------

## 3 获取观测值并写入mongoDB（暂时先写在list中）

In [15]:
mark_indexs[:3]

[0, 26, 54]

In [16]:
mark_indexs[1:3]

[26, 54]

In [32]:
# 保存起止数组的数组
list_startend=[]
index=0
for val in mark_indexs:
#     list_startend.append(mark_indexs[index:2])
#     print(mark_indexs[index:index+2])
    list_startend.append(mark_indexs[index:index+2])
    index=index+1
#     print(index)

In [36]:
list_startend[-2:]

[[789, 830], [830]]

[[789, 830], [830]]  
注意最后的数组是长度为1的数组，因为最后一次出现标志位后出现的都是观测数据，便结束了

In [18]:
list_startend[0:3]

[[0, 26], [26, 54], [54, 76]]

In [19]:
list_realdata=[]

** 获取第1行-25行，0-6列的数据 **

In [20]:
data.iloc[1:25,0:6].head()

Unnamed: 0,0,1,2,3,4,5
1,2017041406,0,109,1302,1010,10
2,2017041412,0,107,1290,1010,10
3,2017041418,0,107,1280,1010,10
4,2017041500,1,109,1272,1008,13
5,2017041506,1,111,1263,1008,13


In [39]:
data.iloc[1:25,0:6].iloc[1]

0    2017041412
1             0
2           107
3          1290
4          1010
5            10
Name: 2, dtype: int64

In [21]:
list_realdata.append(data.iloc[1:25,0:6])

In [22]:
# list_realdata

----

## 4 使用mongoEngine

In [23]:
from mongoengine import *

In [24]:
connect('typhoon')

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, read_preference=Primary())

In [25]:
# class typh

SyntaxError: invalid syntax (<ipython-input-25-b6475c2f6feb>, line 1)

定义 mongodb中的存储对象（ORM）

In [26]:
class Point(EmbeddedDocument):
    '''
        点（经纬度）
    '''
    lat=FloatField()
    lon=FloatField()
#     class my_metaclass:
#         allow_inheritance=True

class GeoTyphoonRealData(Document):
    '''
        支持geojson的存储至mongodb的model
    '''

    code=StringField(max_length=10)
    date=DateTimeField()
    bp=FloatField()
    wsm=FloatField()
    # latlon=models.ForeignKey(Point,on_delete=models.CASCADE)
    latlon=EmbeddedDocumentField('Point')
#     meta = {'db_alias': 'Typhoon_geotyphoonrealdata'}
#     meta = {'collection': 'geotyphoonrealdata'}
    meta = {'collection': 'Typhoon_geotyphoonrealdata'}
    # object=MongoDBManager()

In [27]:
import datetime

In [28]:
typhoon_temp=GeoTyphoonRealData(code="",
                                date=datetime.datetime.now(),
                                bp=1010.9,
                                wsm=5.4,
                                latlon=Point(lat=18.2,lon=97.9))

In [29]:
typhoon_temp.save()

<GeoTyphoonRealData: GeoTyphoonRealData object>

** 提示错误如下 **:  
NotUniqueError: Tried to save duplicate unique keys (E11000 duplicate key error collection: typhoon.Typhoon_geotyphoonrealdata index: __primary_key__ dup key: { : null })  
是mongodb中的对应document中可能设置了__primary_key__，我的解决办法是 ** 删除掉 ** 

### S1 根据标志位遍历df

循环遍历list_startend

In [49]:
for val in list_startend:
    if len(val)>1:        
        print(f"起始{val[0]}——终止{val[1]-1}")
    elif len(val)==1:
        print(f"起始{val[0]}")

起始0——终止25
起始26——终止53
起始54——终止75
起始76——终止97
起始98——终止113
起始114——终止196
起始197——终止230
起始231——终止245
起始246——终止280
起始281——终止312
起始313——终止346
起始347——终止377
起始378——终止405
起始406——终止432
起始433——终止454
起始455——终止495
起始496——终止518
起始519——终止535
起始536——终止589
起始590——终止612
起始613——终止626
起始627——终止636
起始637——终止662
起始663——终止698
起始699——终止729
起始730——终止751
起始752——终止775
起始776——终止788
起始789——终止829
起始830


In [45]:
list_startend[0]

[0, 26]

In [46]:
len(list_startend[0])

2

In [61]:
def getHeaderBody(start,end):
    '''
        根据df的起止位置截取其中的头部（66666	0	25	0	1	0	6.0	(nameless)	20180501.0），
        以及数据内容body（2017041406	0	109	1302	1010	10）
    '''
    header=data.iloc[start-1]
    body=data.iloc[start:end,0:6]
    return header,body

In [62]:
header,body=getHeaderBody(1,25)

In [63]:
header

0          66666
1              0
2             25
3              0
4              1
5              0
6              6
7     (nameless)
8    2.01805e+07
Name: 0, dtype: object

In [64]:
body.head(3)

Unnamed: 0,0,1,2,3,4,5
1,2017041406,0,109,1302,1010,10
2,2017041412,0,107,1290,1010,10
3,2017041418,0,107,1280,1010,10


In [65]:
len(body)

24

In [None]:
def convert2Typhoon(obj,code):
    '''
        根据传入的series将其转为typhoob Model
    '''
    lat=float(f"{str(obj[2])[:-1]}.{str(body.iloc[1][2])[-1:]}")
    lon=
    typhoon_temp=GeoTyphoonRealData(code=code,
                                date=datetime.datetime.now(),
                                bp=obj[4],
                                wsm=obj[5],
                                latlon=Point(lat=18.2,lon=97.9))

In [89]:
for i in range(len(body)):
#     print(body.iloc[i])

SyntaxError: unexpected EOF while parsing (<ipython-input-89-07416a4a1c49>, line 2)

In [67]:
body.iloc[i]

0    2017042000
1             0
2           208
3          1208
4          1010
5            10
Name: 24, dtype: int64

### S2 拼接经纬度
获取经纬度：  
    - 将字符串转成float，并且最后一位为小数点后一位

In [82]:
str(body.iloc[1][2])[:-1]

'10'

In [83]:
str(body.iloc[1][2])[-1:]

'7'

In [84]:
lat=float(f"{str(body.iloc[1][2])[:-1]}.{str(body.iloc[1][2])[-1:]}")

In [85]:
str(body.iloc[1][3])[:-1]

'129'

In [86]:
str(body.iloc[1][3])[-1:]

'0'

In [87]:
lon=float(f"{str(body.iloc[1][3])[:-1]}.{str(body.iloc[1][3])[-1:]}")

In [88]:
[lat,lon]

[10.7, 129.0]