In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read the Aedes aegypti / Aedes albopictus occurrence records into a DataFrame.
# Source: Dryad dataset https://datadryad.org/stash/dataset/doi:10.5061/dryad.47v3c
# The path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Data/aegypti_albopictus.csv')

In [7]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
# NOTE(review): this cell's execution count (In [7]) is higher than that of the
# following cell (In [6]), and the recorded output already lists lowercase
# `latitude` / `longitude` columns, so it was evidently last run after the
# column-renaming cell. On a fresh Restart & Run All the output here will
# presumably show the raw CSV headers instead -- confirm and reorder if needed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42066 entries, 0 to 42065
Data columns (total 12 columns):
vector           42066 non-null object
occurrence_id    42066 non-null int64
source_type      42066 non-null object
location_type    41586 non-null object
polygon_admin    40960 non-null object
latitude         42066 non-null float64
longitude        42066 non-null float64
year             40828 non-null object
country          42066 non-null object
country_id       42066 non-null object
gaul_ad0         42066 non-null int64
status           331 non-null object
dtypes: float64(2), int64(2), object(8)
memory usage: 3.9+ MB


In [6]:
# Normalise the column labels without mutating the loaded frame in place:
#   1. lowercase every header so later cells can use attribute access
#      (e.g. `df.year`);
#   2. give the coordinate columns descriptive names -- x is longitude and
#      y is latitude.
# `rename` returns a new frame, so re-running this cell is idempotent:
# already-lowercase labels are unchanged and missing 'x'/'y' keys are ignored.
df = (
    df
    .rename(columns=str.lower)
    .rename(columns={'x': 'longitude', 'y': 'latitude'})
)

In [13]:
# Clean and format the occurrence records into `mosquito`.
#
# * Rows whose `year` is the literal range '2006-2008' are replicated once for
#   each of 2006, 2007 and 2008, so every row carries a single year.
#   NOTE(review): the replicated rows keep all other columns, so they share
#   the same `occurrence_id` -- confirm downstream code does not rely on it
#   being unique.
# * Rows with a missing `year` are dropped.
# * `year` is converted to int; this raises if any other non-numeric year
#   string is present in the data.
# * The result is ordered by `year`.
#
# Everything is built as one chain of non-mutating calls. Assigning
# `mosquito['year'] = ...` on a frame obtained from `.loc[mask]` is the pattern
# that triggers pandas' SettingWithCopyWarning, and `inplace=True` sorting
# makes the cell harder to re-run safely.
year_range_rows = df.loc[df.year == '2006-2008']

mosquito = (
    pd.concat(
        # single-year rows first, then one copy of the range rows per year --
        # this keeps the pre-sort row order of the two-step concatenation
        [df.loc[df.year != '2006-2008']]
        + [year_range_rows.assign(year=single_year)
           for single_year in ['2006', '2007', '2008']]
    )
    .dropna(subset=['year'])
    .assign(year=lambda frame: frame.year.astype(int))
    .sort_values('year')
)

In [14]:
# Preview the first rows of the cleaned frame (sorted by year in the cell above),
# so the earliest records appear first.
mosquito.head()

Unnamed: 0,vector,occurrence_id,source_type,location_type,polygon_admin,latitude,longitude,year,country,country_id,gaul_ad0,status
0,Aedes aegypti,1,published,point,-999,-3.22,40.07,1958,Kenya,KEN,133,
26,Aedes aegypti,27,unpublished,polygon,2,27.18,-82.32,1960,United States of America,USA,259,
27,Aedes aegypti,28,unpublished,polygon,2,27.18,-81.81,1960,United States of America,USA,259,
28,Aedes aegypti,29,unpublished,polygon,2,27.34,-81.34,1960,United States of America,USA,259,
29,Aedes aegypti,30,unpublished,polygon,2,27.38,-80.49,1960,United States of America,USA,259,


In [16]:
# keep only records from 2006 onwards (the bound is inclusive: year >= 2006)
mosquito = mosquito.loc[mosquito['year'].ge(2006)]

In [19]:
# Summarise the filtered frame: row count, non-null counts and dtypes
# (`year` should now be an integer column).
mosquito.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24019 entries, 34478 to 21354
Data columns (total 12 columns):
vector           24019 non-null object
occurrence_id    24019 non-null int64
source_type      24019 non-null object
location_type    23769 non-null object
polygon_admin    23368 non-null object
latitude         24019 non-null float64
longitude        24019 non-null float64
year             24019 non-null int64
country          24019 non-null object
country_id       24019 non-null object
gaul_ad0         24019 non-null int64
status           183 non-null object
dtypes: float64(2), int64(3), object(7)
memory usage: 2.4+ MB


In [22]:
# Persist the cleaned, filtered records as UTF-8 CSV in the working directory
# so later analyses can load them directly; the DataFrame index is not written.
mosquito.to_csv(path_or_buf='mosquito.csv', index=False, encoding='utf-8')