In [16]:
import pandas as pd
import numpy as np

import glob
import os

import datetime as dt
from datetime import datetime,timedelta
from pytz import all_timezones

import json, requests
import pickle
import OpenSSL

In [None]:
#Mike Smith's functions
def download(apiurl,cache='use',verbose=False,apikey=None,cacheonly=None):
    """
    download(apiurl,cache='use',verbose=False,apikey=None):
    Loads thingspeak data from apiurl
    Set cache to:
        'use' - to use it
        'refresh' - to not use it
        'only' - to only use it
    cacheonly = if set, only cache this many previous training points,
       will only report this many when output. This is useful to avoid
       caches becoming arbitrarily large with historic data.
    """
    filename = 'channel%s.p'%apiurl.split('/')[-1]
    cachefile = os.path.isfile(filename)
    if (cache=='use' or cache=='only') and cachefile:
        alldata = pickle.load( open( filename, "rb" ) )
        if (cache=='only'):
            if verbose: print("Using just cache - may be out of date")
            return alldata
        if verbose: print("Using cache")
        nextid = alldata[-1]['entry_id']+1
        endtime = str_to_date(alldata[-1]['created_at'])+timedelta(seconds=1)
    else: #no cachefile or refresh -> we want to reload from the API
        if verbose: print("Ignoring/overwriting cache")
        if (cache=='only'):
            ##TODO Throw exception - can't only use cache as there is no cache
            assert False, "Can't only use cache as there is no cache"
        nextid = 1
        alldata = []
        endtime = None  
    if (cache=='only'): #we should stop now, and use the cached data we've got
        return alldata
        
    result = None
    if verbose: print("Using %d records from cache" % len(alldata))
    while result != -1:
        #thingspeak doesn't let you download ranges of ids, instead you have to
        #download ranges of dates. We can only download 8000 at a time, so we
        #need to get the date of the next one we need (then we ask for that datetime
        #until now, and repeat until we run out of new items).
        url = apiurl+'/feeds/entry/%d.json' % (nextid)
        if apikey is not None: url += '?api_key=%s' % apikey
        print("Loading from %s" % url)
        result = json.loads(requests.post(url, verify = True, timeout = 100.0).content.decode('utf-8'))
        starttime = endtime
        if result==-1:
            #if verbose: print("Warning: Unable to retrieve data (does channel exist? is it public?)")
            endtime = datetime.now()
        else:
            endtime = str_to_date(result['created_at'])
        if (nextid==1):
            starttime = endtime
        else:
            start = datetime.strftime(starttime,'%Y-%m-%dT%H:%M:%SZ')
            end = datetime.strftime(endtime-timedelta(seconds=1),'%Y-%m-%dT%H:%M:%SZ')
            url = apiurl+'/feeds.json?start=%s&end=%s' % (start,end)
            if apikey is not None: url += '&api_key=%s' % apikey
            print("Loading from %s" % url)                        
            data = json.loads(requests.post(url, verify = True, timeout = 100.0).content.decode('utf-8'))
            if (data!=-1):
                alldata.extend(data['feeds'])
                if verbose: print("    Adding %d records..." % len(data['feeds']))
            else:
                if verbose: print("Warning: unable to read data feed")
            
        nextid += 7999 #thought download was 8000 fields, but it's 8000 records. 8000/len(result)
    if verbose: print("New cache has %d records, saving." % len(alldata))
    
    if cacheonly is not None:
        pickle.dump( alldata[-cacheonly:], open( filename, "wb" ) )
    else:
        pickle.dump( alldata, open( filename, "wb" ) )
    return alldata
    
def str_to_date(st):
    return datetime.strptime(st,'%Y-%m-%dT%H:%M:%SZ')

In [3]:
#These should go in a database
metadata_path = r'D:\LILLIAN\AirQo\channel_metadata.csv'
channel_metadata = pd.read_csv(metadata_path, encoding = "ISO-8859-1")
channel_metadata.head()

Unnamed: 0,channel,channel_name,channel_id,read_api_key
0,AIRQO WB1 MOBILE UNIT ACTIVE,AQ_01,643676,MXMFGRF4ERL4VKI2
1,AIRQO-WB2 MOBILE UNIT ACTIVE,AQ_02,667402,A7E6OGD6QRIAVVK7
2,AIRQO-WB3 MOBILE UNIT ACTIVE,AQ_03,667406,1K40QAHJQLR3R6HB
3,AIRQO-WB4 UNIT ACTIVE,AQ_04,672528,YOW5ITSXCLW7IA0C
4,AIRQO-WB5 UNIT ACTIVE,AQ_05,675740,H3EY1WFQK4M2C2ZP


In [4]:
channel_ids = channel_metadata['channel_id']
channel_ids[:3]

0    643676
1    667402
2    667406
Name: channel_id, dtype: object

In [5]:
channel_keys= channel_metadata['read_api_key']
channel_keys[:3]

0    MXMFGRF4ERL4VKI2
1    A7E6OGD6QRIAVVK7
2    1K40QAHJQLR3R6HB
Name: read_api_key, dtype: object

In [6]:
channel_dict = dict(zip(channel_ids, channel_keys))
channel_dict.items()

dict_items([('643676', 'MXMFGRF4ERL4VKI2'), ('667402', 'A7E6OGD6QRIAVVK7'), ('667406', '1K40QAHJQLR3R6HB'), ('672528', 'YOW5ITSXCLW7IA0C'), ('675740', 'H3EY1WFQK4M2C2ZP'), ('675801', '0T0B4DV3ZXBIGF7L'), ('675805', 'T7I8LW37U46H7L9L'), ('675851', 'UJDHKVXFA7MCWUIN'), ('675991', 'WIK7QY0GKOYZFYB6'), ('676000', 'R1UWA4NKVLEWRT8A'), ('689508', 'W3TREK1FFIFCQET6'), ('689511', 'GZMTUYFZ3WKL09DA'), ('689516', 'N6OC9S4QP5R26D38'), ('\xa0689518', 'SP661YXSYJLE091E'), ('689520', 'LUY3VEB20UIE4G1I'), ('689522', 'KRK2MEGVMD22YY59'), ('689525', 'NWFG4DF3K0KEYT15'), ('689530', '9FWEYOXMJHJG0PEB'), ('689532', '2GGXH0YLJC992BLG'), ('689749', '1CF1BEEI1OJ67E3H'), ('689750', 'LWGIC3U8ZR004J04'), ('689752', '98KV1E257QS3SASB'), ('689753', '1FL0QL9OTDMN9T33'), ('689756', 'EIVEW4EZHD38VLN7'), ('689759', 'FG9PD1OUD9OLGZB3'), ('689761', 'YEGZIVUS16X2R4BO'), ('689766', '3XTH7YLBLURX0XQB'), ('689768', 'IKZL0YTFYDWN701G'), ('718028', 'HNTV5QEJTD8RTG2H'), ('718029', 'TRF8VHH9DWUKBT59'), ('718030', '2VDX6R4QQY92

In [8]:
channel_data_list = []
for channel_id, channel_key in channel_dict.items():
    channel_url = 'http://thingspeak.com/channels/'+str(channel_id)
    data = download(channel_url, verbose = True, apikey = channel_key)
    
    df = pd.DataFrame(data)#creating a dataframe of the data
    
    df['created_at'] =  pd.to_datetime(df['created_at']) #converting to DateTime format
    df['created_at'] = df['created_at'].dt.tz_convert('Africa/Kampala') #Converting from UTC to GMT
    
    df['channel_id'] = channel_id #adding an additional column
    print(channel_id, ":done!")
    channel_data_list.append(df)
    

Using cache
Using 76984 records from cache
Loading from http://thingspeak.com/channels/643676/feeds/entry/76985.json?api_key=MXMFGRF4ERL4VKI2
Loading from http://thingspeak.com/channels/643676/feeds.json?start=2019-06-23T20:43:37Z&end=2019-10-02T10:34:04Z&api_key=MXMFGRF4ERL4VKI2
    Adding 0 records...
New cache has 76984 records, saving.
643676 :done!
Using cache
Using 155831 records from cache
Loading from http://thingspeak.com/channels/667402/feeds/entry/155832.json?api_key=A7E6OGD6QRIAVVK7
Loading from http://thingspeak.com/channels/667402/feeds.json?start=2019-10-02T06:50:01Z&end=2019-10-02T06:50:33Z&api_key=A7E6OGD6QRIAVVK7
    Adding 0 records...
Loading from http://thingspeak.com/channels/667402/feeds/entry/163831.json?api_key=A7E6OGD6QRIAVVK7
Loading from http://thingspeak.com/channels/667402/feeds.json?start=2019-10-02T06:50:34Z&end=2019-10-02T10:34:38Z&api_key=A7E6OGD6QRIAVVK7
    Adding 61 records...
New cache has 155892 records, saving.
667402 :done!
Using cache
Using 114

ChunkedEncodingError: ('Connection broken: IncompleteRead(591 bytes read, 9649 more expected)', IncompleteRead(591 bytes read, 9649 more expected))

In [29]:
#data = download('http://thingspeak.com/channels/675991',verbose=True,apikey='WIK7QY0GKOYZFYB6') #AQ_09
#data = download('http://thingspeak.com/channels/718028',verbose=True,apikey='HNTV5QEJTD8RTG2H') #AQ_29
#data = download('http://thingspeak.com/channels/676000',verbose=True,apikey='R1UWA4NKVLEWRT8A') #AQ_10

try:
    data = download('http://thingspeak.com/channels/675991',verbose=True,apikey='WIK7QY0GKOYZFYB6') #AQ_09
except (SSL.Error, e):
    print (e)

Ignoring/overwriting cache
Using 0 records from cache
Loading from http://thingspeak.com/channels/675991/feeds/entry/1.json?api_key=WIK7QY0GKOYZFYB6




Loading from http://thingspeak.com/channels/675991/feeds/entry/8000.json?api_key=WIK7QY0GKOYZFYB6




Loading from http://thingspeak.com/channels/675991/feeds.json?start=2019-01-12T16:52:57Z&end=2019-01-28T18:26:04Z&api_key=WIK7QY0GKOYZFYB6




NameError: name 'SSL' is not defined

In [None]:
channel_data_list[0].head()

In [None]:
#Column names for the different types of sensors
PMS_heads_7 = ['time', 'entry_id', 'pm25', 'pm10', 's2_pm2_5', 's2_pm10', 'lat', 'long', 'voltage', 'latitude','longitude','elevation', 'status', 'device_name']
PMS_heads_8 = ['time', 'entry_id', 'pm2_5', 'pm10', 's2_pm2_5', 's2_pm10', 'lat', 'long', 'voltage', 'gps', 'latitude','longitude','elevation', 'status', 'device_name']
OPC_N2_heads_7 = ['time', 'entry_id', 'pm1', 'pm2_5', 'pm10', 'sample_period', 'lat', 'long', 'voltage', 'latitude','longitude','elevation', 'status', 'device_name']
OPC_N2_heads_8 = ['time', 'entry_id', 'pm1', 'pm2_5', 'pm10', 'sample_period', 'lat', 'long', 'voltage', 'gps', 'latitude','longitude','elevation', 'status', 'device_name']
PA_heads = ['time', 'entry_id', 'pm1', 'pm2_5', 'pm10', 'uptime', 'RSSI', 'temp', 'humidity', 'pm2_5_cf1', 'lat','long','elevation', 'status', 'device_name']

In [None]:
# creating a dictionary of the csv file names and their respective csv files
channel_dict = dict(zip(all_feeds, channel_list)) #where all_feeds=filenames and channel_list=channel dataframes

In [None]:
# Setting the column names for the different csv files based on the file name
for filename, dataframe in channel_dict.items():
                    
    rows,columns = channel_dict[filename].shape
    
    if (('AQ_' in filename) and (columns==15)):
        print (filename, ': AQ_15')
        channel_dict[filename].columns = PMS_heads_8         
            
    elif(('AQ_' in filename) and (columns==14)):
        print (filename, ': AQ_14')
        channel_dict[filename].columns = PMS_heads_7            
            
    elif ('PA' in filename):
        print (filename, ': PA')
        channel_dict[filename].columns = PA_heads
            
            
    elif (('8A' in filename) or ('6F' in filename)):
        print (filename, ': 8A/6F')
        channel_dict[filename].columns = OPC_N2_heads_8            
            
    else:
        print (filename, ': the rest')
        channel_dict[filename].columns = OPC_N2_heads_7  

In [None]:
#Adding latitude and longitude coordinates for Purple Air Sensors
for filename, dataframe in channel_dict.items():
    if ('PA_01' in filename): #International School Lubowa
        dataframe['lat'] = 0.2357
        dataframe['long'] = 32.5576
        #print (dataframe.head())
    elif ('PA_02' in filename): #Makerere
        dataframe['lat'] = 0.332050  #estimates
        dataframe['long'] = 32.570509
    elif ('PA_03' in filename): #Kabale
        dataframe['lat'] = -1.245 
        dataframe['long'] = 29.9892
    elif ('PA_04' in filename): #Bunamwaya
        dataframe['lat'] = 0.27
        dataframe['long'] = 32.558


In [None]:
#Combining all the data into one dataframe
combined_channel_data = pd.concat(channel_dict, join='outer', sort=False, ignore_index=True)
print (combined_channel_data.shape)
combined_channel_data.head()

In [None]:
print (combined_channel_data.columns)

In [None]:
#Rearranging columns to make more sense
combined_channel_data = combined_channel_data[['time', 'device_name','entry_id', 'pm1', 'pm2_5', 'pm10', 's2_pm2_5', 's2_pm10', 
                                               'lat', 'long', 'voltage', 'sample_period', 'uptime', 'RSSI', 'temp', 
                                               'humidity', 'latitude', 'longitude', 'elevation', 'status','gps', 'pm2_5_cf1']]

combined_channel_data.shape

In [None]:
#Creating a csv from the combined dataframe
joined_csvs = combined_channel_data.to_csv('airqo_master_AUG_19_proposed.csv', index = False)

In [None]:
print (combined_channel_data.columns)
combined_channel_data.info()

In [None]:
#Dropping unimportant columns in analysis and modelling
drop_columns = ['entry_id', 'voltage', 'sample_period', 'uptime', 'RSSI', 'temp', 'humidity', 
                'latitude', 'longitude', 'elevation', 'status', 'gps', 'pm2_5_cf1']

In [None]:
final_data = combined_channel_data.drop(drop_columns, axis = 1)

In [None]:
final_data.tail()

In [None]:
#setting the time column as the index
final_indexed_data = final_data.set_index('time')

In [None]:
final_indexed_data.head()

In [None]:
#Finding out how many duplicate indices are there in the final data
idx = pd.Index(final_indexed_data)
duplicates = idx.duplicated()
count = 0
for duplicate in duplicates:
    if (duplicate == True):
        count+=1
print (count)
    

In [None]:
final_indexed_data.info()

In [None]:
#Changing the datatype of the columns from object to float
cols = ['pm1', 'pm2_5', 'long', 's2_pm2_5', 'pm10']
for col in cols:
    final_indexed_data[col] = pd.to_numeric(final_indexed_data[col], errors='coerce')

In [None]:
final_indexed_data.info()

In [None]:
final_indexed_data.describe()

### Latitude and Longitude

In [None]:
# Finding out how many latitude or longitude entries have 0 or 1000
bad_lat = []
bad_long = []

for lat in final_indexed_data['lat']:
    if ((lat==0) or (lat==1000)):
        bad_lat.append(lat)
print ('Bad latitudes are ', len(bad_lat))
for long in final_indexed_data['long']:
    if ((long==0) or (long==1000)):
        bad_long.append(long)
print ('Bad longitudes are ', len(bad_long))

In [None]:
#Replacing latitude and longitude values of 0 and 1000 with Null
final_indexed_data['lat'] = final_indexed_data['lat'].replace([0,1000], np.nan)
final_indexed_data['long'] = final_indexed_data['long'].replace([0,1000], np.nan)
final_indexed_data.head(5)

In [None]:
final_indexed_data['lat'] = final_indexed_data['lat'].fillna(method = 'bfill')
final_indexed_data['long'] = final_indexed_data['long'].fillna(method = 'bfill')
final_indexed_data.head()

In [None]:
print (final_indexed_data['long'].isnull().sum())
print (final_indexed_data['lat'].isnull().sum())

In [None]:
#Dropping data with gps coordinates outside uganda
outside_indices = final_indexed_data.loc[(final_indexed_data.lat < -1.482074) | (final_indexed_data.lat > 4.221103) | 
                                         (final_indexed_data.long < 29.571283) | (final_indexed_data.long > 35.022797)].index
print(len(outside_indices))
final_indexed_data.drop(outside_indices, inplace = True)
final_indexed_data.info()

In [None]:
#Creating a csv from the combined dataframe
data_to_database = final_indexed_data.to_csv('database_data.csv', index = False)

In [46]:
#defining a function for reading data from specified url
def read_channel_data(channel_name, channel_id):
    channel_url = '{0}/{1}/feeds.{2}'.format(base_url, channel_id, 'json')
    try:
        response = requests.get(channel_url,timeout=50.0)
    except requests.TimeOut as err:
        print(err.message)
    
    if (response.status_code == 200):
        print(channel_name+ ' :Successful!')
        return json.loads(response.content.decode('utf-8'))
    
    else:
        print(channel_name+' :Failed!')
        return None