Correlate the two sources of climate data: (1) IMS and (2) Igud Arim monitoring stations.
Check whether there is a drift in either source.

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [2]:
# Runtime switch: set to True when executing on Google Colab.
colab = False

if not colab:
    # Local execution: inputs are read from the relative 'data' directory.
    datadir = 'data'
else:
    # Colab execution: mount Google Drive and read the inputs from there.
    from google.colab import drive
    drive.mount('/content/drive')
    datadir = "/content/drive/My Drive/תחלואה/notebooks/data"

In [3]:
# Read the IMS weather table, parse its 'date' column into datetimes and
# retain only the columns whose name matches 'date' or 'tmp_air_dry'.
ims = (
    pd.read_csv(datadir + '/weather.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .filter(regex='(date|tmp_air_dry).*')
)

In [4]:
# Read IGUD sensor data. The column that pandas labels 'Unnamed: 0' is renamed
# to 'DateTime' and parsed into datetimes.
igud = pd.read_excel(datadir + "/StationData.xlsx")
igud = igud.rename(columns={'Unnamed: 0': 'DateTime'})
igud["DateTime"] = pd.to_datetime(igud["DateTime"])

# Many columns are empty or sparse. Require 80% availability of data per column
min_non_null = len(igud) * 0.8
igud = igud.dropna(thresh=min_non_null, axis=1)

# Keep just the climate stuff: the timestamp plus the '*_TEMP' columns
igud = igud.filter(regex='.*(_TEMP|DateTime)')

# Some erroneous values in the Carmelia station need to be filtered out:
# readings above 50 are replaced by NaN.
igud['Carmelia_TEMP'] = igud['Carmelia_TEMP'].mask(igud['Carmelia_TEMP'] > 50)

In [5]:
# Summarize the cleaned IGUD frame: column names, non-null counts and dtypes.
igud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 9 columns):
DateTime                  1096 non-null datetime64[ns]
Romema_TEMP               1037 non-null float64
Kiryat Ata_TEMP           1007 non-null float64
Neve Shaanan_TEMP         934 non-null float64
Neve Yosef_TEMP           1067 non-null float64
Carmelia_TEMP             957 non-null float64
Yizraelia_TEMP            1054 non-null float64
Igud (check-post)_TEMP    1051 non-null float64
D.CARMEL_TEMP             981 non-null float64
dtypes: datetime64[ns](1), float64(8)
memory usage: 77.1 KB


In [6]:
# Join the IMS and IGUD tables on their timestamp columns
# ('date' on the IMS side, 'DateTime' on the IGUD side).
j = pd.merge(
    ims,
    igud,
    left_on='date',
    right_on='DateTime',
)

In [7]:
# List the column names of the merged frame (IMS columns first, then IGUD).
print(j.columns.values)

['date' 'tmp_air_dry Bazan' 'tmp_air_dry Afek' 'tmp_air_dry Technion'
 'DateTime' 'Romema_TEMP' 'Kiryat Ata_TEMP' 'Neve Shaanan_TEMP'
 'Neve Yosef_TEMP' 'Carmelia_TEMP' 'Yizraelia_TEMP'
 'Igud (check-post)_TEMP' 'D.CARMEL_TEMP']


In [9]:
# Pairwise comparison grid over the merged frame: scatter plots above the
# diagonal, KDE plots below the diagonal and on the diagonal itself.
g = (
    sns.PairGrid(j, diag_sharey=False)
    .map_upper(sns.scatterplot)
    .map_lower(sns.kdeplot)
    .map_diag(sns.kdeplot, lw=2)
)
g

  return self.reduce(a)
  return self.reduce(a)


<seaborn.axisgrid.PairGrid at 0x12191cfd0>

In [None]:
# Split the merged columns by origin: IMS columns contain '_dry ' in their
# name, IGUD columns contain '_TEMP'.
ims_cols = j.filter(like='_dry ').columns
igud_cols = j.filter(like='_TEMP').columns


In [None]:

# For every (IMS column, IGUD column) pair, fit a straight line through the
# origin,  igud ~ slope * ims,  and record the slope and the fit's score.
# NOTE(review): the model has no intercept term (fit_intercept=False) —
# confirm that a through-origin fit is what is wanted here.
res_ims = []
res_igud = []
res_coef = []
res_score = []

for ims_c in ims_cols:
    for igud_c in igud_cols:
        # Use only the rows where both series have a value.
        pair = j[[ims_c, igud_c]].dropna()

        # Single regressor: the IMS series (2-D, as scikit-learn expects).
        X = pair[[ims_c]]
        y = pair[igud_c]
        reg = LinearRegression(fit_intercept=False).fit(X, y)

        res_ims.append(ims_c)
        res_igud.append(igud_c)
        res_score.append(reg.score(X, y))
        res_coef.append(reg.coef_[0])

In [None]:
# Collect the per-pair fit results into one table, one row per column pair.
F = pd.DataFrame(dict(ims=res_ims, igud=res_igud, score=res_score, coef=res_coef))

In [None]:
# Fitted slope for every IGUD column, one bar per IMS column.
chart = sns.barplot(data=F, x='igud', y='coef', hue='ims')
# The column names are long; tilt them so they do not overlap.
plt.setp(chart.get_xticklabels(), rotation=45, horizontalalignment='right')

chart.legend(loc='lower left')
# Restrict the y-range to the band around 1.0.
chart.set_ylim(0.8, 1.2)
chart.set_xlabel('IGUD column')
chart.set_ylabel('slope')
# 'coef' holds the slope of the linear fit, not a correlation coefficient.
chart.set_title('Linear fit slope (IGUD vs. IMS)');

In [None]:
# Goodness of fit for every IGUD column, one bar per IMS column.
chart = sns.barplot(data=F, x='igud', y='score', hue='ims')
# The column names are long; tilt them so they do not overlap.
plt.setp(chart.get_xticklabels(), rotation=45, horizontalalignment='right')

chart.legend(loc='lower left')
# Restrict the y-range to the upper band of scores.
chart.set_ylim(0.75, 1)
chart.set_xlabel('IGUD column')
chart.set_ylabel('R^2')
# 'score' holds LinearRegression.score, i.e. the R^2 of the fit, not a
# correlation value.
chart.set_title('Linear fit score, R^2 (IGUD vs. IMS)');