In [1]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from itertools import combinations

In [2]:
# FUNCTIONS

# Pass parallel lists of dataset names and dataframes; every pairwise combination of datasets is correlated.
# Returns a symmetric dataframe with one row and one column per dataset (plus a 'Data' label column);
# the cell where a row and a column meet holds their correlation coefficient (the diagonal is left empty).
def all_correls(list_names, list_dfs):
    """Build a symmetric table of pairwise Pearson correlation coefficients.

    Parameters
    ----------
    list_names : list of str
        Label for each dataset. Used as the column headers of the result and
        as the entries of its leading 'Data' column.
    list_dfs : list of pandas.DataFrame
        Dataframes in the same order as ``list_names``. Each pair is handed to
        ``extract_matching_data`` to obtain the two aligned value sequences.

    Returns
    -------
    pandas.DataFrame
        A 'Data' column holding the names, followed by one column per name.
        The cell at (row a, column b) holds the Pearson r between datasets a
        and b. The diagonal is never written, so it stays NaN.
    """
    # Start from an empty frame with one column per dataset; inserting the
    # 'Data' column then creates one row per name, with every other cell NaN.
    correls_df = pd.DataFrame(columns=list_names)
    correls_df.insert(0, 'Data', list_names)

    # Walk every unordered pair of dataset positions.
    for first, second in combinations(range(len(list_names)), 2):
        name_1, name_2 = list_names[first], list_names[second]

        print('\n==========================\n')
        print(f'Getting the correlation coefficient of {name_1} x {name_2}...')
        data_array_1, data_array_2 = extract_matching_data(list_dfs[first], list_dfs[second])
        corr = pearsonr(data_array_1, data_array_2)[0]

        # Write both mirrored cells with a single .loc[rows, column] call.
        # Chained indexing (df[col].loc[rows] = ...) assigns into a temporary
        # object and is not guaranteed to reach the frame itself.
        correls_df.loc[correls_df['Data'] == name_2, name_1] = corr
        correls_df.loc[correls_df['Data'] == name_1, name_2] = corr

    print('Finished getting all correlations. Resultant correlation dataframe is: ')
    print(correls_df)

    return correls_df

# checks if number is valid: not NaN and not inside the half-open interval (-1000, -999]
def is_valid(value):
    """Return True when ``value`` is usable data, False otherwise.

    A value is rejected when it is NaN or when it lies in (-1000, -999]
    (presumably the missing-data fill values used by the source datasets,
    e.g. -999 / -999.9 -- confirm against the data files).
    """
    # NaN compares unequal to everything, itself included, so a test such as
    # `value != float('nan')` is always True and cannot detect NaN.
    if pd.isna(value):
        return False
    return not (-1000 < value <= -999)

# returns 2 lists of values from two dataframes, matched year by year over their overlapping year range, skipping years where either value is invalid
# assumes each df has a 'Year' column and that the data to compare is in the last column
# only use one df as reference point - since they're supposed to match it doesn't really matter anyways
def extract_matching_data(df_1, df_2):
    """Return the year-aligned, valid values of two dataframes as two lists.

    Both frames must have a 'Year' column; the values compared are taken from
    each frame's last column. Only integer years inside the overlap of the two
    year ranges are considered, and a year contributes a pair only when it is
    present in both frames and both values pass ``is_valid``.

    Parameters
    ----------
    df_1, df_2 : pandas.DataFrame
        Frames with a 'Year' column and the data of interest in the last column.

    Returns
    -------
    tuple of (list, list)
        Equal-length lists; position i of each list belongs to the same year.
        Both are empty when either frame is empty or the year ranges do not
        overlap.
    """
    array_1 = []
    array_2 = []

    # An empty frame cannot overlap with anything (and its min()/max() are NaN).
    if df_1.empty or df_2.empty:
        return array_1, array_2

    # Overlap window: latest start year to earliest end year. min()/max() are
    # used instead of the first/last row so the frames need not be sorted.
    starting_year = int(max(df_1['Year'].min(), df_2['Year'].min()))
    ending_year = int(min(df_1['Year'].max(), df_2['Year'].max()))

    # One pass per frame instead of a boolean-mask scan for every year.
    values_1 = _values_by_year(df_1)
    values_2 = _values_by_year(df_2)

    for year in range(starting_year, ending_year + 1):
        # A year inside the window can still be missing from one record
        # (a gap in the series); skip it rather than fail on an empty lookup.
        if year not in values_1 or year not in values_2:
            continue

        df_1_value = values_1[year]
        df_2_value = values_2[year]
        if is_valid(df_1_value) and is_valid(df_2_value):
            array_1.append(df_1_value)
            array_2.append(df_2_value)

    return array_1, array_2


def _values_by_year(df):
    """Map each 'Year' of ``df`` to its last-column value (first row wins on duplicate years)."""
    values = {}
    for year, value in zip(df['Year'].to_numpy(), df.iloc[:, -1].to_numpy()):
        values.setdefault(year, value)
    return values

In [4]:
# Load every per-locale dataset. Each CSV is later passed to extract_matching_data,
# which reads a 'Year' column and the last column of the frame.
# NOTE(review): the Tabora files are read from '../locale_data/...' while the Mpwapwa and
# Ujiji files come from '../LMR/tabora_files', '../PHYDA', '../REANALYSIS' and
# '../DAI PDSI/dai_pdsi' -- confirm this split is intentional.

# LMR dfs
mpwapwa_lmr_prate = pd.read_csv('../LMR/tabora_files/mpwapwa_prate_all_mcrun.csv')
mpwapwa_lmr_pdsi = pd.read_csv('../LMR/tabora_files/mpwapwa_pdsi_all_mcrun.csv')
ujiji_lmr_prate = pd.read_csv('../LMR/tabora_files/ujiji_prate_all_mcrun.csv')
ujiji_lmr_pdsi = pd.read_csv('../LMR/tabora_files/ujiji_pdsi_all_mcrun.csv')
tabora_lmr_prate = pd.read_csv('../locale_data/LMR/tabora_prate_all_mcrun.csv')
tabora_lmr_pdsi = pd.read_csv('../locale_data/LMR/tabora_pdsi_all_mcrun.csv')

# PHYDA dfs
mpwapwa_phyda_pdsi = pd.read_csv('../PHYDA/mpwapwa_all_pdsi.csv')
ujiji_phyda_pdsi = pd.read_csv('../PHYDA/ujiji_all_pdsi.csv')
tabora_phyda_pdsi = pd.read_csv('../locale_data/PHYDA/tabora_all_pdsi.csv')

# REANALYSIS dfs
mpwapwa_reanalysis_prate = pd.read_csv('../REANALYSIS/mpwapwa_reanalysis_processed_prate.csv')
ujiji_reanalysis_prate = pd.read_csv('../REANALYSIS/ujiji_reanalysis_processed_prate.csv')
tabora_reanalysis_prate = pd.read_csv('../locale_data/REANALYSIS/tabora_reanalysis_processed_prate.csv')

# DAI PDSI dfs
mpwapwa_dai_pdsi = pd.read_csv('../DAI PDSI/dai_pdsi/mpwapwa_pdsi.csv')
mpwapwa_dai_pdsi_sc = pd.read_csv('../DAI PDSI/dai_pdsi/mpwapwa_pdsi_sc.csv')
mpwapwa_dai_psdi_sc = mpwapwa_dai_pdsi_sc   # old misspelled name ("psdi"), kept as an alias
ujiji_dai_pdsi = pd.read_csv('../DAI PDSI/dai_pdsi/ujiji_pdsi.csv')
ujiji_dai_pdsi_sc = pd.read_csv('../DAI PDSI/dai_pdsi/ujiji_pdsi_sc.csv')
tabora_dai_pdsi = pd.read_csv('../locale_data/DAI/tabora_pdsi.csv')
tabora_dai_pdsi_sc = pd.read_csv('../locale_data/DAI/tabora_pdsi_sc.csv')

# go by order lmr --> phyda --> reanalysis --> dai externally, prate --> pdsi --> pdsi sc internally
# (each list must follow the same order as dfs_names, which labels the datasets by position)
dfs_dict = {
    'Mpwapwa': [
        mpwapwa_lmr_prate, mpwapwa_lmr_pdsi, mpwapwa_phyda_pdsi,
        mpwapwa_reanalysis_prate, mpwapwa_dai_pdsi, mpwapwa_dai_pdsi_sc,
    ],
    'Ujiji': [
        ujiji_lmr_prate, ujiji_lmr_pdsi, ujiji_phyda_pdsi,
        ujiji_reanalysis_prate, ujiji_dai_pdsi, ujiji_dai_pdsi_sc,
    ],
    'Tabora': [
        tabora_lmr_prate, tabora_lmr_pdsi, tabora_phyda_pdsi,
        tabora_reanalysis_prate, tabora_dai_pdsi, tabora_dai_pdsi_sc,
    ],
}

# Output file stems, one per locale, derived from the dict keys so they always follow
# the order of dfs_dict (the export step pairs them with the results by position).
# Evaluates to ['mpwapwa_correls', 'ujiji_correls', 'tabora_correls'].
locale_filenames = [f'{locale_name.lower()}_correls' for locale_name in dfs_dict]

# Gwyn Request
# antananarivo_lmr_prate = pd.read_csv('../LMR/tabora_files/antananarivo_prate_all_mcrun.csv')
# antananarivo_lmr_pdsi = pd.read_csv('../LMR/tabora_files/antananarivo_pdsi_all_mcrun.csv')
# antananarivo_phyda_pdsi = pd.read_csv('../PHYDA/antananarivo_all_pdsi.csv')
# antananarivo_reanalysis_prate = pd.read_csv('../REANALYSIS/antananarivo_reanalysis_processed_prate.csv')
# antananarivo_dai_pdsi = pd.read_csv('../DAI PDSI/dai_pdsi/antananarivo_pdsi.csv')
# antananarivo_dai_pdsi_sc = pd.read_csv('../DAI PDSI/dai_pdsi/antananarivo_pdsi_sc.csv')
#
# dfs_dict = {
#     'Antananarivo': [antananarivo_lmr_prate, antananarivo_lmr_pdsi, antananarivo_phyda_pdsi, antananarivo_reanalysis_prate, antananarivo_dai_pdsi, antananarivo_dai_pdsi_sc]
# }
#
# locale_filenames = ['antananarivo_correls']

# Display names for the datasets, shared by every locale (not specific to one region).
# The order must match the order of the dataframes inside each dfs_dict list, because
# all_correls pairs names and dataframes by position.
dfs_names = ['LMR PRATE', 'LMR PDSI', 'PHYDA PDSI', 'Reanalysis PRATE', 'DAI PDSI', 'DAI PDSI SC']

In [5]:
# Build one correlation table per locale, in the insertion order of dfs_dict.
# correl_dfs[i] therefore belongs to the i-th key of dfs_dict.

correl_dfs = []

for locale, locale_dfs in dfs_dict.items():
    locale_correls = all_correls(dfs_names, locale_dfs)
    correl_dfs.append(locale_correls)



Getting the correlation coefficient of LMR PRATE x LMR PDSI...


Getting the correlation coefficient of LMR PRATE x PHYDA PDSI...


Getting the correlation coefficient of LMR PRATE x Reanalysis PRATE...


Getting the correlation coefficient of LMR PRATE x DAI PDSI...


Getting the correlation coefficient of LMR PRATE x DAI PDSI SC...


Getting the correlation coefficient of LMR PDSI x PHYDA PDSI...


Getting the correlation coefficient of LMR PDSI x Reanalysis PRATE...


Getting the correlation coefficient of LMR PDSI x DAI PDSI...


Getting the correlation coefficient of LMR PDSI x DAI PDSI SC...


Getting the correlation coefficient of PHYDA PDSI x Reanalysis PRATE...


Getting the correlation coefficient of PHYDA PDSI x DAI PDSI...


Getting the correlation coefficient of PHYDA PDSI x DAI PDSI SC...


Getting the correlation coefficient of Reanalysis PRATE x DAI PDSI...


Getting the correlation coefficient of Reanalysis PRATE x DAI PDSI SC...


Getting the correlation coefficient

In [6]:
# DOWNLOAD FILES
# Write each correlation table to its own CSV; the file stem for the table at
# position `file` is taken from the same position of locale_filenames.

for file, locale_correls in enumerate(correl_dfs):
    filename = f'C:\\Users\\Cecile Dai\\Documents\\Professional\\McGill University\\IOWC\\Other Datasets\\Alternative_Datasets\\REANALYSIS\\{locale_filenames[file]}.csv'
    locale_correls.to_csv(filename, index=False)

In [60]:
# Re-export only the table at position 2 (the Tabora entry of locale_filenames / correl_dfs).
# NOTE(review): the export loop over correl_dfs already writes this same file; this cell
# looks like a manual re-run for a single locale.
tabora_filename = f'C:\\Users\\Cecile Dai\\Documents\\Professional\\McGill University\\IOWC\\Other Datasets\\Alternative_Datasets\\REANALYSIS\\{locale_filenames[2]}.csv'
tabora_correls = correl_dfs[2]
tabora_correls.to_csv(tabora_filename, index=False)