In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr

In [6]:
"""
DATA INPUT AND REDUCTION
"""
# Location of lda_log_final.csv -- edit this to point at your own copy of the file.
LDA_log_filepath = "/Users/ishraque/Desktop/lda_log_final.csv"

# Raw CSV header -> display name used throughout the rest of the notebook.
rename_columns = {
    'Abslat': 'Latitude (Degrees)',
    'PC_Azimuth': 'Pole Corr. Azimuth',
    'Cos_PC_Azimuth': 'Cosine Azimuth',
    'scarp_slope': 'Scarp Slope',
    'LDA_slope': 'LDA Slope',
    'Max_LDA_elev': 'Max LDA Elevation',
    'num_mapped_boulders': 'Number of Boulders',
    'FirstPick': 'First Pick Clusters',
    'Nominal': 'Nominal Clusters',
    'Preferred': 'Preferred Clusters',
    'Length': 'Length (m)',
    'ASD_num': 'Number of ASD',
}

# Read the log, discard every row with a missing value in any column,
# then switch the headers over to their display names.
LDA_log = (
    pd.read_csv(LDA_log_filepath, sep=",")
    .dropna()
    .rename(columns=rename_columns)
)

# The variables relevant to the analysis.
reduce_data_columns = [
    'Latitude (Degrees)',
    'Pole Corr. Azimuth',
    'Cosine Azimuth',
    'Scarp Slope',
    'LDA Slope',
    'Max LDA Elevation',
    'Number of Boulders',
    'Nominal Clusters',
    'Preferred Clusters',
    'First Pick Clusters',
    'Length (m)',
    'Craters + CLF',
    'Number of ASD',
]

# A shorter list of variables, for a simpler analysis.
final_reduce_data_columns = [
    'Latitude (Degrees)',
    'Cosine Azimuth',
    'Number of Boulders',
    'Nominal Clusters',
    'Preferred Clusters',
    'Length (m)',
]

# Show the column names after renaming.
LDA_log.columns

Index(['Image', 'Lat', 'Long', 'Latitude (Degrees)', 'Nominal Clusters',
       'Preferred Clusters', 'First Pick Clusters', 'Length (m)', 'Azimuth',
       'Pole Corr. Azimuth', 'Cosine Azimuth', 'Number of Boulders',
       'LDA_relief', 'Min_LDA_elev', 'Max LDA Elevation', 'Max_scarp_elev',
       'Dist_LDA_Scarp', 'Scarp Slope', 'LDA Slope', 'phase_angle',
       'Craters + CLF', 'Brain Terrain? (Y/N)', 'Polygons?(Y/N)',
       'Scarp_cluster', 'Moraine_like_cluster', 'ASD_like_bands',
       'Number of ASD'],
      dtype='object')

In [88]:
"""
CONDITIONAL FILTERING OF DATA 
"""
# Columns carried into every subset; final_reduce_data_columns can be used instead.
data_filter = reduce_data_columns


def _flagged_subset(flag_column, flag_value):
    """Return the data_filter columns of the LDA_log rows where flag_column equals flag_value.

    The rows are renumbered from 0, so the row labels of LDA_log are not kept.
    """
    matching_rows = LDA_log[flag_column] == flag_value
    return LDA_log.loc[matching_rows].reset_index(drop=True)[data_filter]


# The complete data set (row labels of LDA_log are kept here).
LDA_plot_data = LDA_log[data_filter]

# Data with / without brain terrain.
brain_LDA = _flagged_subset('Brain Terrain? (Y/N)', 'Y')
no_brain_LDA = _flagged_subset('Brain Terrain? (Y/N)', 'N')

# Data with / without mantle (read from the 'Polygons?(Y/N)' column).
mantle_LDA = _flagged_subset('Polygons?(Y/N)', 'Y')
no_mantle_LDA = _flagged_subset('Polygons?(Y/N)', 'N')

# Data with ASD-like bands.
ASD_LDA = _flagged_subset('ASD_like_bands', 'Y')

In [None]:
# Annotate the current axes with the correlation between two arrays
def corr(x, y, **kwargs):
    """Write the Pearson and Spearman coefficients of x and y onto the current axes.

    Parameters
    ----------
    x, y : array-like
        The two samples to correlate.
    **kwargs
        Accepted and ignored, so the function can be handed to callers that
        forward extra plotting keywords (it is passed to PairGrid.map_upper
        elsewhere in this notebook).

    Returns
    -------
    None
        The function only draws; both coefficients are rounded to two
        decimals and placed near the top of the axes returned by plt.gca().
    """
    pearson_r = np.corrcoef(x, y)[0][1]
    # spearmanr also returns a p-value, which is not displayed.
    spearman_r, _ = spearmanr(x, y)

    axes = plt.gca()
    # (label text, horizontal position in axes-fraction coordinates)
    annotations = (
        (r'$\rho _{p}$ = ' + str(round(pearson_r, 2)), 0.07),
        (r'$\rho _{s}$ = ' + str(round(spearman_r, 2)), 0.6),
    )
    for text, x_fraction in annotations:
        axes.annotate(text, xy=(x_fraction, 0.93), size=15, xycoords=axes.transAxes)

In [None]:
#DETAIL PLOTS  (These are scatterplots overlain on kernel density diagrams, with the color bar indicating the number of mapped boulders in each data point)

# Subset to plot: any of the frames built in the filtering cell.
selected_data = ASD_LDA

# Variables to plot against each other.
indep = 'Length (m)'
dep = 'Preferred Clusters'

# Pull the two columns out of the chosen frame (x and y are reused by a later cell).
x = selected_data[indep]
y = selected_data[dep]

# plt.get_cmap replaces plt.cm.get_cmap, which recent matplotlib releases
# deprecated and then removed.
cm = plt.get_cmap('RdYlBu')
grey = sns.light_palette("grey", as_cmap=True)

f, ax = plt.subplots(figsize=(10, 6))
# The vectors are passed by keyword and `fill` is used instead of `shade`:
# current seaborn rejects a second positional vector and is retiring `shade`.
sns.kdeplot(x=x, y=y, fill=True, cmap=grey, ax=ax)
points = ax.scatter(x, y, c=selected_data['Number of Boulders'], cmap=cm)
f.colorbar(points, ax=ax, label='Number of mapped boulders')
# corr() draws on plt.gca(), which is `ax` because plt.subplots made it current.
corr(x, y)

# File-name prefix for the chosen subset. The frames are matched by identity
# (`is`), exactly as in the original if/elif chain. ASD_LDA and a fallback are
# included so `cat` is always defined when the savefig line below is enabled.
category_prefixes = [
    (LDA_plot_data, 'Total_'),
    (brain_LDA, 'Brain_'),
    (no_brain_LDA, 'No_Brain_'),
    (mantle_LDA, 'Mantle_'),
    (no_mantle_LDA, 'No_Mantle_'),
    (ASD_LDA, 'ASD_'),
]
cat = next(
    (prefix for frame, prefix in category_prefixes if frame is selected_data),
    'Other_',
)

#f.savefig(cat+indep+'_vs_'+dep+'.png')

In [None]:
# Optional global styling, currently disabled:
#sns.set_style("whitegrid", {'axes.grid' : False})
#sns.set_style("ticks", {"xtick.major.size": 2, "ytick.major.size": 2})
#plt.rcParams["axes.labelsize"] = 15
#sns.set(font_scale=1.15)

#colormap for pairplots
color_map = sns.light_palette("skyblue", as_cmap=True)

#Pairplots of every variable pair in selected_data (chosen in the detail-plot cell):
#  upper triangle - scatter plot annotated with the coefficients drawn by corr()
#  diagonal       - 10-bin histogram
#  lower triangle - filled kernel density with the data points on top
grid = sns.PairGrid(data=selected_data)
grid.map_upper(plt.scatter)
grid.map_upper(corr)
grid.map_diag(plt.hist, bins=10)
# `fill` is the current name for the deprecated `shade` option of kdeplot.
grid.map_lower(sns.kdeplot, cmap=color_map, fill=True)
grid.map_lower(sns.scatterplot)
grid.savefig('pairplot_ASD_reduced.svg')

In [30]:
# From here on the notebook explores correlations and which of them are significant.

# Spearman test on the x / y pair selected in the detail-plot cell; only the p-value is printed.
spearman_result = spearmanr(x, y)
rho, pval = spearman_result[0], spearman_result[1]
print(pval)

0.023003668753480946


In [89]:
import pingouin
import scipy.spatial.distance as dist
import scipy.stats as ss

In [90]:
# Frame used for the correlation tables: every row of LDA_log, data_filter columns only.
test_data = LDA_log.loc[:, data_filter]

In [86]:
# Pairwise Pearson correlation between the columns of test_data.
correlation = test_data.corr(method="pearson")

In [None]:
# Annotated heat map of the `correlation` matrix.
plt.figure(num=None, figsize=(15, 12), dpi=256, facecolor='w', edgecolor='k')
# `correlation` is computed with method='pearson' and the output file is named
# "Pearson_...", so the title says Pearson (it previously read "Spearman").
plt.title('Pearson Correlation Matrix')
sns_plot = sns.heatmap(correlation, annot=True)
# Save first, then show. The original bare `plt.show` (no parentheses) was
# never called, and showing before saving can leave nothing to write.
plt.savefig("Pearson_correlation_matrix_heatmap_reduced.svg")
plt.show()

In [108]:
# Spearman correlation tables for test_data, built twice with the same settings:
# once with stars=False and once with stars=True.
# NOTE(review): `rcorr` is not a native pandas method; presumably it is registered
# on DataFrame by the `import pingouin` cell, which must run first -- confirm.
rcorr_options = dict(method='spearman', upper='pval')
pg_data1 = test_data.rcorr(stars=False, **rcorr_options)
pg_data2 = test_data.rcorr(stars=True, **rcorr_options)

In [None]:
# Display the Spearman rcorr table that was requested with upper='pval' and stars=False.
# Presumably coefficients sit in the lower triangle and p-values in the upper -- confirm against pingouin's rcorr docs.
pg_data1

In [None]:
# Display the Spearman rcorr table that was requested with upper='pval' and stars=True.
# Presumably the upper triangle then shows significance stars instead of p-values -- confirm against pingouin's rcorr docs.
pg_data2