In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
%matplotlib inline
# pandas display settings.
# Fully-qualified option names are used on purpose: abbreviated keys such as
# 'max_columns' are treated as patterns and raise OptionError as soon as they
# match more than one registered option.
pd.set_option('display.max_colwidth', 250)
pd.set_option('display.max_columns', 250)
pd.set_option('display.max_rows', 500)

## Load Data

In [None]:
# Read the train and test CSV files from the ../input/duth-dbirlab2-1 directory
# (train first, then test).
train, test = (
    pd.read_csv(f'../input/duth-dbirlab2-1/{split}.csv')
    for split in ('train', 'test')
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

# Plots

## Probability-Distance to coast

In [None]:
sns.set(font_scale=2)
# KDE joint plot of distance to coast against probability.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so two positional Series raise TypeError there.
# NOTE: `ax` holds the grid object returned by sns.jointplot, not a matplotlib Axes.
ax = sns.jointplot(x=train['distanceToCoast'], y=train['Overall Probability'], height=13, kind='kde')
ax.set_axis_labels('distance to coast', 'probability')
# Called after the plot above has been created.
sns.set_context("paper", rc={"font.size":14,"axes.titlesize":18,"axes.labelsize":20})

### We can see that most of our observations lie within 100 km of the coast.

## Coordinates-Probability

In [None]:
from mpl_toolkits.basemap import Basemap

# Scatter the training locations on a map, coloured by probability.
plt.style.use('bmh')
plt.figure(figsize=(20, 15))

# Transverse-Mercator map bounded by the lower-left / upper-right corners below.
basemap = Basemap(
    projection='tmerc', resolution='l',
    lat_0=35, lon_0=15,
    llcrnrlon=-5, llcrnrlat=28,
    urcrnrlon=40, urcrnrlat=45,
)

# Background image plus coastlines.
basemap.bluemarble()
basemap.drawcoastlines()

# Convert longitude/latitude into map coordinates before plotting.
x, y = basemap(train['Center Long'].values, train['Center Lat'].values)
basemap.scatter(x, y, c=train['Overall Probability'].values, marker='o')

plt.colorbar(label='Probability')
plt.show()

# Switch the global matplotlib style once the map has been shown.
plt.style.use('ggplot')

### We can see how the probability varies with the coordinates on a real-world map. The Western Mediterranean and the North Aegean have higher probabilities of finding the fish.

# Bathymetry-Probability

In [None]:
# KDE joint plot of bathymetry against probability.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so two positional Series raise TypeError there.
# NOTE: `ax` holds the grid object returned by sns.jointplot, not a matplotlib Axes.
ax = sns.jointplot(x=train['bathymetry'], y=train['Overall Probability'], height=13, kind='kde')
ax.set_axis_labels('bathymetry', 'probability')
sns.set_context("paper", rc={"font.size":14,"axes.titlesize":18,"axes.labelsize":20})

### This graph shows that Overall Probability is spread evenly across depths, which suggests that depth/bathymetry doesn't play a role in finding the fish. It also shows that most of our observations come from shallow waters.

# The distribution of the categorical feature, substrate type, over our observations

In [None]:
# Two stacked panels: sample count per substrate type (top) and the
# probability distribution per substrate type (bottom).
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 15))

# Data vectors are passed by keyword (x=/y=): seaborn >= 0.12 treats the first
# positional argument as `data` and rejects a second positional argument.
sns.countplot(x=train['substrateType'], ax=ax[0])

ax[0].set_xlabel(None)
ax[0].tick_params(axis='y', which='major', labelsize=15)
# Hide the x ticks and their labels on the top panel.
ax[0].tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)

# Boxen plot with the individual observations drawn on top of it.
sns.boxenplot(x=train['substrateType'], y=train['Overall Probability'], ax=ax[1])
sns.stripplot(x=train['substrateType'], y=train['Overall Probability'], ax=ax[1], color=".3")
ax[1].set_xlabel(None)
ax[1].set_title('Probability per substrate type', fontsize=15, loc='center')
ax[1].set_ylabel('Probability', fontsize=13)
ax[1].tick_params(axis='y', which='major', labelsize=15)
ax[1].tick_params(axis='x', which='major', labelsize=15)
# plt.xticks acts on the current axes; the trailing semicolon suppresses the cell output.
plt.xticks(rotation=45);

### To draw valid conclusions from these plots we have to keep the sample counts in mind. Sandy mud, for which we have the most samples, seems to be randomly distributed, with a central tendency just under 0.6. Sand, the substrate with the second-highest sample count, has a central tendency of about 0.8, which tells us that sand may have an effect on probability.

## Mean Temperature-Probability

In [None]:
# KDE joint plot of mean surface temperature against probability.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so two positional Series raise TypeError there.
# NOTE: `ax` holds the grid object returned by sns.jointplot, not a matplotlib Axes.
ax = sns.jointplot(x=train['temperatureSurface_mean'], y=train['Overall Probability'], height=13, kind='kde')
ax.set_axis_labels('mean temperature', 'probability')
sns.set_context("paper", rc={"font.size":14,"axes.titlesize":18,"axes.labelsize":20})

## We clearly see a spike of high probability at around 19°C and another one of low probability at 23°C, so temperature seems to play some role.

### Creating a sub dataframe to reduce the decimals of probability for smoother plots.

In [None]:
# Work on an explicit copy of the three columns: assigning into a plain slice of
# `train` triggers pandas' SettingWithCopyWarning and leaves it ambiguous whether
# `train` itself is modified.
temp_df = train[['salinitySurface_mean', 'chlorophyll_mean', 'Overall Probability']].copy()
# Round the probability to one decimal so it forms a small set of discrete levels.
temp_df['Overall Probability'] = temp_df['Overall Probability'].round(decimals=1)

In [None]:
# Chlorophyll vs. salinity; marker colour and size both encode the rounded probability.
fig, ax = plt.subplots(figsize=(10, 10))

# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so two positional Series raise TypeError there.
sns.scatterplot(
    x=temp_df['chlorophyll_mean'],
    y=temp_df['salinitySurface_mean'],
    hue=temp_df['Overall Probability'],
    size=temp_df['Overall Probability'],
    palette='Reds',
    legend="full",
    ax=ax,
)

ax.set_title('Probability of appearance', fontsize=15, loc='center')
ax.set_xlabel('chlorophill', fontsize=13)
ax.set_ylabel('salinity', fontsize=13)
ax.tick_params(axis='y', which='major', labelsize=12)
ax.tick_params(axis='x', which='major', labelsize=12)
ax.yaxis.tick_left()  # where the y axis marks will be

# Use the same scale on both axes.
ax.axis('equal')

### This shows subtle evidence that the lower the salinity, the greater the probability. Most of our chlorophyll samples are around the same value, so we can't draw valid conclusions about this feature.

## Salinity surface-mean Probability

In [None]:

# KDE joint plot of mean surface salinity against probability.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so two positional Series raise TypeError there.
# NOTE: `ax` holds the grid object returned by sns.jointplot, not a matplotlib Axes.
ax = sns.jointplot(x=train['salinitySurface_mean'], y=train['Overall Probability'], height=13, kind='kde')
ax.set_axis_labels('salinity surface mean', 'probability')
sns.set_context("paper", rc={"font.size":14,"axes.titlesize":18,"axes.labelsize":20})

### Salinity around 38 units seems to be correlated with higher probability of finding the fish 

## Chlorophyll

In [None]:
# Histogram of the raw chlorophyll values in 20 bins.
train.loc[:, 'chlorophyll_mean'].hist(bins=20)

In [None]:
# Summary statistics for the raw chlorophyll column.
print('min Chlorophyll level:', train['chlorophyll_mean'].min())
print('max Chlorophyll level:', train['chlorophyll_mean'].max())
# Count the samples above 1 with a vectorised comparison, and divide by the actual
# number of rows rather than a hard-coded sample size, so the percentage stays
# correct if the training set changes.
count = int((train['chlorophyll_mean'] > 1).sum())
print('Sample percentage that is greater than 1:', count*100/len(train),'%')
print('Mean:', train['chlorophyll_mean'].mean())
print('Ssd:', train['chlorophyll_mean'].std())
print('Skewness:', train['chlorophyll_mean'].skew())
print('Kurtosis:', train['chlorophyll_mean'].kurt())

### As we can see, this variable has a strongly asymmetrical distribution (this is called skewness). To reduce the skewness and make the variable easier to handle, I will replace its values with their log10, which is possible because they are all greater than zero.

In [None]:
# Log10-transform the chlorophyll column.
# (Alternative noted by the author: np.reciprocal, i.e. 1/x, gives lower
# skewness and kurtosis.)
normalise_chlorophyl = np.log10(train['chlorophyll_mean'])

# Print the range and the shape statistics of the transformed values.
for label, stat in (
    ('min transformed Chlorophyll level:', normalise_chlorophyl.min()),
    ('max transformed Chlorophyll level:', normalise_chlorophyl.max()),
    ('Skewness:', normalise_chlorophyl.skew()),
    ('Kurtosis:', normalise_chlorophyl.kurt()),
):
    print(label, stat)

In [None]:
# Histogram of the log10-transformed chlorophyll values in 20 bins.
plt.hist(x=normalise_chlorophyl, bins=20)

### much better distribution

In [None]:
# Log10 chlorophyll against probability.
fig, ax = plt.subplots(figsize=(10, 10))

# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so two positional Series raise TypeError there.
sns.scatterplot(x=normalise_chlorophyl, y=train['Overall Probability'], ax=ax)

ax.set_xlabel('Chlorophyl', fontsize=13)
ax.set_ylabel('OverProp', fontsize=13)
ax.tick_params(axis='y', which='major', labelsize=12)
ax.tick_params(axis='x', which='major', labelsize=12)
ax.yaxis.tick_left()  # where the y axis marks will be

### As log10(chlorophyll) goes from -1.5 to -1.0, there is clear evidence that the probability gets higher

### as we can see from the plot below as well

In [None]:
# KDE joint plot of log10 chlorophyll against probability.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so two positional Series raise TypeError there.
# NOTE: `ax` holds the grid object returned by sns.jointplot, not a matplotlib Axes.
ax = sns.jointplot(x=normalise_chlorophyl, y=train['Overall Probability'], height=13, kind='kde')
ax.set_axis_labels('chlorophyll', 'Overall propability')
sns.set_context("paper", rc={"font.size":14,"axes.titlesize":18,"axes.labelsize":20})

### It seems that there is a high probability of finding the fish at a log10 chlorophyll level of around [-1, -0.8] (about 0.15 in the original units)

In [None]:
# Temperature vs. log10 chlorophyll; marker colour and size both encode probability.
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(
    x=train['temperatureSurface_mean'],
    y=normalise_chlorophyl,
    hue=train['Overall Probability'],
    size=train['Overall Probability'],
    palette='Reds',
    ax=ax,
)

# Axis titles and tick styling.
ax.set_xlabel('temperature', fontsize=13)
ax.set_ylabel('chlorophyll', fontsize=13)
ax.tick_params(axis='x', which='major', labelsize=12)
ax.tick_params(axis='y', which='major', labelsize=10)
ax.yaxis.tick_left()  # y tick marks on the left side
plt.yticks(rotation=30)

# Place the legend in the upper-right corner.
ax.legend(loc='upper right')

### As the temperature gets lower, the chlorophyll levels get higher and so does the overall probability. The fish seems more likely to be found in the range of [18, 20] degrees Celsius

In [None]:
# KDE joint plot of mean surface temperature against log10 chlorophyll.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so two positional Series raise TypeError there.
# NOTE: `ax` holds the grid object returned by sns.jointplot, not a matplotlib Axes.
ax = sns.jointplot(x=train['temperatureSurface_mean'], y=normalise_chlorophyl, height=13, kind='kde')
ax.set_axis_labels('temp', 'chlorophyll')
sns.set_context("paper", rc={"font.size":14,"axes.titlesize":18,"axes.labelsize":20})

### At 19 degrees Celsius the log10 chlorophyll level peaks at around [-1.0, -0.9] (about 0.15 in the original units). As we saw earlier, there is a peak of high overall probability at this temperature as well. This may mean that the ideal chlorophyll level for finding the fish is around 0.15

## Chlorophyll_mean - bathymetry - Probability

In [None]:
# Log10 chlorophyll vs. bathymetry, coloured by probability.
fig, ax = plt.subplots(figsize=(15, 8))

# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so two positional Series raise TypeError there.
sns.scatterplot(
    x=np.log10(train['chlorophyll_mean']),
    y=train['bathymetry'],
    hue=train['Overall Probability'],
    ax=ax,
)
ax.set_title('Chlorophyll & Bathymetry', fontsize=15, loc='center')
ax.set_ylabel('Bathymetry', fontsize=13)
ax.set_xlabel('Chlorophyll', fontsize=13)
ax.tick_params(axis='x', which='major', labelsize=12)
ax.tick_params(axis='y', which='major', labelsize=10)
ax.yaxis.tick_left()  # where the y axis marks will be
plt.yticks(rotation=30)

# Place the legend in the upper-right corner.
ax.legend(loc='upper right')

### Chlorophyll levels get lower as Bathymetry rises

## Normalized chlorophyll - secchiDiskDepth_mean - Probability

In [None]:
# Log10 chlorophyll vs. Secchi disk depth, coloured by probability.
fig, ax = plt.subplots(figsize=(15, 8))

# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so two positional Series raise TypeError there.
sns.scatterplot(
    x=normalise_chlorophyl,
    y=train['secchiDiskDepth_mean'],
    hue=train['Overall Probability'],
    ax=ax,
)
ax.set_title('Chlorophyl effects on Secchi', fontsize=15, loc='center')
ax.set_ylabel('Secchi', fontsize=13)
ax.set_xlabel('Chlorophyll_mean', fontsize=13)
ax.tick_params(axis='x', which='major', labelsize=12)
ax.tick_params(axis='y', which='major', labelsize=10)
ax.yaxis.tick_left()  # where the y axis marks will be
plt.yticks(rotation=30)

# Place the legend in the upper-right corner.
ax.legend(loc='upper right')

### Naturally, greater levels of chlorophyll decrease the Secchi disk depth (water clarity). We observe that the probability gets higher as the clarity gets lower

## Temperature

In [None]:
# Temperature vs. dissolved oxygen; marker colour and size both encode probability.
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(
    x=train['temperatureSurface_mean'],
    y=train['dissolvedOxygenSurface_mean'],
    hue=train['Overall Probability'],
    size=train['Overall Probability'],
    palette='Reds',
    ax=ax,
)

# Axis titles and tick styling.
ax.set_xlabel('Temperature', fontsize=13)
ax.set_ylabel('Dissolved Oxygen', fontsize=13)
ax.tick_params(axis='x', which='major', labelsize=12)
ax.tick_params(axis='y', which='major', labelsize=10)
ax.yaxis.tick_left()  # y tick marks on the left side
plt.yticks(rotation=30)

# Place the legend in the upper-right corner.
ax.legend(loc='upper right')

### At lower temperatures we have greater dissolved oxygen levels and greater probabilities of catching the fish

## Current

In [None]:
# Zonal vs. meridional surface current, coloured by probability.
fig, ax = plt.subplots(figsize=(15, 8))

# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so two positional Series raise TypeError there.
sns.scatterplot(
    x=train['zonalCurrentSurface_mean'],
    y=train['meridionalCurrentSurface_mean'],
    hue=train['Overall Probability'],
    palette='Reds',
    ax=ax,
)
ax.set_ylabel('Meridional Curr', fontsize=13)
ax.set_xlabel('Zonal Curr', fontsize=13)
ax.tick_params(axis='x', which='major', labelsize=12)
ax.tick_params(axis='y', which='major', labelsize=10)
ax.yaxis.tick_left()  # where the y axis marks will be
plt.yticks(rotation=30)

# Compass labels, placed in data coordinates.
ax.text(-0.01, 0.2, 'North', fontsize=15)
ax.text(-0.01, -0.3, 'South', fontsize=15)
ax.text(-0.25, 0.0, 'West', fontsize=15)
ax.text(0.3, 0.0, 'East', fontsize=15)

# Vertical and horizontal guide arrows; comment these two calls out to hide them.
ax.arrow(x=0.0, y=-0.3, dx=0, dy=0.5, color='blue')
ax.arrow(x=-0.25, y=0.0, dx=0.55, dy=0, color='blue')

# Place the legend in the upper-right corner.
ax.legend(loc='upper right')

### High-intensity east currents seem to raise the overall probability.

### For the same spots, we can see the chlorophyll levels below.

In [None]:
# Zonal vs. meridional surface current, coloured by log10 chlorophyll.
fig, ax = plt.subplots(figsize=(15, 8))

# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so two positional Series raise TypeError there.
sns.scatterplot(
    x=train['zonalCurrentSurface_mean'],
    y=train['meridionalCurrentSurface_mean'],
    hue=normalise_chlorophyl,
    palette='Reds',
    ax=ax,
)
ax.set_ylabel('Meridional Curr', fontsize=13)
ax.set_xlabel('Zonal Curr', fontsize=13)
ax.tick_params(axis='x', which='major', labelsize=12)
ax.tick_params(axis='y', which='major', labelsize=10)
ax.yaxis.tick_left()  # where the y axis marks will be
plt.yticks(rotation=30)

# Compass labels, placed in data coordinates (no guide arrows in this figure).
ax.text(-0.01, 0.2, 'North', fontsize=15)
ax.text(-0.01, -0.3, 'South', fontsize=15)
ax.text(-0.25, 0.0, 'West', fontsize=15)
ax.text(0.3, 0.0, 'East', fontsize=15)

# Place the legend in the upper-right corner.
ax.legend(loc='upper right')

### Chlorophyll rises in north-east currents

## Euphotic Depth

In [None]:
# Log10 chlorophyll vs. euphotic depth; marker colour and size both encode probability.
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(
    x=normalise_chlorophyl,
    y=train['euphoticDepth_mean'],
    hue=train['Overall Probability'],
    size=train['Overall Probability'],
    palette='Reds',
    ax=ax,
)

# Axis titles and tick styling.
ax.set_xlabel('chlorophyl', fontsize=13)
ax.set_ylabel('euphotic', fontsize=13)
ax.tick_params(axis='x', which='major', labelsize=12)
ax.tick_params(axis='y', which='major', labelsize=10)
ax.yaxis.tick_left()  # y tick marks on the left side
plt.yticks(rotation=30)

# Place the legend in the upper-right corner.
ax.legend(loc='upper right')

### As the euphotic depth gets higher, so do the chlorophyll level and the overall probability

In [None]:
# Dissolved oxygen vs. euphotic depth; marker colour and size both encode probability.
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(
    x=train['dissolvedOxygenSurface_mean'],
    y=train['euphoticDepth_mean'],
    hue=train['Overall Probability'],
    size=train['Overall Probability'],
    palette='Reds',
    ax=ax,
)

# Axis titles and tick styling.
ax.set_xlabel('Diss Ox', fontsize=13)
ax.set_ylabel('euphotic', fontsize=13)
ax.tick_params(axis='x', which='major', labelsize=12)
ax.tick_params(axis='y', which='major', labelsize=10)
ax.yaxis.tick_left()  # y tick marks on the left side
plt.yticks(rotation=30)

# Place the legend in the upper-right corner.
ax.legend(loc='upper right')

### Higher Euphotic Depth - Higher Chlorophyll - Higher Diss Ox

In [None]:
# Temperature vs. euphotic depth; marker colour and size both encode probability.
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(
    x=train['temperatureSurface_mean'],
    y=train['euphoticDepth_mean'],
    hue=train['Overall Probability'],
    size=train['Overall Probability'],
    palette='Reds',
    ax=ax,
)

# Axis titles and tick styling.
ax.set_xlabel('temp', fontsize=13)
ax.set_ylabel('euphotic', fontsize=13)
ax.tick_params(axis='x', which='major', labelsize=12)
ax.tick_params(axis='y', which='major', labelsize=10)
ax.yaxis.tick_left()  # y tick marks on the left side
plt.yticks(rotation=30)

# Place the legend in the upper-right corner.
ax.legend(loc='upper right')

### Naturally the temperature gets lower