In [7]:
import pandas as pd
import os
import numpy as np
from dataclasses import dataclass
from pathlib import Path

from itertools import permutations
from scipy.spatial import distance
from sklearn.preprocessing import MinMaxScaler


In [9]:
@dataclass
class EuclideanSimilarity:
    path:str
    filename:str
        
    def read_data(self):
        data = pd.read_excel(self.path.joinpath(self.filename))
        return data
    
    def get_sport_pairs(self,data:pd.Series)->list:
        """
        Itertools.permutations() gives you all possible tuples for a series and 
        the size of each tuple can be controlled by the second parameter
        Synatx: permutations(iterator, r)

        """
        columns_list = data
        interactions = list(permutations(columns_list, 2))
        return interactions

    def scale_continuous(sel,df:pd.DataFrame,col_list:list)->pd.DataFrame:
        """
        MinMaxScaler to scale the given series
        """
        scaler = MinMaxScaler()
        for col in col_list:
            print(col)
            scaled_arr = scaler.fit_transform(np.array(df[col]).reshape(-1,1))
            df[f"{col}_scaled"] = scaled_arr
            del df[col]
        return df

    
    def get_euclidian_distance(self,df:pd.DataFrame,interactions:list,scale:bool) -> np.array:            
        """
        Retrieve only attribute columns for euclidian distance calculation
        Calculate the euclidean distance using scipy.spatial i.e distance.euclidian()
        """
        cols = [i for i in df.columns if 'Sport' not in i if 'Rank' not in i if 'Total' not in i]
        if scale:
            scaled_df =  self.scale_continuous(df,cols)
            scaled_cols =  [ i for i in scaled_df.columns if 'scaled' in i]
            distances = [distance.euclidean(scaled_df.loc[df['Sport']==i[0],scaled_cols],scaled_df.loc[df['Sport']==i[1],scaled_cols],2) for i in interactions]

        else:
            distances = [distance.euclidean(df.loc[df['Sport']==i[0],cols],df.loc[df['Sport']==i[1],cols],2) for i in interactions]
        return distances

    def calculate_similarity(self,distances:list) -> list:
        """
        Similarity(x,y) =  1 / (1+dist(x,y))

        """
        similarities = [1/(dist+1) for dist in distances]
        return similarities

    @staticmethod
    def gen_similarity_df(combinations:tuple,distances:list,similarities:list):
        """
        Create the dataframe from the generated Sports-Indices, Distances and Similarity values

        """
        similarity_df = pd.DataFrame({'Sport-Indices':combinations,'Distance':distances,'Similarity':similarities})
        similarity_df['Sport_X'],similarity_df['Sport_Y'] =  similarity_df['Sport-Indices'].str
        return similarity_df

if __name__ == '__main__':
    current_folder =  Path.cwd().parent
    data_folder = current_folder.joinpath("DVI/data/raw/")
    filename = 'DVI_Assignment.xlsx'
    euclid_similarity =  EuclideanSimilarity(data_folder,filename)
    data = euclid_similarity.read_data()
    sport_combinations =  euclid_similarity.get_sport_pairs(data['Sport'])
    distances = euclid_similarity.get_euclidian_distance(data,sport_combinations,True)
    similarities =  euclid_similarity.calculate_similarity(distances)
    similarity_df =  EuclideanSimilarity.gen_similarity_df(sport_combinations,distances,similarities)

Endurance
Strength
Power
Speed
Agility
Flexibility
Nerve
Durability
Hand-Eye Coordination
Analytical Aptitude


  similarity_df['Sport_X'],similarity_df['Sport_Y'] =  similarity_df['Sport-Indices'].str


In [10]:
similarity_df

Unnamed: 0,Sport-Indices,Distance,Similarity,Sport_X,Sport_Y
0,"(Boxing, Ice Hockey)",0.829829,0.546499,Boxing,Ice Hockey
1,"(Boxing, Football)",0.789568,0.558794,Boxing,Football
2,"(Boxing, Basketball)",1.127190,0.470104,Boxing,Basketball
3,"(Boxing, Wrestling)",1.117640,0.472224,Boxing,Wrestling
4,"(Boxing, Martial Arts)",1.097587,0.476738,Boxing,Martial Arts
...,...,...,...,...,...
3535,"(Fishing, Archery)",1.130206,0.469438,Fishing,Archery
3536,"(Fishing, Curling)",0.999814,0.500047,Fishing,Curling
3537,"(Fishing, Bowling)",0.754874,0.569842,Fishing,Bowling
3538,"(Fishing, Shooting)",0.970834,0.507399,Fishing,Shooting
