In [1]:
import pandas as pd
import os
import numpy as np
from dataclasses import dataclass
from pathlib import Path

from itertools import permutations
from scipy.spatial import distance
from sklearn.preprocessing import MinMaxScaler


In [4]:
@dataclass
class EuclideanSimilarity:
    path:str
    filename:str
        
    def read_data(self):
        data = pd.read_excel(self.path.joinpath(self.filename))
        return data
    
    def get_sport_pairs(self,data:pd.Series)->list:
        """
        Itertools.permutations() gives you all possible tuples for a series and 
        the size of each tuple can be controlled by the second parameter
        Synatx: permutations(iterator, r)

        """
        columns_list = data
        interactions = list(permutations(columns_list, 2))
        return interactions

    def scale_continuous(sel,df:pd.DataFrame,col_list:list)->pd.DataFrame:
        """
        MinMaxScaler to scale the given series
        """
        scaler = MinMaxScaler()
        for col in col_list:
            print(col)
            scaled_arr = scaler.fit_transform(np.array(df[col]).reshape(-1,1))
            df[f"{col}_scaled"] = scaled_arr
            del df[col]
        return df

    
    def get_euclidian_distance(self,df:pd.DataFrame,interactions:list,scale:bool) -> np.array:            
        """
        Retrieve only attribute columns for euclidian distance calculation
        Calculate the euclidean distance using scipy.spatial i.e distance.euclidian()
        """
        cols = [i for i in df.columns if 'Sport' not in i if 'Rank' not in i if 'Total' not in i]
        if scale:
            scaled_df =  self.scale_continuous(df,cols)
            scaled_cols =  [ i for i in scaled_df.columns if 'scaled' in i]
            distances = [distance.euclidean(scaled_df.loc[df['Sport']==i[0],scaled_cols],scaled_df.loc[df['Sport']==i[1],scaled_cols],2) for i in interactions]

        else:
            distances = [distance.euclidean(df.loc[df['Sport']==i[0],cols],df.loc[df['Sport']==i[1],cols],2) for i in interactions]
        return distances

    def calculate_similarity(self,distances:list) -> list:
        """
        Similarity(x,y) =  1 / (1+dist(x,y))

        """
        similarities = [1/(dist+1) for dist in distances]
        return similarities

    @staticmethod
    def gen_similarity_df(combinations:tuple,distances:list,similarities:list):
        """
        Create the dataframe from the generated Sports-Indices, Distances and Similarity values

        """
        similarity_df = pd.DataFrame({'Sport-Indices':combinations,'Distance':distances,'Similarity':similarities})
        similarity_df['Sport_X'],similarity_df['Sport_Y'] =  similarity_df['Sport-Indices'].str
        return similarity_df

if __name__ == '__main__':
    current_folder =  Path.cwd().parent
    data_folder = current_folder.joinpath("DVI/data/raw/")
    filename = 'DVI_Assignment.xlsx'
    euclid_similarity =  EuclideanSimilarity(data_folder,filename)
    data = euclid_similarity.read_data()
    sport_combinations =  euclid_similarity.get_sport_pairs(data['Sport'])
    distances = euclid_similarity.get_euclidian_distance(data,sport_combinations,True)
    similarities =  euclid_similarity.calculate_similarity(distances)
    similarity_df =  EuclideanSimilarity.gen_similarity_df(sport_combinations,distances,similarities)

Endurance
Strength
Power
Speed
Agility
Flexibility
Nerve
Durability
Hand-Eye Coordination
Analytical Aptitude


  similarity_df['Sport_X'],similarity_df['Sport_Y'] =  similarity_df['Sport-Indices'].str


In [5]:
similarity_df

Unnamed: 0,Sport-Indices,Distance,Similarity,Sport_X,Sport_Y
0,"(Boxing, Ice Hockey)",0.829829,0.546499,Boxing,Ice Hockey
1,"(Boxing, Football)",0.789568,0.558794,Boxing,Football
2,"(Boxing, Basketball)",1.127190,0.470104,Boxing,Basketball
3,"(Boxing, Wrestling)",1.117640,0.472224,Boxing,Wrestling
4,"(Boxing, Martial Arts)",1.097587,0.476738,Boxing,Martial Arts
...,...,...,...,...,...
3535,"(Fishing, Archery)",1.130206,0.469438,Fishing,Archery
3536,"(Fishing, Curling)",0.999814,0.500047,Fishing,Curling
3537,"(Fishing, Bowling)",0.754874,0.569842,Fishing,Bowling
3538,"(Fishing, Shooting)",0.970834,0.507399,Fishing,Shooting


In [6]:
similarity_df.to_csv("../DVI/data/processed/Similarity_Sports.csv",index=False)

In [51]:
similarity_df[similarity_df['Sport_X']=='Figure Skating'].sort_values(by='Similarity',ascending=False)

Unnamed: 0,Sport-Indices,Distance,Similarity,Sport_X,Sport_Y
1085,"(Figure Skating, Skiing: Freestyle)",0.660399,0.602265,Figure Skating,Skiing: Freestyle
1069,"(Figure Skating, Gymnastics)",0.808211,0.553033,Figure Skating,Gymnastics
1093,"(Figure Skating, Track and Field: High Jump)",0.820506,0.549298,Figure Skating,Track and Field: High Jump
1077,"(Figure Skating, Track and Field: Pole Vault)",0.860484,0.537495,Figure Skating,Track and Field: Pole Vault
1094,"(Figure Skating, Track and Field: Long, Triple...",0.876143,0.533009,Figure Skating,"Track and Field: Long, Triple jumps"
1083,"(Figure Skating, Surfing)",0.954715,0.511584,Figure Skating,Surfing
1066,"(Figure Skating, Wrestling)",0.973582,0.506693,Figure Skating,Wrestling
1106,"(Figure Skating, Water Skiing)",1.018623,0.495387,Figure Skating,Water Skiing
1097,"(Figure Skating, Skateboarding)",1.022014,0.494556,Figure Skating,Skateboarding
1072,"(Figure Skating, Skiing: Alpine)",1.043223,0.489423,Figure Skating,Skiing: Alpine


In [48]:
data.head()

Unnamed: 0,Sport,Total,Rank,Endurance_scaled,Strength_scaled,Power_scaled,Speed_scaled,Agility_scaled,Flexibility_scaled,Nerve_scaled,Durability_scaled,Hand-Eye Coordination_scaled,Analytical Aptitude_scaled
0,Boxing,72.375,1,0.884125,0.864242,0.868235,0.621622,0.724138,0.366404,0.888889,1.0,0.694708,0.64381
1,Ice Hockey,71.75,2,0.724218,0.74303,0.78,0.76973,0.914483,0.422773,0.568889,0.967742,0.762551,1.0
2,Football,68.375,3,0.507532,0.924848,0.809412,0.702703,0.742069,0.366404,0.707778,1.0,0.49118,0.929524
3,Basketball,67.875,4,0.739282,0.636364,0.617647,0.715676,0.983448,0.507328,0.361111,0.903226,0.762551,0.977143
4,Wrestling,63.5,5,0.652375,0.894545,0.691765,0.486486,0.742069,0.718151,0.457778,0.774194,0.321574,0.786667
5,Martial Arts,63.375,6,0.463499,0.591515,0.764706,0.621622,0.689655,0.661781,0.638889,0.661935,0.559023,0.881905
6,Tennis,62.75,7,0.724218,0.500606,0.691765,0.661622,0.931034,0.507328,0.235556,0.548387,0.881954,0.857143
7,Gymnastics,62.5,8,0.507532,0.621818,0.632941,0.472432,0.742069,1.0,0.735556,0.790968,0.355495,0.358095
8,Baseball/Softball,62.25,9,0.420626,0.575758,0.750588,0.634595,0.793103,0.408117,0.472222,0.629677,1.0,0.761905
9,Soccer,61.5,10,0.782155,0.424242,0.456471,0.715676,1.0,0.408117,0.305556,0.709677,0.626866,1.0


In [8]:
distances

[6.261309767133393,
 6.131688185157496,
 9.15697548320405,
 9.366771055171576,
 8.992229979265435,
 11.583350119891916,
 11.410258542206659,
 10.0808729780709,
 11.61610950361609,
 8.948195348784024,
 8.218284492520324,
 6.806173668075186,
 9.940462765887716,
 9.117653206829049,
 11.699153815554356,
 11.592359552739902,
 11.86076726017335,
 13.223048060110804,
 9.878137476265454,
 12.723505806184082,
 15.43576366753521,
 11.730848221676046,
 13.652523576247729,
 12.56886629732372,
 14.162944609084656,
 12.490356279946544,
 12.65678474178968,
 12.860031104161454,
 17.45040400678449,
 13.537318789184217,
 16.074831258834415,
 15.37250792811635,
 15.093217019575386,
 16.089791794799584,
 14.933365327346682,
 13.846263033757523,
 17.24812453572852,
 14.9198793560806,
 14.045782285084732,
 15.85020504599231,
 14.526369126522981,
 15.74830149571693,
 15.677097945729626,
 16.60264436769035,
 13.877557421967314,
 19.12986147362286,
 16.526124772613816,
 15.075111939882904,
 16.527637459721824,

In [15]:
similarities

[0.5464992687641909,
 0.5587940342263228,
 0.4701036932375471,
 0.4722236916931923,
 0.4767381962491177,
 0.4185953135316908,
 0.4212371739593795,
 0.45174479900964093,
 0.4122565309972795,
 0.48029376315032457,
 0.5132418615085499,
 0.5466463778780348,
 0.45623814514919664,
 0.46824468723422635,
 0.40607282208167794,
 0.4200243396637145,
 0.3943553436911735,
 0.37913144900230017,
 0.44421610677078743,
 0.40135682519709087,
 0.3506397696454211,
 0.41273210085636103,
 0.3777502913202079,
 0.39094306248278726,
 0.3732708722153197,
 0.392669283045627,
 0.3851856423489854,
 0.37927331694723904,
 0.32569909805373953,
 0.3752926136702264,
 0.33674478140695285,
 0.34054898570960496,
 0.3463256030657856,
 0.3309565470524872,
 0.34519520220062355,
 0.3656198561092647,
 0.3156927882567516,
 0.34897853065786294,
 0.36824112996670844,
 0.33833098490970653,
 0.3557861340326248,
 0.3390885781544874,
 0.32904112919542916,
 0.323331451736405,
 0.36419805050602155,
 0.3049829975814759,
 0.3272257374892

In [13]:
similarities

[0.1377162016315933,
 0.14021925440896377,
 0.09845450564034904,
 0.09646205117080686,
 0.10007776062751446,
 0.07947009265991793,
 0.08057849855416395,
 0.09024559725384496,
 0.0792637381368143,
 0.1005207442093737,
 0.1084800540503383,
 0.12810373462349267,
 0.09140381183125021,
 0.098837149243763,
 0.07874540418395169,
 0.07941323433561072,
 0.07775585855571428,
 0.07030841742035213,
 0.09192750157661249,
 0.07286767784580091,
 0.06084292888533392,
 0.07854936156550515,
 0.06824762948145234,
 0.07369812466920961,
 0.06595025081083952,
 0.07412702668842802,
 0.07322367738139754,
 0.07214991023358898,
 0.05419935518118113,
 0.06878847568122405,
 0.05856573250073004,
 0.06107799760369697,
 0.06213798016789466,
 0.05851446360536116,
 0.0627613802517717,
 0.06735701756908077,
 0.05480015209464796,
 0.06281454636891139,
 0.0664638089965801,
 0.05934645882768306,
 0.06440655840725479,
 0.05970754707608599,
 0.05996247088397427,
 0.05680964627311905,
 0.06721533459003556,
 0.049677440717133