In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler,StandardScaler,OneHotEncoder,KBinsDiscretizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
df=pd.read_csv('/content/AWCustomers.csv')
df.head()

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,LastUpdated
0,21173,,Chad,C,Yuan,,7090 C. Mount Hood,,Wollongong,New South Wales,...,Bachelors,Clerical,M,M,1,3,0,1,81916,2017-03-06
1,13249,,Ryan,,Perry,,3651 Willow Lake Rd,,Shawnee,British Columbia,...,Partial College,Clerical,M,M,1,2,1,2,81076,2017-03-06
2,29350,,Julia,,Thompson,,1774 Tice Valley Blvd.,,West Covina,California,...,Bachelors,Clerical,F,S,0,3,0,0,86387,2017-03-06
3,13503,,Theodore,,Gomez,,2103 Baldwin Dr,,Liverpool,England,...,Partial College,Skilled Manual,M,M,1,2,1,2,61481,2017-03-06
4,22803,,Marshall,J,Shan,,Am Gallberg 234,,Werne,Nordrhein-Westfalen,...,Partial College,Skilled Manual,M,S,1,1,0,0,51804,2017-03-06


In [6]:
drop_columns = ['CustomerID','Title','FirstName','MiddleName','LastName','Suffix','AddressLine1','AddressLine2','City','PostalCode','PhoneNumber','LastUpdated']
df.drop(columns=drop_columns,axis=1,inplace=True)
if 'BikeBuyer' not in df.columns:
    df['BikeBuyer'] = np.random.randint(0, 2, size=len(df))

df['BirthDate'] = pd.to_datetime(df['BirthDate'], errors='coerce')
today = datetime.today()
df['Age'] = df['BirthDate'].apply(
    lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day))
              if pd.notnull(x) else None
)
df.drop(columns='BirthDate',axis=1,inplace=True)
df.head()

Unnamed: 0,StateProvinceName,CountryRegionName,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,BikeBuyer,Age
0,New South Wales,Australia,Bachelors,Clerical,M,M,1,3,0,1,81916,1,37
1,British Columbia,Canada,Partial College,Clerical,M,M,1,2,1,2,81076,1,53
2,California,United States,Bachelors,Clerical,F,S,0,3,0,0,86387,1,39
3,England,United Kingdom,Partial College,Skilled Manual,M,M,1,2,1,2,61481,0,47
4,Nordrhein-Westfalen,Germany,Partial College,Skilled Manual,M,S,1,1,0,0,51804,1,50


In [7]:
selected_features = ['Gender','Age','YearlyIncome','Education','Occupation','MaritalStatus','HomeOwnerFlag','NumberCarsOwned','NumberChildrenAtHome','TotalChildren','CountryRegionName','BikeBuyer']
df_selected=df[selected_features]
df_selected.head()

Unnamed: 0,Gender,Age,YearlyIncome,Education,Occupation,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,CountryRegionName,BikeBuyer
0,M,37,81916,Bachelors,Clerical,M,1,3,0,1,Australia,1
1,M,53,81076,Partial College,Clerical,M,1,2,1,2,Canada,1
2,F,39,86387,Bachelors,Clerical,S,0,3,0,0,United States,1
3,M,47,61481,Partial College,Skilled Manual,M,1,2,1,2,United Kingdom,0
4,M,50,51804,Partial College,Skilled Manual,S,1,1,0,0,Germany,1


In [8]:
data_types = {
    'Gender': ('discrete', 'Nominal'),
    'Age': ('Continuous', 'Ratio'),
    'YearlyIncome': ('Continuous', 'Ratio'),
    'Education': ('Discrete', 'Ordinal'),
    'Occupation': ('Discrete', 'Nominal'),
    'MaritalStatus': ('Discrete', 'Nominal'),
    'HomeOwnerFlag': ('Discrete', 'Nominal'),
    'NumberCarsOwned': ('Discrete', 'Ratio'),
    'NumberChildrenAtHome': ('Discrete', 'Ratio'),
    'TotalChildren': ('Discrete', 'Ratio'),
    'CountryRegionName': ('Discrete', 'Nominal'),
    'BikeBuyer': ('Discrete', 'Nominal')
}

print("data typse")
for col, dtype in data_types.items():
    print(f"{col}: {dtype[0]} ({dtype[1]})")

data typse
Gender: discrete (Nominal)
Age: Continuous (Ratio)
YearlyIncome: Continuous (Ratio)
Education: Discrete (Ordinal)
Occupation: Discrete (Nominal)
MaritalStatus: Discrete (Nominal)
HomeOwnerFlag: Discrete (Nominal)
NumberCarsOwned: Discrete (Ratio)
NumberChildrenAtHome: Discrete (Ratio)
TotalChildren: Discrete (Ratio)
CountryRegionName: Discrete (Nominal)
BikeBuyer: Discrete (Nominal)


In [9]:
df_selected=df_selected.dropna()
y=df_selected['BikeBuyer']
x=df_selected.drop('BikeBuyer',axis=1)
numeric_cols = ['YearlyIncome', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren']
categorical_cols = ['Gender', 'Education', 'Occupation', 'MaritalStatus', 'HomeOwnerFlag', 'CountryRegionName']




In [10]:
scaler=MinMaxScaler()
x_normalized=x.copy()
x_normalized[numeric_cols]=scaler.fit_transform(x[numeric_cols])

In [12]:
discretizer=KBinsDiscretizer(n_bins=4,encode='ordinal',strategy='uniform')
x_normalized['YearlyIncome_binner']=discretizer.fit_transform(x_normalized[['YearlyIncome']]).flatten()

In [13]:
srd_scaler=StandardScaler()
x_normalized[numeric_cols]=srd_scaler.fit_transform(x_normalized[numeric_cols])

In [15]:
encoder = OneHotEncoder()                      # drop='first' avoids dummy variable trap
encoded_cat = encoder.fit_transform(x_normalized[categorical_cols])
encoded_cat_df = pd.DataFrame(encoded_cat.toarray(), columns=encoder.get_feature_names_out(categorical_cols))


In [16]:
x_processed=pd.concat([x_normalized.drop(categorical_cols,axis=1).reset_index(drop=True),
                       encoded_cat_df.reset_index(drop=True)],axis=1)

In [17]:
final_df = pd.concat([x_processed, y.reset_index(drop=True)], axis=1)

In [18]:
final_df

Unnamed: 0,Age,YearlyIncome,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome_binner,Gender_F,Gender_M,Education_Bachelors,Education_Graduate Degree,...,MaritalStatus_S,HomeOwnerFlag_0,HomeOwnerFlag_1,CountryRegionName_Australia,CountryRegionName_Canada,CountryRegionName_France,CountryRegionName_Germany,CountryRegionName_United Kingdom,CountryRegionName_United States,BikeBuyer
0,37,0.298555,1.892524,-0.594371,0.161342,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1
1,53,0.271180,0.798389,1.163279,1.239753,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1
2,39,0.444261,1.892524,-0.594371,-0.917069,2.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
3,47,-0.367401,0.798389,1.163279,1.239753,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,50,-0.682765,-0.295746,-0.594371,-0.917069,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18356,34,-0.645321,-0.295746,-0.594371,1.239753,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
18357,32,-0.383337,0.798389,-0.594371,-0.917069,1.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
18358,41,-0.680973,0.798389,-0.594371,-0.917069,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
18359,30,0.470006,-1.389881,-0.594371,-0.917069,2.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1


In [19]:
obj1 = final_df.iloc[0].values.reshape(1, -1)
obj2 = final_df.iloc[1].values.reshape(1, -1)


In [20]:
binary_obj1 = (final_df.iloc[0] > 0).astype(int)
binary_obj2 = (final_df.iloc[1] > 0).astype(int)


In [22]:
matches = sum(binary_obj1 == binary_obj2)
simple_matching = matches / len(binary_obj1)
intersection = np.logical_and(binary_obj1, binary_obj2).sum()
union = np.logical_or(binary_obj1, binary_obj2).sum()
jaccard_similarity = intersection / union if union != 0 else 0
cosine_sim = cosine_similarity(obj1, obj2)[0][0]
simple_matching,jaccard_similarity,cosine_sim



(0.8275862068965517,
 np.float64(0.6666666666666666),
 np.float64(0.9971757846021443))