In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from category_encoders import OrdinalEncoder
import numpy as np
from feature_engine.selection import DropConstantFeatures
from sklearn.metrics.pairwise import paired_distances
from statistics import mode
from sklearn.metrics import classification_report,confusion_matrix,f1_score
from sklearn.model_selection import train_test_split
import itertools
from collections import Counter


In [2]:
train = pd.read_csv("..//data/train.csv")
test = pd.read_csv("../data/train.csv")
train = train.loc[:,((train.columns!='PassengerId') & (train.columns!='Name'))]
test = test.loc[:,((test.columns!='PassengerId') & (test.columns!='Name'))]

In [3]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,female,35.0,1,0,113803,53.1,C123,S
4,0,3,male,35.0,0,0,373450,8.05,,S


In [4]:
test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,female,35.0,1,0,113803,53.1,C123,S
4,0,3,male,35.0,0,0,373450,8.05,,S


In [5]:
test['Embarked'].value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

In [6]:
test=test.dropna(subset=['Survived'])
train=train.dropna(subset=['Survived'])

In [7]:
test['Survived'].value_counts(dropna=False)

0    549
1    342
Name: Survived, dtype: int64

In [8]:

y_train = train.loc[:,train.columns=='Survived']
X_train = train.loc[:,train.columns!='Survived']
y_test = test.loc[:,test.columns=='Survived']
X_test = test.loc[:,test.columns!='Survived']

In [9]:
y_train

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [10]:
int_cols =  X_train.select_dtypes(include=['int']).columns.tolist()
float_cols =  X_train.select_dtypes(include=['float']).columns.tolist()
cat_cols =  X_train.select_dtypes(include=['object']).columns.tolist()


In [11]:
pipeline =Pipeline([
            # drop constant features
            ('dropconstantfeatures',DropConstantFeatures(tol=0.8, missing_values='ignore')),
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            ('floatimputer', MeanMedianImputer(
                imputation_method='mean', variables=float_cols)),
            # category missing values imputers
            ('catimputer', CategoricalImputer(variables=cat_cols)),
            #
            ('catencoder', OrdinalEncoder()),


 ])

In [12]:
X_train = pipeline.fit_transform(X_train,y_train)

In [13]:
X_test = pipeline.transform(X_test)

In [14]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,1,22.000000,1,0,1,7.2500,1,1
1,1,2,38.000000,1,0,2,71.2833,2,2
2,3,2,26.000000,0,0,3,7.9250,1,1
3,1,2,35.000000,1,0,4,53.1000,3,1
4,3,1,35.000000,0,0,5,8.0500,1,1
...,...,...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,678,13.0000,1,1
887,1,2,19.000000,0,0,679,30.0000,147,1
888,3,2,29.699118,1,2,615,23.4500,1,1
889,1,1,26.000000,0,0,680,30.0000,148,2


In [15]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,1,22.000000,1,0,1,7.2500,1,1
1,1,2,38.000000,1,0,2,71.2833,2,2
2,3,2,26.000000,0,0,3,7.9250,1,1
3,1,2,35.000000,1,0,4,53.1000,3,1
4,3,1,35.000000,0,0,5,8.0500,1,1
...,...,...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,678,13.0000,1,1
887,1,2,19.000000,0,0,679,30.0000,147,1
888,3,2,29.699118,1,2,615,23.4500,1,1
889,1,1,26.000000,0,0,680,30.0000,148,2


In [16]:
# print(X_test.isnull().mean())
# print(X_train.isnull().mean())
# print(y_test.isnull().mean())
# print(y_train.isnull().mean())
print('y_train.value_counts()')
print(y_train.value_counts())
print('y_test.value_counts()')
print(y_test.value_counts())

y_train.value_counts()
Survived
0           549
1           342
dtype: int64
y_test.value_counts()
Survived
0           549
1           342
dtype: int64


In [17]:
# test 

print('manhattan distances is -->',paired_distances([[0,1]], [[0,1]],metric='manhattan'))
print('cosine distances is -->',paired_distances([[0,1]], [[0,1]],metric='cosine'))
print('euclidean distances is -->',paired_distances([[0,1]], [[0,1]],metric='euclidean'))

print('manhattan distances is -->',paired_distances([[0,1]], [[1,0]],metric='manhattan'))
print('cosine distances is -->',paired_distances([[0,1]], [[1,0]],metric='cosine'))
print('euclidean distances is -->',paired_distances([[0,1]], [[1,0]],metric='euclidean'))

manhattan distances is --> [0.]
cosine distances is --> [0.]
euclidean distances is --> [0.]
manhattan distances is --> [2.]
cosine distances is --> [1.]
euclidean distances is --> [1.41421356]


In [18]:
def fit(*args, **kwargs):
    X_valid = kwargs['X_valid']
    y_valid = kwargs['y_valid']
    split_dict = {}
    trained = {}
    metric = kwargs['metric']
    X_train = kwargs['X_train']
    y_train = kwargs['y_train']
    X_train.reset_index(drop=True)
    y_train.reset_index(drop=True)
    if X_valid is not None:
        X_train_new, X_valid, y_train_new, y_valid = train_test_split(X_train, y_train, test_size=0.10, random_state=42)
    else:
        X_train_new=X_train.copy()
        y_train_new=y_train.copy()


    threshold = kwargs['threshold']
    number_of_intervals = kwargs['number_of_intervals']
    for col in X_train_new.columns:
        #print(col,'---->',X_train_new[col].dtypes)
        #if X_train_new[col].dtypes=='float64':
        min_col = X_train_new[col].min()
        max_col = X_train_new[col].max()
        len_col = int((max_col-min_col)/number_of_intervals)
        split_dict[col] = [min_col,max_col,len_col]
    for col in X_train_new.columns:
        if split_dict[col][2]!=0:
            #print(col,'---->',X_train_new[col].dtypes)
            X_train_new[col]=((X_train_new[col]-split_dict[col][0])/split_dict[col][2]).round(decimals=0).astype(int)
    
    for index, row in X_train_new.iterrows():
        rhs = []
        lhs = []
        #sampled_row = row.to_list()
        rhs.append(X_train_new.loc[index].to_list())
        lhs.append(y_train_new.loc[index].to_list())

        #rhs.append(sampled_row)
        #lhs.append(y_train_new.iloc[index][0])
        #print("lhs",lhs)
        for i, _ in X_train_new.iterrows():
            if i!=index:
                #print("index is --> ", index, "and i is --->",i)
                sampled_row = X_train_new.loc[index].to_list()
                candidate_row = X_train_new.loc[i].to_list()
                distance_metric = paired_distances([sampled_row], [candidate_row],metric=metric)
                if distance_metric < threshold:
                    #print(sampled_row)
                    #print(candidate_row)
                    #print('similarity between',i,index,'is:',distance_metric)
                    rhs.append(candidate_row)
                    lhs.append(y_train_new.loc[i].to_list())
                    #print('index is', index)
                    #print('row',row)
                    #print('for row', sampled_row)
                    #print('rhs',rhs)
            
        #print('for index, lhs is ----> ',index, lhs)
        lhs = list(itertools.chain(*lhs))
        #print('lhs value counts')
        #print(Counter(lhs).keys())
        #print(Counter(lhs).values())
        if len(lhs)>0:
            lhs=mode(lhs)
        #print('mode lhs')
        #print(lhs)
        rhs = [np.mean(rhs, 0).tolist()]
        trained[index]=[row,rhs,lhs]
    #index_for_print = 9
    #print('rhs for index',index_for_print, "is --> ",trained[index_for_print][1])
    #print('lhs for index',index_for_print, "is --> ",trained[index_for_print][2])
    return trained,split_dict,number_of_intervals,metric

In [19]:
trained,split_dict,number_of_intervals, metric=fit(X_train=X_train,y_train=y_train,number_of_intervals=6,threshold=1,metric = 'euclidean',X_valid=None,y_valid=None)


In [20]:

for index in range(100):
    print(trained[index])

[Pclass      3
Sex         1
Age         2
SibSp       1
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    1
Name: 0, dtype: int64, [[3.0, 1.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]], 0]
[Pclass      1
Sex         2
Age         3
SibSp       1
Parch       0
Ticket      0
Fare        1
Cabin       0
Embarked    2
Name: 1, dtype: int64, [[1.0, 2.0, 3.0, 1.0, 0.0, 0.0, 1.0, 0.0, 2.0]], 1]
[Pclass      3
Sex         2
Age         2
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    1
Name: 2, dtype: int64, [[3.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]], 1]
[Pclass      1
Sex         2
Age         3
SibSp       1
Parch       0
Ticket      0
Fare        1
Cabin       0
Embarked    1
Name: 3, dtype: int64, [[1.0, 2.0, 3.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0]], 1]
[Pclass      3
Sex         1
Age         3
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    1
Name: 4, dtype: int64, [[3.0, 1.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [21]:
def predict(*args, **kwargs):

    
    split_dict = {}
    trained=kwargs['trained']
    split_dict=kwargs['split_dict']
    number_of_intervals=kwargs['number_of_intervals']
    metric=kwargs['metric']
    X_test = kwargs['X_test']
    for col in X_test.columns:
        #print(col,'---->',X_train[col].dtypes)
        #if X_train[col].dtypes=='float64':
        min_col = X_test[col].min()
        max_col = X_test[col].max()
        len_col = int((max_col-min_col)/number_of_intervals)
        split_dict[col] = [min_col,max_col,len_col]
        for col in X_test.columns:
            if split_dict[col][2]!=0:
                #print(col,'---->',X_test[col].dtypes)
                X_test[col]=((X_test[col]-split_dict[col][0])/split_dict[col][2]).round(decimals=0).astype(int)
    predictions =[]
    for index, row in X_test.iterrows():
        max_membership_for_sample = []
        sampled_row = row.to_list()
        for index_in_train in trained:
            _,rhs,_ = trained[index_in_train]
            max_ds = 0
            for rh in rhs:
                #print(rhs)
                paired_d = paired_distances([sampled_row], [rh],metric=metric)
                max_ds = max_ds + paired_d[0]/len(rhs)
                #print(max_ds)
            max_membership_for_sample.append(max_ds)
        #print('max_membership_for_sample',max_membership_for_sample)
        #print('len max_membership_for_sample',len(max_membership_for_sample))
        #print('max(max_membership_for_sample)',max(max_membership_for_sample))
        min_index = min(range(len(max_membership_for_sample)), key=max_membership_for_sample.__getitem__)
        #print('min_index -->',min_index)
        #print('index in train of all data -->',trained[min_index])
        y_forecast = trained[min_index][2]
        #print('possible forecast for y is -->',y_forecast)
        predictions.append(y_forecast)

    y_pred = pd.DataFrame(columns=['y_pred'],data=predictions)
    #print('y_pred -->')
    #print(y_pred)


    return y_pred
        




In [22]:
y_pred = predict(X_test=X_test,trained=trained,split_dict=split_dict,number_of_intervals=number_of_intervals,metric=metric)

res = pd.DataFrame(data=pd.concat([y_pred,y_test], axis=1))

print(res)

     y_pred  Survived
0         1         0
1         1         1
2         0         1
3         1         1
4         0         0
..      ...       ...
886       0         0
887       1         1
888       1         0
889       0         1
890       1         0

[891 rows x 2 columns]


In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.70      0.73       549
           1       0.58      0.66      0.62       342

    accuracy                           0.69       891
   macro avg       0.68      0.68      0.68       891
weighted avg       0.70      0.69      0.69       891



In [24]:
print(confusion_matrix(y_test, y_pred))

[[385 164]
 [115 227]]
