In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from category_encoders import OrdinalEncoder
import numpy as np
from feature_engine.selection import DropConstantFeatures
from sklearn.metrics.pairwise import paired_distances
from statistics import mode
from sklearn.metrics import classification_report,confusion_matrix,f1_score
from sklearn.model_selection import train_test_split
import itertools
from collections import Counter
from sklearn.linear_model import LogisticRegression


In [2]:
data_for_train_test_split = pd.read_csv("../../data/train.csv")
data_for_train_test_split = data_for_train_test_split.loc[:,((data_for_train_test_split.columns!='PassengerId') & (data_for_train_test_split.columns!='Name'))]

train = data_for_train_test_split[0:int(0.5*len(data_for_train_test_split))]
test = data_for_train_test_split[int(0.5*len(data_for_train_test_split)):]



In [3]:
train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,445.0,445.0,357.0,445.0,445.0,445.0
mean,0.391011,2.34382,28.660364,0.588764,0.391011,32.375879
std,0.488526,0.817042,14.040557,1.181448,0.796903,49.422441
min,0.0,1.0,0.83,0.0,0.0,0.0
25%,0.0,2.0,20.0,0.0,0.0,7.925
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,36.0,1.0,0.0,30.0708
max,1.0,3.0,71.0,8.0,5.0,512.3292


In [4]:
test.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,446.0,446.0,357.0,446.0,446.0,446.0
mean,0.376682,2.273543,30.737871,0.457399,0.372197,32.032922
std,0.485098,0.854108,14.944219,1.015266,0.815875,50.017258
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,1.0,21.0,0.0,0.0,7.8958
50%,0.0,3.0,30.0,0.0,0.0,14.47915
75%,1.0,3.0,39.0,1.0,0.0,31.275
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,female,35.0,1,0,113803,53.1,C123,S
4,0,3,male,35.0,0,0,373450,8.05,,S


In [6]:
test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
445,1,1,male,4.0,0,2,33638,81.8583,A34,S
446,1,2,female,13.0,0,1,250644,19.5,,S
447,1,1,male,34.0,0,0,113794,26.55,,S
448,1,3,female,5.0,2,1,2666,19.2583,,C
449,1,1,male,52.0,0,0,113786,30.5,C104,S


In [7]:
test['Embarked'].value_counts(dropna=False)

S      324
C       87
Q       34
NaN      1
Name: Embarked, dtype: int64

In [8]:
test=test.dropna(subset=['Survived'])
train=train.dropna(subset=['Survived'])

In [9]:
test['Survived'].value_counts(dropna=False)

0    278
1    168
Name: Survived, dtype: int64

In [10]:

y_train = train.loc[:,train.columns=='Survived']
X_train = train.loc[:,train.columns!='Survived']
y_test = test.loc[:,test.columns=='Survived']
X_test = test.loc[:,test.columns!='Survived']

In [11]:
y_train

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
440,1
441,0
442,0
443,1


In [12]:
int_cols =  X_train.select_dtypes(include=['int']).columns.tolist()
float_cols =  X_train.select_dtypes(include=['float']).columns.tolist()
cat_cols =  X_train.select_dtypes(include=['object']).columns.tolist()


In [13]:
pipeline =Pipeline([
            # drop constant features
            ('dropconstantfeatures',DropConstantFeatures(tol=0.8, missing_values='ignore')),
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            ('floatimputer', MeanMedianImputer(
                imputation_method='mean', variables=float_cols)),
            # category missing values imputers
            ('catimputer', CategoricalImputer(variables=cat_cols)),
            #
            ('catencoder', OrdinalEncoder()),


 ])

In [14]:
X_train = pipeline.fit_transform(X_train,y_train)

In [15]:
X_test = pipeline.transform(X_test)

In [16]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,1,22.000000,1,0,1,7.2500,1,1
1,1,2,38.000000,1,0,2,71.2833,2,2
2,3,2,26.000000,0,0,3,7.9250,1,1
3,1,2,35.000000,1,0,4,53.1000,3,1
4,3,1,35.000000,0,0,5,8.0500,1,1
...,...,...,...,...,...,...,...,...,...
440,2,2,45.000000,1,1,281,26.2500,1,1
441,3,1,20.000000,0,0,374,9.5000,1,1
442,3,1,25.000000,1,0,375,7.7750,1,1
443,2,2,28.000000,0,0,376,13.0000,1,1


In [17]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
445,1,1,4.000000,0,2,-1.0,81.8583,-1.0,1
446,2,2,13.000000,0,1,243.0,19.5000,1.0,1
447,1,1,34.000000,0,0,-1.0,26.5500,1.0,1
448,3,2,5.000000,2,1,-1.0,19.2583,1.0,2
449,1,1,52.000000,0,0,-1.0,30.5000,-1.0,1
...,...,...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,-1.0,13.0000,1.0,1
887,1,2,19.000000,0,0,-1.0,30.0000,-1.0,1
888,3,2,28.660364,1,2,-1.0,23.4500,1.0,1
889,1,1,26.000000,0,0,-1.0,30.0000,-1.0,2


In [18]:
# print(X_test.isnull().mean())
# print(X_train.isnull().mean())
# print(y_test.isnull().mean())
# print(y_train.isnull().mean())
print('y_train.value_counts()')
print(y_train.value_counts())
print('y_test.value_counts()')
print(y_test.value_counts())

y_train.value_counts()
Survived
0           271
1           174
dtype: int64
y_test.value_counts()
Survived
0           278
1           168
dtype: int64


In [19]:
# test 

print('manhattan distances is -->',paired_distances([[0,1]], [[0,1]],metric='manhattan'))
print('cosine distances is -->',paired_distances([[0,1]], [[0,1]],metric='cosine'))
print('euclidean distances is -->',paired_distances([[0,1]], [[0,1]],metric='euclidean'))

print('manhattan distances is -->',paired_distances([[0,1]], [[1,0]],metric='manhattan'))
print('cosine distances is -->',paired_distances([[0,1]], [[1,0]],metric='cosine'))
print('euclidean distances is -->',paired_distances([[0,1]], [[1,0]],metric='euclidean'))

manhattan distances is --> [0.]
cosine distances is --> [0.]
euclidean distances is --> [0.]
manhattan distances is --> [2.]
cosine distances is --> [1.]
euclidean distances is --> [1.41421356]


In [20]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,1,22.000000,1,0,1,7.2500,1,1
1,1,2,38.000000,1,0,2,71.2833,2,2
2,3,2,26.000000,0,0,3,7.9250,1,1
3,1,2,35.000000,1,0,4,53.1000,3,1
4,3,1,35.000000,0,0,5,8.0500,1,1
...,...,...,...,...,...,...,...,...,...
440,2,2,45.000000,1,1,281,26.2500,1,1
441,3,1,20.000000,0,0,374,9.5000,1,1
442,3,1,25.000000,1,0,375,7.7750,1,1
443,2,2,28.000000,0,0,376,13.0000,1,1


In [21]:
def fit(*args, **kwargs):
    X_valid = kwargs['X_valid']
    y_valid = kwargs['y_valid']
    split_dict = {}
    trained = {}
    metric = kwargs['metric']
    X_train = kwargs['X_train']
    y_train = kwargs['y_train']
    X_train.reset_index(drop=True)
    y_train.reset_index(drop=True)
    if X_valid is not None:
        X_train_new, X_valid, y_train_new, y_valid = train_test_split(X_train, y_train, test_size=0.10, random_state=42)
    else:
        X_train_new=X_train.copy()
        y_train_new=y_train.copy()


    threshold = kwargs['threshold']
    number_of_intervals = kwargs['number_of_intervals']
    for col in X_train_new.columns:
        #print(col,'---->',X_train_new[col].dtypes)
        #if X_train_new[col].dtypes=='float64':
        min_col = X_train_new[col].min()
        max_col = X_train_new[col].max()
        len_col = int((max_col-min_col)/number_of_intervals)
        split_dict[col] = [min_col,max_col,len_col]
    for col in X_train_new.columns:
        if split_dict[col][2]!=0:
            #print(col,'---->',X_train_new[col].dtypes)
            X_train_new[col]=((X_train_new[col]-split_dict[col][0])/split_dict[col][2]).round(decimals=0).astype(int)
    
    for index, row in X_train_new.iterrows():
        rhs = []
        lhs = []
        #sampled_row = row.to_list()
        rhs.append(X_train_new.loc[index].to_list())
        lhs.append(y_train_new.loc[index].to_list())

        #rhs.append(sampled_row)
        #lhs.append(y_train_new.iloc[index][0])
        #print("lhs",lhs)
        for i, _ in X_train_new.iterrows():
            if i!=index:
                #print("index is --> ", index, "and i is --->",i)
                sampled_row = X_train_new.loc[index].to_list()
                candidate_row = X_train_new.loc[i].to_list()
                distance_metric = paired_distances([sampled_row], [candidate_row],metric=metric)
                if distance_metric < threshold:
                    #print(sampled_row)
                    #print(candidate_row)
                    #print('similarity between',i,index,'is:',distance_metric)
                    rhs.append(candidate_row)
                    lhs.append(y_train_new.loc[i].to_list())
                    #print('index is', index)
                    #print('row',row)
                    #print('for row', sampled_row)
                    #print('rhs',rhs)
            
        #print('for index, lhs is ----> ',index, lhs)
        lhs = list(itertools.chain(*lhs))
        #print('lhs value counts')
        #print(Counter(lhs).keys())
        #print(Counter(lhs).values())
        if len(lhs)>0:
            lhs=mode(lhs)
        #print('mode lhs')
        #print(lhs)
        rhs = [np.mean(rhs, 0).tolist()]
        trained[index]=[row,rhs,lhs]
    #index_for_print = 9
    #print('rhs for index',index_for_print, "is --> ",trained[index_for_print][1])
    #print('lhs for index',index_for_print, "is --> ",trained[index_for_print][2])
    return trained,split_dict,number_of_intervals,metric

In [22]:
trained,split_dict,number_of_intervals, metric=fit(X_train=X_train,y_train=y_train,number_of_intervals=6,threshold=0.7,metric = 'manhattan',X_valid=None,y_valid=None)


In [23]:

for index in range(100):
    print(trained[index])

[Pclass      3
Sex         1
Age         2
SibSp       1
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    1
Name: 0, dtype: int64, [[3.0, 1.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]], 0]
[Pclass      1
Sex         2
Age         3
SibSp       1
Parch       0
Ticket      0
Fare        1
Cabin       0
Embarked    2
Name: 1, dtype: int64, [[1.0, 2.0, 3.0, 1.0, 0.0, 0.0, 1.0, 0.0, 2.0]], 1]
[Pclass      3
Sex         2
Age         2
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    1
Name: 2, dtype: int64, [[3.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]], 1]
[Pclass      1
Sex         2
Age         3
SibSp       1
Parch       0
Ticket      0
Fare        1
Cabin       0
Embarked    1
Name: 3, dtype: int64, [[1.0, 2.0, 3.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0]], 1]
[Pclass      3
Sex         1
Age         3
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    1
Name: 4, dtype: int64, [[3.0, 1.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [24]:
def predict(*args, **kwargs):

    
    split_dict = {}
    trained=kwargs['trained']
    split_dict=kwargs['split_dict']
    number_of_intervals=kwargs['number_of_intervals']
    metric=kwargs['metric']
    X_test = kwargs['X_test']
    for col in X_test.columns:
        #print(col,'---->',X_train[col].dtypes)
        #if X_train[col].dtypes=='float64':
        min_col = X_test[col].min()
        max_col = X_test[col].max()
        len_col = int((max_col-min_col)/number_of_intervals)
        split_dict[col] = [min_col,max_col,len_col]
        for col in X_test.columns:
            if split_dict[col][2]!=0:
                #print(col,'---->',X_test[col].dtypes)
                X_test[col]=((X_test[col]-split_dict[col][0])/split_dict[col][2]).round(decimals=0).astype(int)
    predictions =[]
    for index, row in X_test.iterrows():
        max_membership_for_sample = []
        sampled_row = row.to_list()
        for index_in_train in trained:
            _,rhs,_ = trained[index_in_train]
            max_ds = 0
            for rh in rhs:
                #print(rhs)
                paired_d = paired_distances([sampled_row], [rh],metric=metric)
                max_ds = max_ds + paired_d[0]/len(rhs)
                #print(max_ds)
            max_membership_for_sample.append(max_ds)
        #print('max_membership_for_sample',max_membership_for_sample)
        #print('len max_membership_for_sample',len(max_membership_for_sample))
        #print('max(max_membership_for_sample)',max(max_membership_for_sample))
        min_index = min(range(len(max_membership_for_sample)), key=max_membership_for_sample.__getitem__)
        #print('min_index -->',min_index)
        #print('index in train of all data -->',trained[min_index])
        y_forecast = trained[min_index][2]
        #print('possible forecast for y is -->',y_forecast)
        predictions.append(y_forecast)

    y_pred = pd.DataFrame(columns=['y_pred'],data=predictions)
    #print('y_pred -->')
    #print(y_pred)


    return y_pred
        




In [34]:
y_pred = predict(X_test=X_test,trained=trained,split_dict=split_dict,number_of_intervals=number_of_intervals,metric=metric)




In [35]:
y_test

Unnamed: 0,Survived
445,1
446,1
447,1
448,1
449,1
...,...
886,0
887,1
888,0
889,1


In [36]:
y_pred

Unnamed: 0,y_pred
0,1
1,1
2,1
3,1
4,1
...,...
441,0
442,0
443,1
444,1


In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.67      0.72       278
           1       0.56      0.70      0.62       168

    accuracy                           0.68       446
   macro avg       0.67      0.68      0.67       446
weighted avg       0.70      0.68      0.68       446



In [38]:
print(confusion_matrix(y_test, y_pred))

[[186  92]
 [ 51 117]]


#### Xgboost

In [39]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,1,22.000000,1,0,1,7.2500,1,1
1,1,2,38.000000,1,0,2,71.2833,2,2
2,3,2,26.000000,0,0,3,7.9250,1,1
3,1,2,35.000000,1,0,4,53.1000,3,1
4,3,1,35.000000,0,0,5,8.0500,1,1
...,...,...,...,...,...,...,...,...,...
440,2,2,45.000000,1,1,281,26.2500,1,1
441,3,1,20.000000,0,0,374,9.5000,1,1
442,3,1,25.000000,1,0,375,7.7750,1,1
443,2,2,28.000000,0,0,376,13.0000,1,1


In [40]:
y_train

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
440,1
441,0
442,0
443,1


In [41]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
445,1,1,0,0,2,0,0,0,1
446,2,2,0,0,1,0,0,0,1
447,1,1,0,0,0,0,0,0,1
448,3,2,0,2,1,0,0,0,2
449,1,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
886,2,1,0,0,0,0,0,0,1
887,1,2,0,0,0,0,0,0,1
888,3,2,0,1,2,0,0,0,1
889,1,1,0,0,0,0,0,0,2


In [42]:
y_test

Unnamed: 0,Survived
445,1
446,1
447,1
448,1
449,1
...,...
886,0
887,1
888,0
889,1


In [30]:
import xgboost

model = xgboost.XGBClassifier().fit(X_train,y_train)


In [31]:
y_pred=model.predict(X_test)

In [32]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.03      0.06       278
           1       0.38      1.00      0.55       168

    accuracy                           0.39       446
   macro avg       0.69      0.51      0.31       446
weighted avg       0.77      0.39      0.24       446



In [33]:
print(confusion_matrix(y_test, y_pred))

[[  8 270]
 [  0 168]]


#### Logistic Regression

In [64]:
model_lr = LogisticRegression(max_iter=1000).fit(X_train,y_train.values.ravel())


In [65]:
y_pred=model_lr.predict(X_test)

In [66]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.74      0.80       278
           1       0.65      0.80      0.72       168

    accuracy                           0.76       446
   macro avg       0.75      0.77      0.76       446
weighted avg       0.78      0.76      0.77       446



In [67]:
print(confusion_matrix(y_test, y_pred))

[[206  72]
 [ 34 134]]


#### SVC

In [68]:
from sklearn.svm import SVC

In [69]:
model_svc= SVC().fit(X_train,y_train.values.ravel())

In [70]:
y_pred=model_svc.predict(X_test)

In [71]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      1.00      0.77       278
           1       0.00      0.00      0.00       168

    accuracy                           0.62       446
   macro avg       0.31      0.50      0.38       446
weighted avg       0.39      0.62      0.48       446



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [72]:
import lightgbm


In [73]:
model_gbm= lightgbm.LGBMClassifier().fit(X_train,y_train.values.ravel())

In [74]:
y_pred=model_gbm.predict(X_test)

In [75]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.03      0.06       278
           1       0.38      1.00      0.56       168

    accuracy                           0.40       446
   macro avg       0.69      0.52      0.31       446
weighted avg       0.77      0.40      0.25       446



In [76]:
print(confusion_matrix(y_test, y_pred))

[[  9 269]
 [  0 168]]
