In [95]:
import sklearn
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,FunctionTransformer,StandardScaler
from sklearn.compose import ColumnTransformer,make_column_selector
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt

import numpy as np

from scipy import stats

import pandas as pd

In [298]:
# Load the Titanic training data; `df` stays as read and `df1` is the working copy.
df = pd.read_csv('/mnt/All/Downloads/Datasets/titanic/train.csv')
df1 = df.copy()

# Target: the 'Survived' column.  Features: every column except the label and the row id.
dfy = df1['Survived']
dfx = df1.drop(columns=['Survived', 'PassengerId'])


In [299]:
# Print per-column dtypes and non-null counts for the raw training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [371]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ('inputer', SimpleImputer(strategy='median')),
    ('standardize', StandardScaler()),
]
nump = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode while dropping the first level of every feature.
categorical_steps = [
    ('inputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(
        drop='first',
        handle_unknown='infrequent_if_exist',
        min_frequency=1,
    )),
]
catp = Pipeline(steps=categorical_steps)

def ret_rel(a, b):
    """Return the output feature name for the summed-relatives transformer.

    Intended as a ``feature_names_out`` callable: both positional arguments
    are accepted and ignored, and a fresh one-element list is returned on
    every call.
    """
    feature_names = ['Relatives']
    return feature_names
def ret_age(a, b):
    """Return the output feature name for the age-bucketing transformer.

    Intended as a ``feature_names_out`` callable: both positional arguments
    are accepted and ignored, and a fresh one-element list is returned on
    every call.
    """
    feature_names = ['Agebucket']
    return feature_names

# Optional engineered features (both are currently disabled in `first_proc` below).
# The relatives pipeline is named `relatives_pipe` rather than `sum` so that the
# built-in sum() is not shadowed for the rest of the kernel session.
relatives_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    # Row-wise total of the selected columns, wrapped as a single-column frame.
    FunctionTransformer(lambda values: pd.DataFrame(values.sum(axis=1)), feature_names_out=ret_rel),
    StandardScaler(),
)
ages = make_pipeline(
    SimpleImputer(strategy='median'),
    # Floor every value to the lower edge of its 15-wide bucket (0, 15, 30, ...).
    FunctionTransformer(lambda values: values // 15 * 15, feature_names_out=ret_age),
    StandardScaler(),
)

# Preprocessing actually used by the models: one-hot encoded categoricals plus
# imputed and scaled numerics; every other column is dropped.
first_proc = ColumnTransformer([
    # ('Relatives',relatives_pipe,['SibSp','Parch']),
    # ('Agebucket',ages,['Age']),
    ('embarked', catp, ["Pclass", "Sex", "Embarked"]),
    ('tail', nump, ["Age", "SibSp", "Parch", "Fare"])
], remainder='drop')

# Fit on the training frame and display the transformed matrix with its feature names.
first_trans = first_proc.fit_transform(df)
pd.DataFrame(first_trans, columns=first_proc.get_feature_names_out())

Unnamed: 0,embarked__Pclass_2,embarked__Pclass_3,embarked__Sex_male,embarked__Embarked_Q,embarked__Embarked_S,tail__Age,tail__SibSp,tail__Parch,tail__Fare
0,0.0,1.0,1.0,0.0,1.0,-0.565736,0.432793,-0.473674,-0.502445
1,0.0,0.0,0.0,0.0,0.0,0.663861,0.432793,-0.473674,0.786845
2,0.0,1.0,0.0,0.0,1.0,-0.258337,-0.474545,-0.473674,-0.488854
3,0.0,0.0,0.0,0.0,1.0,0.433312,0.432793,-0.473674,0.420730
4,0.0,1.0,1.0,0.0,1.0,0.433312,-0.474545,-0.473674,-0.486337
...,...,...,...,...,...,...,...,...,...
886,1.0,0.0,1.0,0.0,1.0,-0.181487,-0.474545,-0.473674,-0.386671
887,0.0,0.0,0.0,0.0,1.0,-0.796286,-0.474545,-0.473674,-0.044381
888,0.0,1.0,0.0,0.0,1.0,-0.104637,0.432793,2.008933,-0.176263
889,0.0,0.0,1.0,0.0,0.0,-0.258337,-0.474545,-0.473674,-0.044381


In [372]:
from sklearn.svm import SVC

# Baseline comparison: fit each candidate on the full training set, then print
# the estimator, its accuracy on that same data, and its mean 10-fold CV accuracy.
candidates = [RandomForestClassifier(random_state=42), SVC(random_state=42)]
for x in candidates:
    poison = make_pipeline(first_proc, x)
    poison.fit(dfx, dfy)
    pred = poison.predict(dfx)
    train_accuracy = accuracy_score(dfy, pred)
    print(x)
    print(train_accuracy)
    cv_scores = cross_val_score(poison, dfx, dfy, cv=10)
    print(cv_scores.mean())
# After the loop, `poison` is the last pipeline built (the SVC one).

RandomForestClassifier(random_state=42)
0.9797979797979798
0.8070661672908864
SVC(random_state=42)
0.8372615039281706
0.823820224719101


In [373]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the SVC step's C; the random seed is pinned to 42.
params = {
    'svc__C': [0.8, 0.9, 1, 1.4, 1.5, 1.6, 2, 2.5, 3],
    'svc__random_state': [42],
}

# 10-fold, accuracy-scored grid search over the `poison` pipeline.
search = GridSearchCV(poison, param_grid=params, cv=10, scoring='accuracy')
search.fit(dfx, dfy)
print(search.best_params_)

# Keep only the winning pipeline for the following cells.
bestmodel = search.best_estimator_


{'svc__C': 2, 'svc__random_state': 42}


In [374]:
# Mean 10-fold cross-validated accuracy of the selected pipeline ...
cv_accuracy = cross_val_score(bestmodel, dfx, dfy, cv=10, scoring='accuracy').mean()
print(cv_accuracy)

# ... followed by its accuracy on the full training set.
pred = bestmodel.predict(dfx)
print(accuracy_score(dfy, pred))


0.8249438202247191
0.8428731762065096


In [375]:
# Load the Titanic test set that the submission predictions are made for.
test=pd.read_csv('/mnt/All/Downloads/Datasets/titanic/test.csv')

In [376]:
# Predict with the tuned pipeline selected by the grid search (`bestmodel`).
# `poison` is still the default-C SVC pipeline: GridSearchCV fits clones of the
# estimator it is given, so the search never updates `poison` itself.
predt = bestmodel.predict(test)

# One row per test passenger, indexed by PassengerId.
answer = pd.DataFrame({'Survived': predt}, index=test['PassengerId'])
answer

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0


In [377]:
# Write the predictions to disk; the PassengerId index is written alongside 'Survived'.
answer.to_csv('/mnt/All/Downloads/Datasets/titanic/answer.csv')