In [1]:
# Absolute root directory for datasets; none of the other visible cells reference it.
DATASETS_PATH = "/datasets/"

In [18]:
import os
import tarfile
import urllib
import sklearn.model_selection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from zlib import crc32
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Directory, relative to the working directory, that holds the Titanic CSV files.
TITANIC_PATH = os.path.join("datasets", "titanic")


def _read_titanic_csv(filename, titanic_path=TITANIC_PATH):
    """Read ``filename`` from ``titanic_path`` into a DataFrame (shared by both loaders)."""
    return pd.read_csv(os.path.join(titanic_path, filename))


def load_titanic_data(titanic_path=TITANIC_PATH):
    """Load the training split.

    Parameters
    ----------
    titanic_path : str
        Directory containing ``train.csv``.

    Returns
    -------
    pandas.DataFrame
        Contents of ``train.csv`` as parsed by ``pandas.read_csv``.
    """
    return _read_titanic_csv("train.csv", titanic_path)


def load_titanic_test(titanic_path=TITANIC_PATH):
    """Load the test split.

    Parameters
    ----------
    titanic_path : str
        Directory containing ``test.csv``.

    Returns
    -------
    pandas.DataFrame
        Contents of ``test.csv`` as parsed by ``pandas.read_csv``.
    """
    return _read_titanic_csv("test.csv", titanic_path)

# Training split: keep a copy of the target, then drop it together with the
# columns that are not used as features.
titanic_data = load_titanic_data()
titanic_labels = titanic_data["Survived"].copy()
titanic_data = titanic_data.drop(columns=["Ticket", "Name", "Survived", "Cabin"])

# Everything except the two categorical columns is treated as numeric.
# NOTE(review): this leaves PassengerId in the numeric feature set - confirm that is intended.
titanic_numerical = titanic_data.drop(columns=["Sex", "Embarked"])

# Test split: same column removal (there is no "Survived" column to drop here).
titanic_test = load_titanic_test().drop(columns=["Ticket", "Name", "Cabin"])

In [3]:
# Stand-alone one-hot encoding of the categorical columns; the cell ends with the
# shape of the encoded matrix so it is displayed.
# `imputer` is created here, but the preprocessing pipeline cell builds its own imputer.
imputer = SimpleImputer(strategy="median")
cat_attribs = ["Sex", "Embarked"]
cat_encoder = OneHotEncoder()
titanic_categorical = titanic_data.loc[:, cat_attribs]
titanic_categorical_1hot = cat_encoder.fit_transform(titanic_categorical)
titanic_categorical_1hot.shape

(891, 6)

In [4]:
# Column-name lists that the ColumnTransformer uses to route features.
cat_attribs = ["Sex", "Embarked"]
num_attribs = titanic_numerical.columns.tolist()
# Summary of dtypes and non-null counts for the training features.
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Sex          891 non-null    object 
 3   Age          714 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Fare         891 non-null    float64
 7   Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [5]:
# Numeric columns: fill missing values with the column median, then standardise.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Full preprocessing: numeric pipeline on `num_attribs`, one-hot encoding on `cat_attribs`.
# The encoder's `sparse` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed to `sparse_output`), so passing `sparse=False` fails on current releases.
# Leaving the encoder at its default and setting `sparse_threshold=0` on the
# ColumnTransformer makes the combined output a dense array on old and new versions alike.
full_pipeline = ColumnTransformer(
    [
        ("num", numerical_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ],
    sparse_threshold=0,
)


In [6]:

# Fit the preprocessing pipeline on a copy of the training features and transform
# them in the same call; the cell ends with the resulting shape so it is displayed.
titanic_prepared_data = full_pipeline.fit_transform(
    titanic_data.copy()
)
titanic_prepared_data.shape


(891, 12)

In [19]:
# Baseline 1: k-nearest neighbours with default settings, scored by 2-fold F1.
Kneighbors = KNeighborsClassifier()
Kneighbors.fit(titanic_prepared_data, titanic_labels)
scores = cross_val_score(
    Kneighbors,
    titanic_prepared_data,
    titanic_labels,
    scoring="f1",
    cv=2,
)

# Baseline 2: support vector classifier with gamma="auto", scored the same way.
svc = SVC(gamma="auto")
svc.fit(titanic_prepared_data, titanic_labels)
svc_scores = cross_val_score(
    svc,
    titanic_prepared_data,
    titanic_labels,
    scoring="f1",
    cv=2,
)

# One line per model: KNN fold scores first, then SVC fold scores.
print(scores, svc_scores, sep="\n")

[0.66463415 0.6918239 ]
[0.76300578 0.73202614]


In [24]:
# Candidate KNN settings; defined here, but no visible cell passes this grid to a search.
param_grid = [
    {
        "n_neighbors": [3, 5, 8, 12],
        "weights": ["uniform", "distance"],
        "algorithm": ["auto", "kd_tree"],
    }
]

# Candidate SVC settings: every kernel is tried with both gamma options.
param_grid_SVC = [
    {
        "kernel": ["linear", "poly", "rbf"],
        "gamma": ["scale", "auto"],
    }
]

# Grid search over the SVC settings, 2-fold cross-validation scored by F1.
grid_svc = SVC()
grid_search = GridSearchCV(
    estimator=grid_svc,
    param_grid=param_grid_SVC,
    cv=2,
    scoring="f1",
    return_train_score=True,
)
grid_search.fit(titanic_prepared_data, titanic_labels)

# Keep the best estimator found by the search for the test-set predictions.
final_model_svc = grid_search.best_estimator_

In [26]:
# Reuse the already-fitted preprocessing pipeline (transform only, no refit) on a
# copy of the test features, then predict with the tuned SVC.
titanic_prepared_test = full_pipeline.transform(
    titanic_test.copy()
)
svc_preds = final_model_svc.predict(
    titanic_prepared_test
)
print(svc_preds)


[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


In [189]:
# This cell originally exported `preds`, a name that no visible cell defines, so it
# raised NameError on a fresh kernel. It now exports `svc_preds`, the predictions
# produced by the tuned SVC.
# NOTE(review): confirm `svc_preds` is what the original `preds` referred to. A later
# cell writes "trial.csv" again.
pd.DataFrame(np.array(svc_preds)).to_csv("trial.csv")

In [27]:
# Reload the raw test split to recover the passenger identifiers.
titanic_test = load_titanic_test()
ids = titanic_test["PassengerId"]

# Pair each identifier with its prediction; both share the test frame's index.
survived_preds = pd.DataFrame({"Survived": svc_preds}, index=ids.index)
w = pd.concat([ids, survived_preds], axis=1)

# Write the labelled frame directly. Round-tripping through np.array() discarded the
# column names, so the CSV header came out as "0,1" instead of "PassengerId,Survived".
w.to_csv("trial.csv", index=False)

In [None]:
# Write `w` itself rather than np.array(w): the array conversion drops the column
# names, leaving the CSV header as "0,1".
w.to_csv("trial.csv", index=False)