In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.base import BaseEstimator

# Show up to 500 columns when rendering wide DataFrames.
# set_option has to be *called*: assigning an attribute on the function object
# leaves the pandas option untouched.
pd.set_option("display.max_columns", 500)



First ML exercise with the Titanic dataset - a grim classification model for a disaster

In [2]:
def standardize_features(dataframe: pd.DataFrame, feature_names: list):
  """Standardize numerical columns with sklearn's StandardScaler.

  Each column named in ``feature_names`` is scaled on its own and written
  back to the frame.

  Params:
    dataframe: source frame; it is never modified (a new frame is returned
      as soon as at least one column is processed).
    feature_names: iterable of column names to standardize.

  Returns:
    A DataFrame with the same index as ``dataframe``. Every standardized
    column is moved to the end of the column order, in the order given.
  """
  from sklearn.preprocessing import StandardScaler
  df = dataframe
  for column_name in feature_names:
    raw_data = df[column_name].to_numpy().reshape(-1, 1)
    # A fresh scaler per column: statistics are never shared between columns.
    standardized = StandardScaler().fit_transform(raw_data)
    # drop() returns a new frame, so the caller's frame stays untouched.
    df = df.drop(columns=column_name)
    # Assign positionally instead of joining on a new RangeIndex; an index
    # join would produce NaN for every row whose label is not 0..n-1.
    df[column_name] = standardized.ravel()
  return df


def fill_missing_values(dataframe: pd.DataFrame, strategy_columns: dict, fill_value='unknown'):
  """Fill missing values column by column with sklearn's SimpleImputer.

  Params:
    dataframe: source frame; it is never modified (a new frame is returned
      as soon as at least one column is processed).
    strategy_columns: dict mapping an imputer strategy name (str) to an
      iterable of the column names to impute with that strategy.
    fill_value: forwarded unchanged to every SimpleImputer.

  Returns:
    A DataFrame with the same index as ``dataframe``. Every imputed column
    is moved to the end of the column order, in processing order.
  """
  from sklearn.impute import SimpleImputer
  df = dataframe
  for imputer_strategy, column_names in strategy_columns.items():
    for column_name in column_names:
      imputer = SimpleImputer(strategy=imputer_strategy, fill_value=fill_value)
      incomplete = df[column_name].to_numpy().reshape(-1, 1)
      filled = imputer.fit_transform(incomplete)
      # drop() returns a new frame, so the caller's frame stays untouched.
      df = df.drop(columns=column_name)
      # Assign positionally instead of joining on a new RangeIndex; an index
      # join would produce NaN for every row whose label is not 0..n-1.
      df[column_name] = filled.ravel()
  return df


def cross_validate_models(models, X, y, scoring_criteria="accuracy", cv=10):
  """Cross-validate several models and summarize their scores.

  Params:
    models: iterable of estimators accepted by ``cross_val_score``.
    X, y: features and target passed straight to ``cross_val_score``.
    scoring_criteria: value for the ``scoring`` argument of ``cross_val_score``.
      When it is a string naming a squared-error metric (contains "squared"
      but not "root"), the fold scores are converted to their root form.
    cv: value for the ``cv`` argument of ``cross_val_score``.

  Returns:
    A list with one ``cross_validation(model, mean, stdev)`` namedtuple per
    model, in input order.
  """
  from sklearn.model_selection import cross_val_score
  from collections import namedtuple
  # Built once per call instead of once per model.
  cross_validation = namedtuple(
      "cross_validation", ["model", "mean", "stdev"])
  # Only string scorer names can be inspected; callables are passed through.
  needs_root = (
      isinstance(scoring_criteria, str)
      and "squared" in scoring_criteria
      and "root" not in scoring_criteria  # already a root metric: no second sqrt
  )
  results = []
  for model in models:
    scores = cross_val_score(model, X, y, scoring=scoring_criteria, cv=cv)
    if needs_root:
      # sklearn's "neg_*" scorers return negated errors; taking the square
      # root of those negatives directly would yield NaN.
      scores = np.sqrt(np.abs(scores))
    results.append(cross_validation(model, scores.mean(), scores.std()))
  return results


def lower_case_columns(df: pd.DataFrame):
  """Lower-case the string column labels of ``df`` in place and return it.

  Params:
    df: frame whose columns are renamed; it is mutated, as before.

  Returns:
    The same ``df`` object, to allow ``frame = lower_case_columns(frame)``.

  Non-string labels (e.g. integers) are left unchanged instead of raising.
  All labels are renamed in a single pass rather than one rename per column.
  """
  df.rename(
      columns=lambda label: label.lower() if isinstance(label, str) else label,
      inplace=True)
  return df


def single_score(model, X_train, Y_Train, X_test, Y_Test):
  """Fit ``model`` on the training split and return its score on the test split.

  The value returned is whatever ``model.fit(...).score(...)`` yields; the
  model is fitted as a side effect.
  """
  return model.fit(X_train, Y_Train).score(X_test, Y_Test)

def kaggle_predictions_csv(X_test, predictions: np.ndarray, feature_name: str, output="my_predictions"):
  """Write a Kaggle submission CSV pairing identifiers with predictions.

  Params:
    X_test: identifier column(s), a pandas Series or DataFrame
      (e.g. the passenger ids).
    predictions: predicted values, one per row of ``X_test``, same order.
    feature_name: header of the prediction column.
    output: path of the file to write, without the ".csv" extension.

  Raises:
    ValueError: if the number of predictions differs from the rows of X_test.

  The CSV is always written. The browser download is only triggered when the
  ``google.colab`` package is importable; elsewhere it is skipped.
  """
  result_df = pd.DataFrame(X_test)
  pred_feature = pd.DataFrame(predictions, columns=[feature_name])
  if len(pred_feature) != len(result_df):
    raise ValueError(
        f"got {len(pred_feature)} predictions for {len(result_df)} rows")
  # Pair rows by position: predictions carry no index of their own, so an
  # index-based join would misalign them whenever X_test is not 0..n-1.
  pred_feature.index = result_df.index
  result_df = pd.concat([result_df, pred_feature], axis=1)
  csv_path = f"{output}.csv"
  result_df.to_csv(csv_path, index=False)
  try:
    from google.colab import files
  except ImportError:
    # Not running in Colab: the file on disk is the result.
    return
  files.download(csv_path)


from sklearn.base import BaseEstimator, TransformerMixin



class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the ``k`` highest-scoring feature columns.

    Adapted from a book example (per the original author's note).
    ``feature_importances`` holds one score per feature and ``k`` is the
    number of features to keep.
    """

    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k

    def indices_of_top_k(self, arr, k):
        """Return the positions of the ``k`` largest values of ``arr``, sorted ascending."""
        scores = np.array(arr)
        partition_order = np.argpartition(scores, -k)
        top_positions = partition_order[-k:]
        return np.sort(top_positions)

    def fit(self, X, y=None):
        """Compute and store the selected column positions; ``X`` and ``y`` are not read."""
        top_positions = self.indices_of_top_k(self.feature_importances, self.k)
        self.feature_indices_ = top_positions
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` via ``X[:, indices]`` indexing."""
        selected = self.feature_indices_
        return X[:, selected]


In [3]:
# Google Drive folder (mounted in Colab) that holds the Kaggle Titanic CSVs.
TITANIC_DIR = '/content/drive/MyDrive/Python data/csv/Kaggle/TitanicChallenge'

titanic = pd.read_csv(f'{TITANIC_DIR}/train.csv')
testdata = pd.read_csv(f'{TITANIC_DIR}/test.csv')
# Take the ids from the frame already in memory instead of parsing test.csv a
# second time; copy so later in-place edits of testdata cannot affect them.
passengerid = testdata['PassengerId'].copy()

In [4]:
# Preview the first rows of the raw training data.
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Normalize the column labels of both splits to lower case so later cells can
# refer to them uniformly.
testdata = lower_case_columns(testdata)
titanic = lower_case_columns(titanic)

In [6]:
# Remove the columns that are not fed to the models, in both splits.
# The frames are modified in place, so this cell cannot be re-run on its own.
unused_columns = ["name", "passengerid", "ticket", "cabin"]
for frame in (titanic, testdata):
  frame.drop(columns=unused_columns, inplace=True)

In [7]:
# Check dtypes and non-null counts of the training data before imputation.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [8]:
# Same check for the test data before imputation.
testdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    418 non-null    int64  
 1   sex       418 non-null    object 
 2   age       332 non-null    float64
 3   sibsp     418 non-null    int64  
 4   parch     418 non-null    int64  
 5   fare      417 non-null    float64
 6   embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [9]:
# Imputation plan: strategy name -> columns filled with that strategy.
fill_params = {
    "median": ["age", "fare"],
    "most_frequent": ["embarked"],
}
# NOTE(review): each split is imputed with statistics fitted on itself.
titanic, testdata = (
    fill_missing_values(frame, fill_params) for frame in (titanic, testdata)
)


In [10]:
# Confirm that no nulls remain in the training data after imputation.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   sibsp     891 non-null    int64  
 4   parch     891 non-null    int64  
 5   age       891 non-null    float64
 6   fare      891 non-null    float64
 7   embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [11]:
# Confirm that no nulls remain in the test data after imputation.
testdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    418 non-null    int64  
 1   sex       418 non-null    object 
 2   sibsp     418 non-null    int64  
 3   parch     418 non-null    int64  
 4   age       418 non-null    float64
 5   fare      418 non-null    float64
 6   embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [12]:
# One-hot encode the categorical columns with exactly the same options for
# both splits. The empty prefix and separator make each dummy column carry the
# bare category value as its name (e.g. "male", "Q", "S", "2", "3").
dummy_options = dict(
    prefix='',
    prefix_sep='',
    columns=['sex', 'embarked', 'pclass'],
    drop_first=True,
)
titanic = pd.get_dummies(titanic, **dummy_options)
testdata = pd.get_dummies(testdata, **dummy_options)

# Record was set with drop_first=True, but since RF is not affected by collinearity, drop_first=False is also worth trying (the call above currently uses True).


In [13]:
# Trying one hot encoder instead, similar to the HOML author
 

In [14]:
# Relative frequency of each `parch` value.
# NOTE(review): this result is not displayed because it is not the last expression of the cell.
titanic.parch.value_counts(normalize=True)

# NOTE(review): `see` re-reads the raw training CSV; it is not used anywhere in the visible notebook.
see = pd.read_csv('/content/drive/MyDrive/Python data/csv/Kaggle/TitanicChallenge/train.csv')


In [15]:
# Preview the training data after imputation and one-hot encoding.
titanic.head()

Unnamed: 0,survived,sibsp,parch,age,fare,male,Q,S,2,3
0,0,1,0,22.0,7.25,1,0,1,0,1
1,1,1,0,38.0,71.2833,0,0,0,0,0
2,1,0,0,26.0,7.925,0,0,1,0,1
3,1,1,0,35.0,53.1,0,0,1,0,0
4,0,0,0,35.0,8.05,1,0,1,0,1


In [16]:
from sklearn.naive_bayes import MultinomialNB as NB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier  as KNC
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.svm import SVC
from xgboost import XGBClassifier as XGB

# Candidate classifiers, one per line.
model1 = NB()
model2 = LR(penalty="l2", max_iter=1000, solver='lbfgs')
model3 = RFC(n_estimators=200, random_state=0, min_samples_leaf=20)
model4 = KNC(n_neighbors=7, weights="distance")
model5 = SGD(max_iter=1000, random_state=42)
model6 = SVC(kernel="rbf", degree=3, probability=True)
model7 = XGB()
# model7 (XGBoost) is defined above but is not part of this list.
models = [model1, model2, model3, model4, model5, model6]


In [17]:
# Separate the target column from the predictor columns.
Y = titanic["survived"]
X = titanic.drop(columns="survived")

In [18]:
# Fix the seed so the hold-out split, and every score computed from it, is the
# same on each Restart & Run All; without it the split changes on every run.
x_train, x_test, y_train, y_test = split(X, Y, random_state=42)

In [19]:
from sklearn.ensemble import VotingClassifier

# Ensemble members as (name, estimator) pairs, combined with soft voting.
ensemble_members = [
    ('log reg', model2),
    ('Rand forest', model3),
    ("SVC", model6),
    ('XGBoost', model7),
]
voter = VotingClassifier(estimators=ensemble_members, voting="soft")

Blending

In [26]:
# Hold-out accuracy check of a single model (model2, the logistic regression).
selected_model = model2
single_score(selected_model, x_train, y_train, x_test, y_test)

0.8116591928251121

PCA analysis

In [21]:
from sklearn.decomposition import PCA

def find_n_components(X_train, variance_perc):
    """Return how many leading principal components reach a variance share.

    Params:
        X_train: data the PCA is fitted on.
        variance_perc: target cumulative explained-variance ratio (e.g. 0.95).

    Returns:
        The smallest number of leading components whose cumulative explained
        variance ratio is >= ``variance_perc``. If the target is never
        reached (floating-point shortfall, or a target above the total), the
        full number of components is returned.
    """
    pca = PCA()
    pca.fit(X_train)
    cumsum = np.cumsum(pca.explained_variance_ratio_)
    reached = cumsum >= variance_perc
    if not reached.any():
        # argmax of an all-False mask is 0, which would wrongly report that a
        # single component is enough.
        return len(cumsum)
    return int(np.argmax(reached)) + 1

# Number of principal components needed to reach 95% cumulative explained variance on the training split.
find_n_components(x_train, 0.95)

2

In [27]:
def drop_dimensions(x_train, preserved_variance):
  """Fit a PCA on ``x_train`` and return the transformed data.

  ``preserved_variance`` is passed to PCA as ``n_components``; it is meant as
  the share of variance to retain, the rest being discarded. The explained
  variance ratio of each kept component is printed.
  """
  reducer = PCA(n_components=preserved_variance)
  reduced = reducer.fit_transform(x_train)
  print(reducer.explained_variance_ratio_)
  return reduced


In [28]:
# Reduce the training features with PCA, keeping 95% of the variance, and show the result.
X_reduced = drop_dimensions(x_train, 0.95)
X_reduced

[0.94001096 0.05910435]


array([[ 37.44541564,  -2.60763581],
       [-24.35063082,  -0.52097901],
       [ 41.06259435, -12.50474879],
       ...,
       [-25.38192613, -11.51038973],
       [-25.15164859,  -0.49367349],
       [-24.08601889,   2.47391273]])

In [30]:
def pca_frame(x_train):
  """Return the PCA component loadings of ``x_train`` as a labelled DataFrame.

  Params:
    x_train: DataFrame of features; its column labels become the columns of
      the result.

  Returns:
    A DataFrame with one row per fitted component ("PCA-1", "PCA-2", ...) and
    one column per original feature.
  """
  pca = PCA()
  # Only the fitted components are needed, not the transformed data.
  pca.fit(x_train)
  # Derive the row labels from the fitted model rather than assuming exactly
  # nine components.
  indexes = [f"PCA-{n}" for n in range(1, len(pca.components_) + 1)]
  return pd.DataFrame(pca.components_, columns=x_train.columns, index=indexes)

#pca_frame(x_train)

In [31]:
# Preview the first rows of the training split.
x_train.head()

Unnamed: 0,sibsp,parch,age,fare,male,Q,S,2,3
201,8,2,28.0,69.55,1,0,1,0,1
274,0,0,28.0,7.75,0,1,0,0,1
385,0,0,18.0,73.5,1,0,1,1,0
778,0,0,28.0,7.7375,1,1,0,0,1
840,0,0,20.0,7.925,1,0,1,0,1


Cross-validation

In [32]:
#cross_validate_models([model6], X, Y, cv=10)

In [33]:
# Train the voting ensemble on the full labelled data, then predict the test set.
# (Alternative kept for reference: best_model_so_far = rnd_search.best_estimator_)
predictions = voter.fit(X, Y).predict(testdata)

In [34]:
# Write the passenger ids and predictions to new_predictions.csv and download it.
kaggle_predictions_csv(passengerid, predictions, feature_name="Survived", output="new_predictions")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space sampled by the randomized search.
param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=10),
        'min_samples_leaf': randint(low=1, high=30),
        'min_impurity_decrease': [0.0, 0.1],
        'random_state': [42],
    }

model = model3
# Seed the search itself: the 'random_state' entry above only seeds each
# candidate model, not the sampling of candidates, so without this the best
# estimator changes from run to run.
rnd_search = RandomizedSearchCV(model, param_distributions=param_distribs,
                                n_iter=100, cv=10, scoring='accuracy',
                                random_state=42)
rnd_search.fit(X, Y)

In [None]:
# Keep the best estimator found and list the mean CV score of every candidate.
best_model_so_far = rnd_search.best_estimator_
cvres = rnd_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(mean_score, params)

In [None]:
# Display the best estimator found by the randomized search.
best_model_so_far

In [None]:
# Hold-out accuracy check of the best estimator from the randomized search.
selected_model = best_model_so_far
single_score(selected_model, x_train, y_train, x_test, y_test)

# **TITANIC SPACE CHALLENGE**

In [None]:
space = pd.read_csv('/content/drive/MyDrive/Python data/csv/Kaggle/SpaceTitanic challenge/train.csv')
# The test frame must come from the test file; it used to re-read train.csv.
# NOTE(review): assumes test.csv sits next to train.csv in this folder — confirm.
spacetest = pd.read_csv('/content/drive/MyDrive/Python data/csv/Kaggle/SpaceTitanic challenge/test.csv')

In [None]:
# Predictors for the second challenge: every column except the target.
# NOTE(review): this rebinds X, which held the Titanic features above.
X = space.drop(columns="Transported")

In [None]:
# Check dtypes and non-null counts of the new feature frame.
X.info()