In [1]:
import zipfile
import pandas as pd
import numpy as np
import pickle
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn.preprocessing as pre
from sklearn import preprocessing

In [2]:
# Create the LogisticRegression instance with no arguments, so every
# hyperparameter keeps its library default.
model = LogisticRegression()

In [3]:
# Extract the downloaded archive into the inputs folder. The context manager
# closes the archive even if extraction raises part-way through.
with zipfile.ZipFile('C:/Users/Josep/Python/titanic/inputs/all.zip') as zip_ref:
    zip_ref.extractall(r"C:/Users/Josep/Python/titanic/inputs/")

In [56]:
# Load the training CSV; later cells take the `Survived` column from it as
# the value to predict.
df=pd.read_csv('C:/Users/Josep/Python/titanic/inputs/train.csv')

In [57]:
# Encode the two categorical columns as digit strings (a later cell casts them
# to float). The mapping is scoped per column so that a cell in any other
# column that happens to equal 'C', 'Q', 'S', 'male' or 'female' is left
# untouched.
df = df.replace({
    'Sex': {'male': '2', 'female': '1'},
    'Embarked': {'C': '1', 'Q': '2', 'S': '3'},
})

# Use PassengerId as the row index (reassigned rather than modified in place).
df = df.set_index('PassengerId')

In [58]:
# Working copy without the free-text columns.
df2 = df.drop(columns=['Name', 'Ticket', 'Cabin'])

In [59]:
# converts all columns to float type data

def convert(df,df_cols):
    """Cast each named column of ``df`` to float, modifying ``df`` itself.

    df      -- DataFrame whose columns are reassigned.
    df_cols -- iterable of column labels to cast.

    Returns None.
    """
    for label in df_cols:
        as_float = df[label].astype(float)
        df[label] = as_float

# Every remaining column, including the target, is cast to float.
cols_to_convert = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked', 'Survived',
]

convert(df2, cols_to_convert)

In [8]:
#replaces missing values with the mean score in their column

def fill_blanks(df,df_cols):
    """Fill missing values in each named column with that column's mean.

    df      -- DataFrame whose columns are reassigned.
    df_cols -- iterable of column labels to fill.

    Returns None; ``df`` itself is modified.
    """
    for label in df_cols:
        column = df[label]
        df[label] = column.fillna(column.mean())

# Columns whose gaps are filled with the column mean.
cols_to_fill_blanks = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked', 'Survived',
]

fill_blanks(df2, cols_to_fill_blanks)

In [9]:
#normalises values to figures between 0 and 1

# One shared scaler object. scale() re-fits it for every column, so after the
# loop it only holds the fit of the last column processed.
scaler=pre.MinMaxScaler()

def scale(df,df_cols):
    """Min-max scale each named column of ``df``, modifying ``df`` itself.

    df      -- DataFrame whose columns are rescaled.
    df_cols -- iterable of column labels to rescale.

    Returns None. Each column is fitted and transformed independently with
    the module-level ``scaler``.
    """
    for col in df_cols:
        # The scaler is fed a 2-D (n, 1) array, hence the reshape.
        scaled = scaler.fit_transform(df[col].values.reshape(-1,1))
        # Write back to the frame that was passed in (previously the global
        # df2 was written regardless of the argument) and flatten the (n, 1)
        # result to one dimension before assigning it as a column.
        df[col] = scaled.ravel()

# Columns to rescale with the min-max scaler.
cols_to_scale = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked', 'Survived',
]

scale(df2, cols_to_scale)

In [10]:
# Quick look at the pairwise Spearman correlations between all columns of df2
# (the features and the Survived target).
df2.corr(method='spearman')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
Survived,1.0,-0.339668,-0.543351,-0.039109,0.088879,0.138266,0.323736,-0.164621
Pclass,-0.339668,1.0,0.135775,-0.308875,-0.043019,-0.022801,-0.688032,0.097063
Sex,-0.543351,0.135775,1.0,0.067809,-0.195204,-0.254512,-0.259593,0.120217
Age,-0.039109,-0.308875,0.067809,1.0,-0.147035,-0.21729,0.118847,-0.029127
SibSp,0.088879,-0.043019,-0.195204,-0.147035,1.0,0.450014,0.447113,0.013582
Parch,0.138266,-0.022801,-0.254512,-0.21729,0.450014,1.0,0.410074,0.030372
Fare,0.323736,-0.688032,-0.259593,0.118847,0.447113,0.410074,1.0,-0.079909
Embarked,-0.164621,0.097063,0.120217,-0.029127,0.013582,0.030372,-0.079909,1.0


In [11]:
# Separate the target column (y) from the predicting features (x).
y = df2['Survived']
x = df2.drop(columns=['Survived'])

In [12]:
# Hold out part of the data so the model is scored on rows it was not trained
# on. random_state pins the shuffle so that restarting the kernel and running
# all cells reproduces the same split and therefore the same scores.
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42)

In [13]:
# Fit the logistic regression on the training split. `attempt` is whatever
# fit() returns; in scikit-learn that is the estimator itself, so it should be
# the same object as `model`.
attempt=model.fit(x_train, y_train)

In [14]:
# Predict labels for the held-out split with the fitted model. The model is
# still only in memory here; it is pickled in a later cell.
result=attempt.predict(x_test)

In [15]:
# Put the actual labels and the model's predictions side by side in one
# frame; the row index is the one carried by y_test.
y_df = y_test.to_frame(name='actual_result')
y_df['prediction'] = result

In [16]:
# Boolean Series: True where the prediction equals the actual result.
successes = y_df['actual_result'].eq(y_df['prediction'])

# Count of right vs. wrong predictions.
successes.value_counts()

True     171
False     52
dtype: int64

In [18]:
# Save the fitted model to disk and read it straight back. The `with` blocks
# close both file handles as soon as the dump/load finishes; previously the
# handles returned by open() were never closed explicitly.
file_name='titanic_predictor.sav'
with open(file_name,'wb') as model_file:
    pickle.dump(attempt,model_file)

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that this notebook wrote itself.
with open(file_name,'rb') as model_file:
    predictor=pickle.load(model_file)

## Actual test prediction

In [73]:
# Load the test CSV, i.e. the rows whose predictions get submitted to Kaggle.
# NOTE(review): this rebinds `df`, so the training frame loaded earlier is no
# longer reachable under that name.
df=pd.read_csv('C:/Users/Josep/Python/titanic/inputs/test.csv')

In [74]:
# Encode the two categorical columns as digit strings (a later cell casts them
# to float). The mapping is scoped per column so that a cell in any other
# column that happens to equal 'C', 'Q', 'S', 'male' or 'female' is left
# untouched.
df = df.replace({
    'Sex': {'male': '2', 'female': '1'},
    'Embarked': {'C': '1', 'Q': '2', 'S': '3'},
})

# Use PassengerId as the row index (reassigned rather than modified in place).
df = df.set_index('PassengerId')

In [75]:
# Working copy without the free-text columns.
df2 = df.drop(columns=['Name', 'Ticket', 'Cabin'])

In [76]:
# converts all columns to float type data

def convert(df,df_cols):
    """Cast each named column of ``df`` to float, modifying ``df`` itself.

    df      -- DataFrame whose columns are reassigned.
    df_cols -- iterable of column labels to cast.

    Returns None.
    """
    for label in df_cols:
        as_float = df[label].astype(float)
        df[label] = as_float

# Every feature column is cast to float.
cols_to_convert = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked',
]

convert(df2, cols_to_convert)

In [77]:
#replaces missing values with the mean score in their column

def fill_blanks(df,df_cols):
    """Fill missing values in each named column with that column's mean.

    df      -- DataFrame whose columns are reassigned.
    df_cols -- iterable of column labels to fill.

    Returns None; ``df`` itself is modified.
    """
    for label in df_cols:
        column = df[label]
        df[label] = column.fillna(column.mean())

# Columns whose gaps are filled with the column mean.
cols_to_fill_blanks = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked',
]

fill_blanks(df2, cols_to_fill_blanks)

In [78]:
#normalises values to figures between 0 and 1

# One shared scaler object. scale() re-fits it for every column, so after the
# loop it only holds the fit of the last column processed.
# NOTE(review): this fits the scaler on the test file's own values, so the
# features given to the model here are not scaled with the same min/max as the
# rows it was trained on -- confirm whether that is intended.
scaler=pre.MinMaxScaler()

def scale(df,df_cols):
    """Min-max scale each named column of ``df``, modifying ``df`` itself.

    df      -- DataFrame whose columns are rescaled.
    df_cols -- iterable of column labels to rescale.

    Returns None. Each column is fitted and transformed independently with
    the module-level ``scaler``.
    """
    for col in df_cols:
        # The scaler is fed a 2-D (n, 1) array, hence the reshape.
        scaled = scaler.fit_transform(df[col].values.reshape(-1,1))
        # Write back to the frame that was passed in (previously the global
        # df2 was written regardless of the argument) and flatten the (n, 1)
        # result to one dimension before assigning it as a column.
        df[col] = scaled.ravel()

# Columns to rescale with the min-max scaler.
cols_to_scale = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked',
]

scale(df2, cols_to_scale)

In [79]:
# Predict survival for the test rows with the in-memory fitted model
# (`attempt`); the `predictor` object reloaded from the pickle is not used here.
result=attempt.predict(df2)

In [80]:
# Build the submission frame: a single integer `Survived` column that shares
# df2's row index. A new frame is created instead of aliasing df2 and dropping
# its feature columns in place, so df2 stays intact and both this cell and the
# predict cell can be re-run.
output = pd.Series(result, index=df2.index, name='Survived').astype(int).to_frame()

In [82]:
# Write the predictions to CSV for submission.
output.to_csv('C:/Users/Josep/Python/titanic/predictions.csv')