#### The Titanic dataset is one of the datasets available in sklearn. You are given:
    1. A training dataset CSV file containing both the X train and Y train data.
    2. An X test file, for which you have to make and submit predictions.

#### Your task is to:
   1. Use Logistic Regression to come up with predictions.

### Import Data & Python Packages

In [13]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Load the labelled training data and the unlabelled test data, in that order.
# Each frame is copied right after reading, exactly as before.
training, testing = (
    pd.read_csv(csv_name).copy()
    for csv_name in ('training_titanic_x_y_train.csv', 'test_titanic_x_test.csv')
)

### Data Quality & Missing Value Assessment

In [14]:
# Count the missing values in each column of the training data.
training.isna().sum()

Pclass        0
Name          0
Sex           0
Age         132
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       514
Embarked      1
Survived      0
dtype: int64

In [15]:
# Preview the first five rows of the training data as loaded.
training.head(n=5)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S,0


### Assessment of the missing values
#### I'll make the following changes to the data:
           If "Age" is missing for a given row, I'll fill it with the median age.
           If "Embarked" is missing for a given row, I'll impute with "S" (the most common boarding port).
           I'll ignore "Cabin" as a variable since there are too many missing values 

In [16]:
# Impute missing values in the training data and drop the sparse 'Cabin' column:
#   * Age      -> median age of the training data
#   * Embarked -> most frequent boarding port in the training data
#
# The filled frame is assigned back rather than calling fillna(inplace=True)
# on a column selection. That chained in-place form is deprecated in pandas
# and does not update the parent frame when Copy-on-Write is enabled, which
# would silently leave the NaNs in place.
training = training.fillna({
    "Age": training["Age"].median(skipna=True),
    "Embarked": training["Embarked"].value_counts().idxmax(),
})
# errors='ignore' keeps the cell re-runnable once 'Cabin' is already gone.
training = training.drop(columns='Cabin', errors='ignore')

In [17]:
# Preview the first five rows after imputation and removal of 'Cabin'.
training.head(n=5)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,29.0,0,0,A/5 2466,8.05,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,S,0


In [18]:
# Re-count missing values per column to confirm the adjusted training data has none left.
training.isna().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
Survived    0
dtype: int64

### Removing unnecessary variables 
    Name, Fare and Ticket are additional variables that are assumed not to affect the output, so they are dropped.

In [19]:
# Remove the three columns that are not used as model features, in one call.
training.drop(columns=['Name', 'Ticket', 'Fare'], inplace=True)

### Adjusting values of training data
    Replacing the values of the 'Sex' column with 1 and 0 for male and female respectively
    Replacing the values of the 'Embarked' column with 1, 2 and 3 for the S, C and Q ports respectively

In [20]:
# Encode the categorical columns of the training data as integers, using the
# same codes as before: female=0, male=1 and S=1, C=2, Q=3.
#
# Series.map builds a new column from the lookup instead of writing integers
# into a string column through .loc. The .loc form leaves an object-dtype
# column of mixed str/int values, and the stricter string dtype in newer
# pandas releases may reject it.
category_codes = {
    'Sex': {'female': 0, 'male': 1},
    'Embarked': {'S': 1, 'C': 2, 'Q': 3},
}
for column, codes in category_codes.items():
    # Labels without a code (including already-encoded integers when the cell
    # is re-run) are passed through unchanged, as the .loc version did.
    training[column] = training[column].map(lambda label: codes.get(label, label))

In [21]:
# Preview the first five rows of the training data after encoding.
training.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,Survived
0,2,0,29.0,1,0,1,1
1,3,1,29.0,0,0,1,0
2,2,1,39.0,0,0,1,0
3,3,0,29.0,0,4,1,0
4,3,1,25.0,0,0,1,0


### Making necessary adjustments in the testing data as well

In [22]:
# Apply to the test data the same preparation used for the training data:
# impute Age/Embarked, drop the unused columns, integer-encode Sex/Embarked.
#
# NOTE(review): as in the original cell, the imputation values (median age,
# most frequent port) are computed from the test data itself. Consider reusing
# the training-set statistics so both sets are transformed identically.
#
# The filled frame is assigned back rather than calling fillna(inplace=True)
# on a column selection; that chained in-place form is deprecated in pandas
# and does not update the parent frame when Copy-on-Write is enabled.
testing = testing.fillna({
    "Age": testing["Age"].median(skipna=True),
    "Embarked": testing['Embarked'].value_counts().idxmax(),
})
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
testing = testing.drop(columns=['Cabin', 'Name', 'Fare', 'Ticket'], errors='ignore')

# Same integer codes as the training data: female=0, male=1 and S=1, C=2, Q=3.
# Series.map avoids writing integers into string columns through .loc.
category_codes = {
    'Sex': {'female': 0, 'male': 1},
    'Embarked': {'S': 1, 'C': 2, 'Q': 3},
}
for column, codes in category_codes.items():
    # Labels without a code are passed through unchanged.
    testing[column] = testing[column].map(lambda label: codes.get(label, label))

In [26]:
# Preview the first five rows of the prepared test data.
testing.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,2,1,8.0,1,1,1
1,1,0,49.0,0,0,1
2,3,1,27.0,0,0,3
3,2,0,24.0,2,1,1
4,1,1,36.0,0,0,1


### Feature scaling

In [27]:
# Build the model inputs and standardise the features.
#
# The last column of `training` is the target ('Survived'); every column
# before it is a feature. `testing` holds the feature columns only.
assert list(testing.columns) == list(training.columns[:-1]), \
    "training and testing feature columns must match in name and order"

tr = training.values
ts = testing.values
x_train = tr[:, :-1]
y_train = tr[:, -1].astype('int')
x_test = ts

# StandardScaler.transform returns a NEW array and leaves its argument
# untouched. The previous version discarded those results, so the model was
# trained and evaluated on unscaled features. Keep the scaled arrays, fitting
# the scaler on the training features only and reusing it for the test set.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Display the scaled test features.
x_test

array([[-0.35668092,  0.75126747, -1.69153536,  0.43682178,  0.69411756,
        -0.56819974],
       [-1.56002866, -1.33108386,  1.52488915, -0.48951775, -0.47676762,
        -0.56819974],
       [ 0.84666683,  0.75126747, -0.20099717, -0.48951775, -0.47676762,
         2.68980391],
       ...,
       [-1.56002866, -1.33108386, -0.98549096,  0.43682178, -0.47676762,
         1.06080208],
       [ 0.84666683,  0.75126747,  1.05419288, -0.48951775, -0.47676762,
        -0.56819974],
       [-0.35668092,  0.75126747,  0.54427192, -0.48951775,  1.86500274,
        -0.56819974]])

### Training the model

In [49]:
# Fit a logistic-regression classifier on the training features and labels.
# The 'saga' solver shuffles the data, so a fixed random_state is set to make
# the fitted model (and the submitted predictions) reproducible across runs.
# max_iter is raised above the default to give the solver room to converge.
clf = LogisticRegression(solver='saga', random_state=42, max_iter=1000)
clf.fit(x_train, y_train)



LogisticRegression(solver='saga')

In [50]:
# Predict the class label for every row of the test features.
predicted = clf.predict(X=x_test)

In [51]:
# Write the predictions to the submission file, one integer label per line.
np.savetxt('titanic predicted.csv', predicted, fmt='%.0f', delimiter=',')