# Libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier



## Reading the data

### Train data

In [2]:
# Load the Titanic training set from the Kaggle input directory.
df_train = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/train.csv')
# Preview the first two rows to check that the file was parsed as expected.
df_train.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


### Test data

In [3]:
# Load the Titanic test set (the rows we have to predict) from the Kaggle input directory.
df_test = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/test.csv')
# Preview the first two rows.
df_test.head(n=2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


### Example Submission

In [4]:
# Load the example submission shipped with the competition data; it shows the
# layout (PassengerId, Survived) that our own submission has to follow.
df_example_submission = pd.read_csv(
    filepath_or_buffer='/kaggle/input/titanic/gender_submission.csv'
)
df_example_submission.head(n=2)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1


## Random Forest Model

We'll use only 4 columns: 
  - Pclass
  - Sex
  - SibSp
  - Parch
    
These columns will be used to predict the target variable "Survived".

Target variable.

In [5]:
# Target vector: the "Survived" label of every training passenger.
y = df_train.loc[:, 'Survived']

Show the columns that we have

In [6]:
# List every column available in the training data.
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Show the columns that we'll use to predict.

In [7]:
# Show only the four predictor columns that the model will use.
df_train.loc[:, ["Pclass", "Sex", "SibSp", "Parch"]]

Unnamed: 0,Pclass,Sex,SibSp,Parch
0,3,male,1,0
1,1,female,1,0
2,3,female,0,0
3,1,female,1,0
4,3,male,0,0
...,...,...,...,...
886,2,male,0,0
887,1,female,0,0
888,3,female,1,2
889,1,male,0,0


Create binary columns for the categorical column "Sex"

In [8]:
# Predictor columns fed to the model. "Sex" is the only categorical one:
# pd.get_dummies replaces it with the indicator columns Sex_female and
# Sex_male and leaves the numeric columns unchanged.
features = ["Pclass", "Sex", "SibSp", "Parch"]
pd.get_dummies(df_train.loc[:, features])

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male
0,3,1,0,0,1
1,1,1,0,1,0
2,3,0,0,1,0
3,1,1,0,1,0
4,3,0,0,0,1
...,...,...,...,...,...
886,2,0,0,0,1
887,1,0,0,1,0
888,3,1,2,1,0
889,1,0,0,0,1


Build the feature matrices for the training and test sets.

In [9]:
# Training labels and the one-hot encoded training features.
y_train = df_train.loc[:, 'Survived']
X_train = pd.get_dummies(df_train.loc[:, features])

In [10]:
# One-hot encode the test features, then align the columns with X_train.
# get_dummies only creates columns for the categories present in the frame it
# is given, so encoding train and test separately can yield different column
# sets. Reindexing adds any column missing from the test set (filled with 0),
# drops any column the model was not trained on, and enforces the training
# column order.
X_test = pd.get_dummies(df_test[features]).reindex(
    columns=X_train.columns, fill_value=0
)

In [11]:
# Report the number of rows (passengers) in each feature matrix.
print(f"Training : {len(X_train)}")
print(f"Test : {len(X_test)}")

Training : 891
Test : 418


Create the model object.

In [12]:
# Random forest configuration:
#   n_estimators - number of trees in the forest
#   max_depth    - maximum depth allowed for each tree
#   random_state - fixed seed so that repeated runs give the same model
random_forest_classifier = RandomForestClassifier(
    random_state=1,
    max_depth=5,
    n_estimators=100,
)

In [13]:
# Train the forest on the encoded training features and their labels.
random_forest_classifier.fit(X=X_train, y=y_train)


We can use our model to predict on the test data.

In [14]:
# Predict the survival label (0 or 1) for every row of the test features,
# then display the resulting array.
predictions_made = random_forest_classifier.predict(X=X_test)
predictions_made

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

To submit the exercise we need to write a CSV file with two columns:
- The ID of the passenger
- Whether the passenger survived (1) or not (0)

In [15]:
# Passenger ids of the test set; these identify the rows of the submission.
df_test["PassengerId"]

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

With the passenger IDs and the predictions, we can create the DataFrame that will be our submission.

In [16]:
# Pair each test passenger id with its predicted label. Selecting the id
# column as a one-column frame and attaching the predictions produces the
# two-column layout (PassengerId, Survived) used for the submission.
df_submission_proposal = df_test[["PassengerId"]].assign(Survived=predictions_made)
# Preview the first five rows.
df_submission_proposal.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


Finally, we save the DataFrame with the predictions to a CSV file.

In [17]:
# Write the submission to disk; index=False keeps the row index out of the file.
df_submission_proposal.to_csv(path_or_buf='submission.csv', index=False)