# DATA PRE PROCESSING PART

Importing necessary libraries

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Importing Data 

In [37]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

Merging train and test data in single dataframe for preprocessing

In [38]:
data=train.append(test,ignore_index=True)


In [39]:
data=data.drop(['Name','Ticket','Cabin'], axis=1)

finding null values

In [40]:
null_values=data.isnull().sum() 

dealing with missing values (Survived data should be not touched)

In [41]:
#Some values of Age is missing for which previous value of data is used to fill
data['Age'] = data['Age'].fillna(method='ffill')

In [42]:
#Only 1 value of fare is missing so we fill it by mode
data['Fare'] = data['Fare'].fillna(data['Fare'].mode()[0])

In [43]:
#Only 2 values of Embarked is missing so we fill it by mode
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy='most_frequent')
data[['Embarked']]=imputer.fit_transform(data[['Embarked']])

Rechecking for null values (ignore survived column)

In [44]:
null_values_rechecked=data.isnull().sum()

Making Matrices from dataset

In [45]:
#Training set data
Train_id=data.iloc[0:891,0]
data_1 = data.drop('Survived', axis=1)
X_train = data_1.iloc[0:891,0:9].values
y_train = data.iloc[0:891,1].values

In [46]:
#Testing set data
Test_id=data.iloc[891:1310,0]
Test_id=Test_id.reset_index(drop=True)
X_test = data_1.iloc[891:1310,0:9].values

Encoding Categorical Data

In [47]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [48]:
#making label encoder
labelencoder = LabelEncoder()

In [49]:
#Training Data
X_train[:,2]= labelencoder.fit_transform(X_train[:,2])
X_train[:,-1]= labelencoder.fit_transform(X_train[:,-1])

In [50]:
#Testing Data
X_test[:,2]= labelencoder.fit_transform(X_test[:,2])
X_test[:,-1]= labelencoder.fit_transform(X_test[:,-1])

In [51]:
# #making Column Transformer
# from sklearn.compose import ColumnTransformer
# #For Sex Column
# ct = ColumnTransformer([('Sex', OneHotEncoder(),[2])],remainder="passthrough")
# X_train=ct.fit_transform(X_train)
# X_test=ct.fit_transform(X_test)
# #For Embarked Column
# ct1 = ColumnTransformer([('Embarked', OneHotEncoder(),[7])],remainder="passthrough")
# X_train=ct1.fit_transform(X_train)
# X_test=ct1.fit_transform(X_test)

Feature Scaling

In [52]:
from sklearn.preprocessing import StandardScaler
sc_X= StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Implementation of Kernel SVM

building our model

In [53]:
from sklearn.ensemble import RandomForestClassifier

In [117]:
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
a=classifier.fit(X_train, y_train)

In [118]:
a.score(X_train,y_train)

1.0

predicting values of test data

In [119]:
y_pred_Forest=classifier.predict(X_test)

Checking accuracy of model through cross validation

In [120]:
from sklearn.model_selection import cross_val_score

In [121]:
accuracies_Forest=cross_val_score(estimator=classifier, X=X_train,y=y_train,cv=15)

In [122]:
accuracies_Forest.mean()

0.821713747645951

In [99]:
accuracies_Forest.std()

0.05133371456728888

Choosing right hyperparameter through GridSearch

In [100]:
# from sklearn.model_selection import GridSearchCV
# parameters=[{'C':[1,2,10,100,1000],'kernel':['rbf'],'degree':[1,2,3,4]},
#            {'C':[1,2,10,100,1000],'kernel':['poly'],'degree':[1,2,3,4]}]
           
# grid_search = GridSearchCV(estimator=classifier,param_grid=parameters,scoring='accuracy',cv=15)
# grid_search=grid_search.fit(X_train,y_train)
# best_accuracy_SVM = grid_search.best_score_
# best_parameters_SVM = grid_search.best_params_

In [101]:
# best_accuracy_SVM

In [102]:
# best_parameters_SVM

# Backward Elimination to improve model

In [103]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
sbs = SFS(RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0), 
          k_features=8, 
          forward=False, 
          floating=False,
          scoring='accuracy',
          cv=15)
sbs.fit(X_train, y_train)
sbs.k_feature_names_

('0', '1', '2', '3', '4', '5', '6', '7')

# Storing Result in csv file

In [104]:
result=pd.DataFrame(y_pred_Forest,columns=['Survived'])

In [105]:
 result=result.join(Test_id)

In [106]:
columns_titles = ["PassengerId","Survived"]
result=result.reindex(columns=columns_titles,copy=False)

In [107]:
result=result.astype(int)

In [108]:
result.to_csv('resultsForest.csv',index=False)