In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization
import os
# Show the files available in the input directory so the dataset file names are visible.
print(os.listdir("../input"))


**Importing the training dataset**

In [None]:
# Load the training data from the input directory into a DataFrame.
train = pd.read_csv("../input/train_V2.csv")

**Printing the first 5 rows of the dataset**

In [None]:
# Preview the top of the frame (head() returns the first 5 rows by default).
train.head()

**Show dataset information**

In [None]:
# Print a concise summary of the training frame.
train.info()

**Number of non-null values in each column of the 'train' dataset.**

In [None]:
# DataFrame.count() reports the number of non-null entries per column,
# not a single row total.
print(train.count())

**Columns/Features of the dataset**

In [None]:
# Display the column labels of the training frame.
train.columns

**First we will drop the columns Id, groupId, matchId and matchType. Id, groupId and matchId don't affect the result (winPlacePerc), so they don't concern us; matchType is dropped in the same step.**

In [None]:
# Remove the identifier columns and matchType from the training frame.
train = train.drop(columns=['Id', 'groupId', 'matchId', 'matchType'])

**Visualization of correlation between dataset columns.**

In [None]:
# Pairwise correlation between the remaining columns.
train.corr()

In [None]:
# Draw the correlation matrix as a heatmap on a 14x10 inch figure.
fig, ax = plt.subplots(figsize=(14, 10))

ax = sns.heatmap(train.corr(), ax=ax)

In [None]:
# Keep the correlation matrix in a variable so it can be inspected further below.
new = train.corr()

In [None]:
# Confirm what kind of object corr() returned.
print(type(new))

**Select the last row of the correlation matrix, because it contains the correlation between each column/feature and the target label (winPlacePerc).**

In [None]:
# Keep only the final row of the correlation matrix (returned as a Series).
new = new.iloc[-1]

In [None]:
# Print the selected row of correlations.
# A bare expression that is not the last line of a cell is never displayed,
# so only the print is kept here.
print(new)

**Creating an array which contains the column names of the dataset**

In [None]:
# Column labels of the training frame as a NumPy array.
Col = train.columns.to_numpy()

**Select the columns whose correlation with the label is between -0.05 and 0.05**

In [None]:
# Collect the names of the columns whose correlation with the target is weak
# (strictly between -0.05 and 0.05).
# `new.items()` yields each column name together with its correlation, so the
# name no longer has to be looked up by position in a separately built array;
# that lookup would silently pick the wrong name if the two ever got out of step.
a = []
for x, (name, i) in enumerate(new.items()):
    if -0.05 < i < 0.05:
        print(x)  # position of the column in the correlation row
        print(i)  # its correlation with the target
        a.append(name)

**Let's see the ineffective columns in the dataset according to correlation.**

In [None]:
# Names of the columns collected by the correlation filter.
print(a)

**Drop them all**

In [None]:
# Columns removed from the training frame in this step.
weak_columns = [
    'killPoints', 'matchDuration', 'maxPlace', 'numGroups',
    'rankPoints', 'roadKills', 'teamKills', 'winPoints',
]
train = train.drop(columns=weak_columns)

**Finding count of Null values.**

In [None]:
# Total number of missing cells in the whole frame.
train.isna().sum().sum()

**Finding row indexes of Null values**

In [None]:
# Compute the missing-value mask once, then show the affected rows restricted
# to the columns that actually contain a missing value.
null_mask = train.isnull()
null_columns = train.columns[null_mask.any()]
rows_with_null = null_mask.any(axis=1)
print(train.loc[rows_with_null, null_columns].head())

**Looking at row which includes Null value/values**

In [None]:
# Show every row that contains at least one Null value. Selecting the rows by
# content avoids a hard-coded row position, which is only valid for one
# particular copy of the data.
train[train.isnull().any(axis=1)]

**Dropping the row which includes Null value/values**

In [None]:
# Drop every row that contains a Null value. Selecting the rows by content
# rather than by a fixed position keeps the cell correct if the data changes,
# and makes it safe to run again: a second run of a positional drop would
# remove a valid row.
train.dropna(inplace=True)

**In order to decrease the number of columns, we combine the 3 columns 'rideDistance', 'swimDistance' and 'walkDistance' into a single column named 'DistanceTraveled'. At the same time, this is a small 'dimension reduction' operation.**

In [None]:
# Total distance covered by a player: riding + swimming + walking.
train['DistanceTraveled'] = (
    train['rideDistance']
    + train['swimDistance']
    + train['walkDistance']
)

**Now it is time to clean up. We will drop the columns or rows which are Null or unrelated.**

In [None]:
# Display the column labels after the new 'DistanceTraveled' column was added.
train.columns

**Now, we drop 3 columns that we don't need anymore.**

In [None]:
# The three individual distance columns are now covered by 'DistanceTraveled'.
train = train.drop(columns=['rideDistance', 'swimDistance', 'walkDistance'])

**Now we see that the 'DistanceTraveled' column is at the end of the columns. We don't want it there: the y label (winPlacePerc) should be the last column, because that makes the dataset easier to understand. So we will reorder the columns.**

In [None]:
# Desired column order: all features first, the target 'winPlacePerc' last.
column_order = [
    'assists', 'boosts', 'damageDealt', 'DBNOs', 'DistanceTraveled',
    'headshotKills', 'heals', 'killPlace', 'kills', 'killStreaks',
    'longestKill', 'revives', 'vehicleDestroys', 'weaponsAcquired',
    'winPlacePerc',
]
train = train.loc[:, column_order]

In [None]:
# Display the column labels again to check the new order.
train.columns

**The data preprocessing is nearly finished. From now on, we will work with Machine Learning-Deep Learning models**

**First we will split the data X and y**

In [None]:
# Every column except the last one is a feature; the last column is the target.
# The target is sliced with `-1:` so that it stays two-dimensional.
features = train.iloc[:, :-1]
target = train.iloc[:, -1:]
X = features.to_numpy()
y = target.to_numpy()

In [None]:
# Shapes of the feature matrix and the target array, one per line.
print(X.shape, y.shape, sep="\n")

**Now we will split again the data to train and test sets with train_test_split function from Sci-kit learn library.**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state=42)

In [None]:
# Report the shape of each array produced by the split.
for label, arr in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(label + " Shape: ", arr.shape)

**We have train and test labels. They contain values between 0 and 1, but we need two distinct classes for binary classification, so we will split the labels into 0 and 1: if the value is less than or equal to 0.5, it becomes False (0); if it is greater than 0.5, it becomes True (1).**

In [None]:
# Turn the continuous target into boolean class labels: True when above 0.5.
y_train = np.greater(y_train, 0.5)
y_test = np.greater(y_test, 0.5)

**The last thing we need to do to the data before creating the model is feature scaling, because the ranges of the features are very different. We will bring those numbers to the same scale, which will also improve the model's score and accuracy. If you don't believe it, you can try it yourself :)**

In [None]:
# Feature scaling: learn mean/variance on the training split only, then apply
# the same transformation to both splits.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

**Importing Deep Learning Library (Keras)**

In [None]:
from keras.models import Sequential
from keras.layers import Dense

**Define new classifier object**

In [None]:
# Start with an empty Sequential model; layers are added in the next cell.
classifier = Sequential()

In [None]:
# Input layer together with the first hidden layer (14 features in, 14 units).
classifier.add(Dense(units=14, kernel_initializer='normal', activation='relu', input_dim=14))

# Second and third hidden layers share the same configuration (7 ReLU units each).
for _ in range(2):
    classifier.add(Dense(units=7, kernel_initializer='normal', activation='relu'))

# Output layer: a single sigmoid unit, because this is a binary classification
# whose output lies between 0 and 1.
classifier.add(Dense(units=1, kernel_initializer='normal', activation='sigmoid'))

In [None]:
# Compiling the ANN: Adam optimizer, binary cross-entropy loss (matches the
# single sigmoid output), accuracy reported during training.
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

**You can see that the number of epochs is low. I tried it: the accuracy doesn't change much after 2 epochs.**

In [None]:
# Fitting the ANN to the Training set (mini-batches of 128 rows, 2 passes over the data).
classifier.fit(X_train, y_train, batch_size = 128, epochs = 2)

**Changing probabilities to 0 and 1.**

In [None]:
# Predict on the held-out split and threshold the sigmoid outputs at 0.5
# to obtain boolean class labels.
y_pred = classifier.predict(X_test) > 0.5

**Evaluating the model performance.**

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the true labels against the thresholded predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
from sklearn.metrics import f1_score
# F1 score (harmonic mean of precision and recall) of the thresholded predictions.
f1 = f1_score(y_test, y_pred)
# The value is an F1 score, so it is labelled as such rather than as "accuracy".
print("F1 score= %"+"%.2f"%(f1*100))

**Evaluating deep learning model with Kfold Cross Validation.**

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
def build_classifier():
    """Build and compile the network used for K-fold cross validation.

    The model expects 14 input features and stacks three ReLU hidden layers
    (14, 7 and 7 units) followed by a single sigmoid output unit. It is
    compiled with the Adam optimizer, binary cross-entropy loss and accuracy
    as the reported metric.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layers = [
        Dense(units=14, kernel_initializer='normal', activation='relu', input_dim=14),
        Dense(units=7, kernel_initializer='normal', activation='relu'),
        Dense(units=7, kernel_initializer='normal', activation='relu'),
        Dense(units=1, kernel_initializer='normal', activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Wrap the builder so that scikit-learn can cross-validate the Keras model.
classifier2 = KerasClassifier(build_fn=build_classifier, batch_size=128, epochs=1)
# 4-fold cross validation on the training split, using all available cores.
accuracies = cross_val_score(classifier2, X_train, y_train, cv=4, n_jobs=-1)
mean = np.mean(accuracies)
# NOTE: despite its name, `variance` holds the standard deviation of the fold scores.
variance = np.std(accuracies)

In [None]:
# Summary of the cross-validation scores.
print("Mean accuracy of Kfold: %{}".format(mean*100))
# `variance` is computed with accuracies.std(), so the value shown is a
# standard deviation and is labelled accordingly.
print("Standard deviation: ",variance)

**Submission chapter**

In [None]:
# Load the competition test data (used only to create the submission file).
test = pd.read_csv("../input/test_V2.csv")#importing test data 

**Preprocessing of test data**

In [None]:
# Keep the player ids for the submission file before the id columns are dropped.
Id = test.iloc[:,0:1].values

# Drop the same weakly correlated columns that were removed from the training data.
test.drop(columns=['killPoints',
 'matchDuration',
 'maxPlace',
 'numGroups',
 'rankPoints',
 'roadKills',
 'teamKills',
 'winPoints'], inplace=True)

# Drop the identifier columns and matchType, as was done for the training data.
test.drop(columns=['Id', 'groupId', 'matchId', 'matchType'], inplace = True)

# Combine the three distance columns exactly as for the training data.
test['DistanceTraveled'] = test.rideDistance + test.swimDistance + test.walkDistance
test.drop(columns=['rideDistance', 'swimDistance', 'walkDistance'], inplace = True)

# The training frame was reordered, so 'DistanceTraveled' is not the last
# feature there, while here it was just appended at the end. Put the test
# columns in the same order as the training features (every column of `train`
# except the final label column); otherwise each value would be fed to the
# wrong network input.
test = test[train.columns[:-1]]

# Reuse `sc`, the scaler fitted on the training split. The network was trained
# on features standardised with those statistics, so fitting a new scaler on
# the test data would feed it differently scaled inputs.
# `.values` passes a plain array, matching how `sc` was fitted.
test = sc.transform(test.values)

In [None]:
# Predict on the preprocessed test data and threshold the outputs at 0.5.
test_pred = classifier.predict(test) > 0.5

**Printing prediction results**

In [None]:
# Show the thresholded predictions for the test data.
print(test_pred)

In [None]:
# Shapes of the id array and of the prediction array, one per line.
print(Id.shape, test_pred.shape, sep="\n")

**You see 'Id' and 'test_pred' arrays have 2 dimensions but we need only 1 dimension. So we have to reshape them.**

In [None]:
# Flatten both arrays to one dimension. `-1` lets NumPy infer the length, so
# the cell does not depend on a hard-coded row count that only matches one
# particular test file.
Id = Id.reshape(-1)
test_pred = test_pred.reshape(-1)

**Creating submission csv**

In [None]:
# Build the submission table. The prediction column is named 'winPlacePerc',
# the name of the target column in the training data, and the boolean
# predictions are cast to float so the file contains numbers (1.0 / 0.0)
# rather than the strings True / False.
submission = pd.DataFrame({'Id':Id,'winPlacePerc':test_pred.astype(float)})

In [None]:
# Preview the submission table (head() returns the first 5 rows by default).
submission.head()

**Save the submission file as a csv file**

In [None]:
# Write the submission table to disk without the DataFrame index column,
# then confirm the file name.
filename = 'PUBGPredictions1.csv'
submission.to_csv(path_or_buf=filename, index=False)
saved_message = 'Saved file: ' + filename
print(saved_message)