In [3]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Loading train and test data

train = pd.read_csv("../Dataset/train.csv")
test = pd.read_csv("../Dataset/test.csv")

# Balancing data: keep every positive row (TARGET == 1) together with an
# equally sized random sample of negative rows (TARGET == 0).
# The sample is seeded so that Restart & Run All reproduces the same subset.

BALANCE_SEED = 42

positives = train[train["TARGET"] == 1]
negatives = train[train["TARGET"] == 0]
# Never ask for more negatives than exist; otherwise match the positive count.
n_negatives = min(len(positives), len(negatives))
train = pd.concat(
    [negatives.sample(n_negatives, random_state=BALANCE_SEED), positives]
)

# Removing columns whose absolute Pearson correlation with the target variable
# is lower than CORR_THRESHOLD. The same columns are removed from the test set
# so that both frames keep an identical feature layout.

CORR_THRESHOLD = 0.05

train_corr = train.corr(method="pearson")["TARGET"]
# An undefined correlation (NaN, e.g. for a constant column) is treated as 0,
# which puts the column below the threshold.
train_corr = train_corr.fillna(0)

low_corr_columns = train_corr.index[train_corr.abs() < CORR_THRESHOLD]
train = train.drop(columns=low_corr_columns)
test = test.drop(columns=low_corr_columns)

In [11]:
# Removing outliers: keep only the rows in which every column has an absolute
# z-score below Z_SCORE_LIMIT.

Z_SCORE_LIMIT = 3

abs_z_scores = np.abs(stats.zscore(train))
train = train[(abs_z_scores < Z_SCORE_LIMIT).all(axis=1)]

# A NaN z-score never compares as "< limit", so a single column with an
# undefined z-score discards every row. Fail here with a clear message
# instead of deep inside the scaler.
if train.empty:
    raise ValueError(
        "Outlier filtering removed every training row; "
        "check for constant or NaN-containing columns."
    )

# Splitting data

Y = train["TARGET"]
feature_frame = train.drop(columns="TARGET")
feature_columns = feature_frame.columns

# Normalizing data
# The scaler is fitted on the training features ONLY. The test set is then
# transformed with those same training min/max values, so both sets share one
# scale; as a consequence, test values may fall slightly outside [0, 1].

scaler = MinMaxScaler(feature_range=(0, 1))
X = pd.DataFrame(scaler.fit_transform(feature_frame))
# Select the test columns in the training feature order before scaling.
test = pd.DataFrame(scaler.transform(test[feature_columns]))



In [9]:
# Model: Logistic Regression (accuracy noted by the author: 0.70)

model = LogisticRegression(max_iter=500)
model.fit(X, Y.values.ravel())

# Predict on the prepared test features and write the submission file.
# Rows are labelled with the ID column of the original test CSV; assigning
# that Series as the index also names the index "ID".

predictions = model.predict(test)
ID = pd.read_csv("../Dataset/test.csv")

result = pd.DataFrame(predictions, columns=["TARGET"])
result.index = ID["ID"]
result.to_csv("../Dataset/Kaggle.csv")

In [12]:
# Model: XGBoost (accuracy noted by the author: 0.64)

model = XGBClassifier()
target_values = Y.values.ravel()
model.fit(X, target_values)

# Run the fitted model on the test data, label each prediction with the ID
# taken from the original test CSV, and save the submission file.

ID = pd.read_csv("../Dataset/test.csv")
test_predictions = model.predict(test)

result = pd.DataFrame(test_predictions, columns=["TARGET"])
result.index = ID["ID"]  # the index takes its name ("ID") from the Series
result.to_csv("../Dataset/Kaggle.csv")

In [None]:
# Training Model - Random Forest (accuracy noted by the author: 0.65)

num_trees = 200
# At most 5 features are considered per split, capped at the number of
# available features so the forest still fits when fewer than 5 remain.
max_features = min(5, X.shape[1])
model = RandomForestClassifier(
    n_estimators=num_trees,
    max_features=max_features,
    random_state=42,  # fixed seed: a re-run reproduces the same forest
)
model.fit(X, Y.values.ravel())

# Running the model on test data and writing the submission file.
# Rows are labelled with the ID column of the original test CSV; assigning
# that Series as the index also names the index "ID".

result = pd.DataFrame(model.predict(test), columns=["TARGET"])

ID = pd.read_csv("../Dataset/test.csv")
result.index = ID["ID"]
result.to_csv("../Dataset/Kaggle.csv")