In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, balanced_accuracy_score

# BMAC = balanced_accuracy_score(y_ture, y_pred)

# Setting up default plotting parameters
plt.rcParams['figure.figsize'] = [20.0, 7.0]
sns.set_palette('viridis')
sns.set_style('white')
sns.set_context('talk', font_scale=0.8)

In [2]:
# Read in Data and clean Data 
train = pd.read_csv('X_train.csv')
train_label = pd.read_csv('y_train.csv')
null_cnt = train.isnull().sum()

mean_col = train.mean()
numcols = train.columns
cols = numcols[numcols != 'id']
train[train==np.Inf] =np.NaN
train[train==np.NINF] = np.NaN
for c in cols: train[c].fillna(mean_col[c],inplace=True)

In [6]:
#train.describe()

Unnamed: 0,id,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x990,x991,x992,x993,x994,x995,x996,x997,x998,x999
count,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,...,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0
mean,2399.5,-0.002648,-0.042025,-0.132972,0.13723,0.513012,-0.683598,0.011357,0.658513,0.533715,...,-0.851262,0.313396,-0.648227,-0.157277,0.742207,-0.689283,0.192959,1.03409,-0.034786,0.381932
std,1385.784976,0.595816,0.601159,0.698546,0.568752,0.547111,0.667047,0.546985,0.522445,0.790867,...,0.610974,0.663424,0.867635,0.701542,0.529753,1.056638,0.760257,0.876001,0.539665,0.81544
min,0.0,-2.918944,-2.785198,-3.835306,-1.97776,-2.400628,-3.515205,-2.549499,-1.83115,-1.622625,...,-3.925078,-1.87314,-4.969579,-2.967889,-1.575657,-4.213227,-3.916493,-1.279107,-1.85599,-2.047746
25%,1199.75,-0.317676,-0.414103,-0.585636,-0.224876,0.186098,-1.104949,-0.304116,0.338293,-0.059464,...,-1.197907,-0.167154,-1.170253,-0.589688,0.389121,-1.508434,-0.176619,0.344176,-0.361358,-0.226206
50%,2399.5,0.043687,-0.064241,-0.058176,0.114235,0.509427,-0.61607,0.035645,0.659884,0.393851,...,-0.818367,0.221534,-0.517181,-0.051797,0.720817,-0.458824,0.2714,0.863167,-0.048194,0.258635
75%,3599.25,0.35624,0.315951,0.361034,0.483126,0.837053,-0.235768,0.364625,0.993737,1.046684,...,-0.475521,0.724663,-0.02647,0.335511,1.060743,0.166137,0.659553,1.65215,0.271405,0.939263
max,4799.0,2.462148,2.790059,2.205821,2.968992,3.793726,1.991323,2.423469,2.958195,3.927408,...,2.045402,3.631006,2.131375,2.965337,2.818347,1.861042,3.163857,4.884627,2.925251,3.563565


In [14]:
len(train_label.loc[train_label.y==1])

3600

In [3]:
# Split Data: training and validation
y = train_label.y
X = train.drop('id',axis=1)
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=7)

## Try Logistic Regression (Using Cross validation with 5 folds)
lr = LogisticRegression(solver='liblinear',multi_class='auto').fit(X_train,y_train)
scores = cross_val_score(lr, X_train, y_train, cv=5)
print('Log Regression Loss:',scores.mean())
y_train_pred = lr.predict(X_train)
print('BMAC of training',balanced_accuracy_score(y_train, y_train_pred))
y_val_pred = lr.predict(X_val)
print('BMAC of validation',balanced_accuracy_score(y_val, y_val_pred))


Log Regression Loss: 0.7627602506564116
BMAC of training 0.9711123550516029
BMAC of validation 0.5987393648450277


In [4]:
## Over sampling (SMOTE)
from imblearn.over_sampling import SMOTE
y = train_label.y
X = train.drop('id',axis=1)
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=7)
sm = SMOTE(sampling_strategy='auto', random_state=20)
X_train , y_train = sm.fit_resample(X_train, y_train)

## Try Logistic Regression (Using Cross validation with 5 folds)
lr = LogisticRegression(solver='liblinear',multi_class='auto')
scores = cross_val_score(lr, X_train, y_train, cv=5)
print('Log Regression Loss:',scores.mean())
lr = LogisticRegression(solver='liblinear',multi_class='auto').fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
print('BMAC of training',balanced_accuracy_score(y_train, y_train_pred))
y_val_pred = lr.predict(X_val)
print('BMAC of validation',balanced_accuracy_score(y_val, y_val_pred))


Using TensorFlow backend.


Log Regression Loss: 0.9353043478260868
BMAC of training 0.992463768115942
BMAC of validation 0.6121981022012658


In [6]:
## XGBoost METHOD
import xgboost as xgb
model_XGBoostClassifier = xgb.XGBClassifier()

y = train_label.y
X = train.drop('id',axis=1)
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=7)
sm = SMOTE(sampling_strategy='auto', random_state=20)
X_train , y_train = sm.fit_resample(X_train, y_train)


lr = model_XGBoostClassifier
scores = cross_val_score(lr, X_train, y_train, cv=5)
print('Log Regression Loss:',scores.mean())
lr = model_XGBoostClassifier.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
print('BMAC of training',balanced_accuracy_score(y_train, y_train_pred))
y_val_pred = lr.predict(X_val)
print('BMAC of validation',balanced_accuracy_score(y_val, y_val_pred))

KeyboardInterrupt: 

In [10]:
test = pd.read_csv('X_test.csv')
test_id = test['id']
X_test = test.drop('id',axis=1)
y_test = lr.predict(X_test)
sol = pd.DataFrame()
sol['id'] = test_id
sol['y'] = y_test
sol.to_csv('prediction.csv',index=False)

  result = method(y)


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().