In [1]:
%matplotlib inline

In [23]:
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
import sklearn


from sklearn.preprocessing import normalize
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.cross_validation import StratifiedKFold

import utils

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
DATA_PATH = '../data/'

INPUT_TRAIN = DATA_PATH+'input_train.csv'
OUTPUT_TRAIN = DATA_PATH+'output_train.csv'
INPUT_SUBMISSION = DATA_PATH+'input_test.csv'

In [5]:
input_raw = pd.read_csv(INPUT_TRAIN,index_col='Id')
output_raw = pd.read_csv(OUTPUT_TRAIN,sep=';',index_col='Id')
input_submission = pd.read_csv(INPUT_SUBMISSION ,index_col='Id')

In [6]:
input_raw.shape

(19427, 7)

 We need to preprocess before splitting into test and train data because get_dummies will only take into account existing categories, thus there are less columns in the test set if we preprocess after splitting.

In [7]:
def preprocess(dataframe,year=2014, more_features = False):
    X = dataframe
    
    # The relevant value is the age of the pipes
    X['Age'] = year - X['YearConstruction']
    X = X.fillna(10000)
    
    # How long has it been since last failure
    X['YearsOldLastFailure'] = year - X['YearLastFailureObserved']

    # Categorical data
    X = pd.concat([X,pd.get_dummies(X['Feature1'])],axis=1)
    X = pd.concat([X,pd.get_dummies(X['Feature2'])],axis=1)
    X = pd.concat([X,pd.get_dummies(X['Feature4'])],axis=1)
    
    X = X.drop(["YearConstruction","YearLastFailureObserved","Feature1","Feature2","Feature4"],axis=1)
   
    X['Feature3'] = normalize(X['Feature3']).tolist()[0]
    X['Length'] = normalize(X['Length']).tolist()[0]
    X['Age'] = normalize(X['Age']).tolist()[0]
    X['YearsOldLastFailure'] = normalize(X['YearsOldLastFailure']).tolist()[0]
    
    if more_features:
        col = X.columns[4:]
        for c in col:
            for u in col:
                X[c+u] = X[c]*X[u]
    return X

In [8]:
input_preprocessed = preprocess(input_raw,year = 2015)

# Split train and test

### With K-fold (allows to keep right proportions within classes)

In [9]:
output_bool = (output_raw['2014'] + output_raw['2015'])>0

In [20]:
skf = StratifiedKFold(output_bool,n_folds=2, shuffle=True)

In [11]:
for k, [train_index, test_index] in enumerate(skf):
   print("TRAIN:", train_index+1, "TEST:", test_index+1)
   test_index = test_index+1
   train_index = train_index+1
   input_train, input_test = input_preprocessed.loc[train_index], input_preprocessed.loc[test_index]
   output_train, output_test = output_raw.loc[train_index], output_raw.loc[test_index]

('TRAIN:', array([    2,     3,     5, ..., 19425, 19426, 19427]), 'TEST:', array([    1,     4,     6, ..., 19418, 19420, 19421]))
('TRAIN:', array([    1,     4,     6, ..., 19418, 19420, 19421]), 'TEST:', array([    2,     3,     5, ..., 19425, 19426, 19427]))


In [12]:
print "Repartition train: "
print "2015: ", output_train[output_train['2015'] == 1].shape[0]
print "2014: ", output_train[output_train['2014'] == 1].shape[0]
print "Not Broken: ", output_train[~((output_train['2014'] == 1) | (output_train['2015'] == 1))].shape[0]

print "Repartition test: "
print "2015: ", output_test[output_test['2015'] == 1].shape[0]
print "2014: ", output_test[output_test['2014'] == 1].shape[0]
print "Not Broken: ", output_test[~((output_test['2014'] == 1) | (output_test['2015'] == 1))].shape[0]

Repartition train: 
2015:  19
2014:  27
Not Broken:  9670
Repartition test: 
2015:  18
2014:  26
Not Broken:  9670


### With homemade technic

In [13]:
perc = 0.4
perc2 = 10

In [12]:
test_ids_2014 = [ID_2014[w] for w in np.random.randint(0,high=len(ID_2014),size=int(perc*len(ID_2014)))] + np.random.randint(0,high=input_train.shape[0],size=int(perc2*len(ID_2014))).tolist()
test_ids_2015 = [ID_2015[w] for w in np.random.randint(0,high=len(ID_2015),size=int(perc*len(ID_2015)))] + np.random.randint(0,high=input_train.shape[0],size=int(perc2*len(ID_2015))).tolist()

In [13]:
input_test = pd.concat([input_train.loc[test_ids_2014],input_train.loc[test_ids_2015]])
output_test = pd.concat([output_train.loc[test_ids_2014],output_train.loc[test_ids_2015]])

print "Repartition: "
print "2015: ", output_test[output_test['2015'] == 1].shape[0]
print "2014: ", output_test[output_test['2014'] == 1].shape[0]
print "Not Broken: ", output_test[~((output_test['2014'] == 1) | (output_test['2015'] == 1))].shape[0]

Repartition: 
2015:  17
2014:  23
Not Broken:  897


In [14]:
ID_2014_train = [w for w in ID_2014 if w not in test_ids_2014]
ID_2015_train = [w for w in ID_2015 if w not in test_ids_2015]
ID_train = [w for w in output_train.index if w not in (test_ids_2014 + test_ids_2015)]

# Data augmentation

### With simple duplication

In [16]:
# Select the rows with a canalisation breaks
ID_2014 = output_train[output_train['2014']==1].index.tolist()
ID_2015 = output_train[output_train['2015']==1].index.tolist()
ID_train = output_train[(output_train['2015']!=1) & (output_train['2014']!=1)].index.tolist()

In [17]:
# Augment data with breaks to counter unbalanced dataset only for training
input_train_duplicate = input_train
output_train_duplicate = output_train
REPETITIONS = 6
for k in range(0,REPETITIONS):
    input_train_duplicate = pd.concat([input_train_duplicate.loc[ID_2014],input_train_duplicate.loc[ID_2015],input_train_duplicate])
    output_train_duplicate = pd.concat([output_train_duplicate.loc[ID_2014],output_train_duplicate.loc[ID_2015],output_train_duplicate])

In [18]:
print "Repartition train: "
print "2015: ", output_train_duplicate[output_train_duplicate['2015'] == 1].shape[0]
print "2014: ", output_train_duplicate[output_train_duplicate['2014'] == 1].shape[0]
print "Not Broken: ", output_train_duplicate[(output_train_duplicate['2015']!=1) & (output_train_duplicate['2014']!=1)].shape[0]

Repartition train: 
2015:  2546
2014:  3058
Not Broken:  9670


In [17]:
def preprocess_output(dataframe,year=2014):
    '''
    Selects the right colum for the year studied
    '''
    return dataframe[str(year)]

In [18]:
YEAR = 2014

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test = preprocess_output(output_test, year = YEAR)

logreg_1 = LogisticRegression(class_weight='balanced')
logreg_1.fit(X_train, Y_train)

y_pred = logreg_1.predict(X_test)

print(classification_report(Y_test,y_pred))

YEAR = 2015

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test_2 = preprocess_output(output_test, year = YEAR)

logreg_2 = LogisticRegression(class_weight='balanced')
logreg_2.fit(X_train, Y_train)

y_pred_2 = logreg_2.predict(X_test)

print(classification_report(Y_test_2,y_pred_2))

             precision    recall  f1-score   support

          0       1.00      0.75      0.86       912
          1       0.08      0.87      0.15        23

avg / total       0.97      0.76      0.84       935

             precision    recall  f1-score   support

          0       0.99      0.79      0.88       918
          1       0.06      0.71      0.11        17

avg / total       0.98      0.79      0.86       935



In [None]:
pred = np.array([y_pred,y_pred_2]).T
true = np.array([Y_test,Y_test_2]).T

print(score_function(pred,true))
print('Votre score est de : 0.76149277963129')

# SVM

In [None]:
YEAR = 2014

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test = preprocess_output(output_test, year = YEAR)

logreg_1 = SVC()
logreg_1.fit(X_train, Y_train)

y_pred = logreg_1.predict(X_test)

print(classification_report(Y_test,y_pred))

YEAR = 2015

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test_2 = preprocess_output(output_test, year = YEAR)

logreg_2 = SVC()
logreg_2.fit(X_train, Y_train)

y_pred_2 = logreg_2.predict(X_test)

print(classification_report(Y_test_2,y_pred_2))

In [None]:
pred = np.array([y_pred,y_pred_2]).T
true = np.array([Y_test,Y_test_2]).T

print(score_function(pred,true))
print('Votre score est de : 0.76149277963129')

In [None]:
# Predictions for the submission data
#sub_1 = logreg_1.predict(preprocess(input_submission,year=2015))
#sub_2 = logreg_2.predict(preprocess(input_submission,year=2015))

In [None]:
# Submission formating
submission = pd.DataFrame()
submission['Id'] = input_submission.index.tolist()
submission['2014'] = sub_1[:]
submission['2015'] = sub_2[:]
submission = submission.set_index('Id')
submission.to_csv('../submissions/with_augmentation_aftersplit.csv',index=True,sep=';')

# With adaboost

In [None]:
 from sklearn.ensemble import AdaBoostClassifier

In [None]:
YEAR = 2014

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test = preprocess_output(output_test, year = YEAR)

rdm_1 = AdaBoostClassifier(n_estimators=100)
rdm_1.fit(X_train, Y_train)

y_pred = rdm_1.predict(X_test)

print(classification_report(Y_test,y_pred))

In [None]:
YEAR = 2015

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test = preprocess_output(output_test, year = YEAR)

rdm_2 = AdaBoostClassifier(n_estimators=100)
rdm_2.fit(X_train, Y_train)

y_pred = rdm_2.predict(X_test)

print(classification_report(Y_test,y_pred))

In [None]:
pred = np.array([y_pred,y_pred_2]).T
true = np.array([Y_test,Y_test_2]).T

In [None]:
score_function(pred,true)

In [None]:
# Predictions for the submission data
sub_1 = rdm_2015.predict_proba(preprocess(input_test,year=2014))
sub_2 = rdm_2015.predict_proba(preprocess(input_test,year=2015))

In [None]:
# Submission formating
submission = pd.DataFrame()
submission['Id'] = input_test.index.tolist()
submission['2014'] = sub_1[:,1]
submission['2015'] = sub_2[:,1]
submission = submission.set_index('Id')
submission.to_csv('../submissions/data_augmentation_ada.csv',index=True,sep=';')

# Gradient Boosting

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

In [20]:
GBC = GradientBoostingClassifier(n_estimators=10, max_depth=20)

In [21]:
YEAR = 2014

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test = preprocess_output(output_test, year = YEAR)

rdm_1 = GBC
rdm_1.fit(X_train, Y_train)

y_pred = rdm_1.predict(X_test)

print(classification_report(Y_test,y_pred))

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       909
          1       1.00      0.12      0.21        26

avg / total       0.98      0.98      0.97       935



In [22]:
YEAR = 2015

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test_2 = preprocess_output(output_test, year = YEAR)

rdm_2 = GBC
rdm_2.fit(X_train, Y_train)

y_pred_2 = rdm_2.predict(X_test)

print(classification_report(Y_test_2,y_pred_2))

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       916
          1       0.33      0.05      0.09        19

avg / total       0.97      0.98      0.97       935



In [23]:
pred = np.array([y_pred,y_pred_2]).T
true = np.array([Y_test,Y_test_2]).T

score_function(pred,true)

0.8554005722460658

# Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
RFC = RandomForestClassifier(n_estimators=10, max_depth=20)

In [21]:
YEAR = 2014

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test = preprocess_output(output_test, year = YEAR)

rdm_1 = RFC
rdm_1.fit(X_train, Y_train)

y_pred = rdm_1.predict(X_test)

print(classification_report(Y_test,y_pred))

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       912
          1       1.00      0.04      0.08        23

avg / total       0.98      0.98      0.97       935



In [22]:
YEAR = 2015

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test_2 = preprocess_output(output_test, year = YEAR)

rdm_2 = RFC
rdm_2.fit(X_train, Y_train)

y_pred_2 = rdm_2.predict(X_test)

print(classification_report(Y_test_2,y_pred_2))

             precision    recall  f1-score   support

          0       0.99      1.00      0.99       918
          1       0.80      0.24      0.36        17

avg / total       0.98      0.99      0.98       935



In [23]:
pred = np.array([y_pred,y_pred_2]).T
true = np.array([Y_test,Y_test_2]).T

score_function(pred,true)

0.95013791991895191

In [24]:
# Predictions for the submission data
sub_1 = rdm_1.predict_proba(preprocess(input_submission,year=2014))
sub_2 = rdm_2.predict_proba(preprocess(input_submission,year=2015))

In [25]:
# Submission formating
submission = pd.DataFrame()
submission['Id'] = input_submission.index.tolist()
submission['2014'] = sub_1[:,1]
submission['2015'] = sub_2[:,1]
submission = submission.set_index('Id')
submission.to_csv('../submissions/data_augmentation_rfc.csv',index=True,sep=';')