In [1]:
%matplotlib inline

In [9]:
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
import sklearn


from sklearn.preprocessing import normalize
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [10]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
DATA_PATH = '../data/'

INPUT_TRAIN = DATA_PATH+'input_train.csv'
OUTPUT_TRAIN = DATA_PATH+'output_train.csv'
INPUT_SUBMISSION = DATA_PATH+'input_test.csv'

In [12]:
input_train = pd.read_csv(INPUT_TRAIN,index_col='Id')
output_train = pd.read_csv(OUTPUT_TRAIN,sep=';',index_col='Id')
input_submission = pd.read_csv(INPUT_SUBMISSION ,index_col='Id')

In [13]:
input_train.shape

(19427, 7)

In [14]:
# Select the rows with a canalisation breaks
ID_2014 = output_train[output_train['2014']==1].index.tolist()
ID_2015 = output_train[output_train['2015']==1].index.tolist()

In [15]:
print("Dimension of breaks in 2014: {0}".format(input_train.iloc[ID_2014].shape))
print("Dimension of breaks in 2015: {0}".format(input_train.iloc[ID_2015].shape))

Dimension of breaks in 2014: (53, 7)
Dimension of breaks in 2015: (37, 7)


 We need to preprocess before splitting into test and train data because get_dummies will only take into account existing categories, thus there are less columns in the test set if we preprocess after splitting.

In [16]:
def preprocess(dataframe,year=2014):
    X = dataframe
    
    # The relevant value is the age of the pipes
    X['Age'] = year - X['YearConstruction']
    X = X.fillna(10000)
    
    # How long has it been since last failure
    X['YearsOldLastFailure'] = year - X['YearLastFailureObserved']

    # Categorical data
    X = pd.concat([X,pd.get_dummies(X['Feature1'])],axis=1)
    X = pd.concat([X,pd.get_dummies(X['Feature2'])],axis=1)
    X = pd.concat([X,pd.get_dummies(X['Feature4'])],axis=1)
    
    X = X.drop(["YearConstruction","YearLastFailureObserved","Feature1","Feature2","Feature4"],axis=1)
   
    X['Feature3'] = normalize(X['Feature3']).tolist()[0]
    X['Length'] = normalize(X['Length']).tolist()[0]
    X['Age'] = normalize(X['Age']).tolist()[0]
    X['YearsOldLastFailure'] = normalize(X['YearsOldLastFailure']).tolist()[0]
     
    col = X.columns[4:]
    for c in col:
        for u in col:
            X[c+u] = X[c]*X[u]
    return X

In [23]:
input_train = preprocess(input_train,year = 2015)

In [24]:
perc = 0.5

In [25]:
test_ids_2014 = [ID_2014[w] for w in np.random.randint(0,high=len(ID_2014),size=int(perc*len(ID_2014)))] + np.random.randint(0,high=input_train.shape[0],size=int(perc*len(ID_2014))).tolist()
test_ids_2015 = [ID_2015[w] for w in np.random.randint(0,high=len(ID_2015),size=int(perc*len(ID_2015)))] + np.random.randint(0,high=input_train.shape[0],size=int(perc*len(ID_2015))).tolist()

In [26]:
input_test = pd.concat([input_train.loc[test_ids_2014],input_train.loc[test_ids_2015]])
output_test = pd.concat([output_train.loc[test_ids_2014],output_train.loc[test_ids_2015]])

print(input_test.shape,output_test.shape)

((88, 94), (88, 2))


In [27]:
ID_2014_train = [w for w in ID_2014 if w not in test_ids_2014]
ID_2015_train = [w for w in ID_2015 if w not in test_ids_2015]
ID_train = [w for w in output_train.index if w not in (test_ids_2014 + test_ids_2015)]

In [28]:
# Augment data with breaks to counter unbalanced dataset only for training
REPETITIONS = 7
for k in range(0,REPETITIONS):
    input_train = pd.concat([input_train.loc[ID_2014_train],input_train.loc[ID_2015_train],input_train.loc[ID_train]])
    output_train = pd.concat([output_train.loc[ID_2014_train],output_train.loc[ID_2015_train],output_train.loc[ID_train]])

In [29]:
print(input_train.shape, output_train.shape)
print(input_test.shape, output_test.shape)

((25955, 94), (25955, 2))
((88, 94), (88, 2))


In [30]:
def preprocess_output(dataframe,year=2014):
    '''
    Selects the right colum for the year studied
    '''
    return dataframe[str(year)]

In [31]:
YEAR = 2014

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test = preprocess_output(output_test, year = YEAR)

logreg_1 = LogisticRegression(class_weight='balanced')
logreg_1.fit(X_train, Y_train)

y_pred = logreg_1.predict(X_test)

print(classification_report(Y_test,y_pred))

YEAR = 2015

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test_2 = preprocess_output(output_test, year = YEAR)

logreg_2 = LogisticRegression(class_weight='balanced')
logreg_2.fit(X_train, Y_train)

y_pred_2 = logreg_2.predict(X_test)

print(classification_report(Y_test_2,y_pred_2))

             precision    recall  f1-score   support

          0       0.73      0.61      0.67        59
          1       0.41      0.55      0.47        29

avg / total       0.63      0.59      0.60        88

             precision    recall  f1-score   support

          0       0.94      0.74      0.83        69
          1       0.47      0.84      0.60        19

avg / total       0.84      0.76      0.78        88



In [32]:
from public_auc_veolia2 import score_function

In [33]:
pred = np.array([y_pred,y_pred_2]).T
true = np.array([Y_test,Y_test_2]).T

print(score_function(pred,true))
print('Votre score est de : 0.76149277963129')

0.62649162229
Votre score est de : 0.76149277963129


# SVM

In [34]:
YEAR = 2014

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test = preprocess_output(output_test, year = YEAR)

logreg_1 = SVC()
logreg_1.fit(X_train, Y_train)

y_pred = logreg_1.predict(X_test)

print(classification_report(Y_test,y_pred))

YEAR = 2015

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test_2 = preprocess_output(output_test, year = YEAR)

logreg_2 = SVC()
logreg_2.fit(X_train, Y_train)

y_pred_2 = logreg_2.predict(X_test)

print(classification_report(Y_test_2,y_pred_2))

             precision    recall  f1-score   support

          0       0.67      1.00      0.80        59
          1       0.00      0.00      0.00        29

avg / total       0.45      0.67      0.54        88

             precision    recall  f1-score   support

          0       0.78      1.00      0.88        69
          1       0.00      0.00      0.00        19

avg / total       0.61      0.78      0.69        88



In [35]:
pred = np.array([y_pred,y_pred_2]).T
true = np.array([Y_test,Y_test_2]).T

print(score_function(pred,true))
print('Votre score est de : 0.76149277963129')

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
# Predictions for the submission data
sub_1 = logreg_1.predict(preprocess(input_submission,year=2015))
sub_2 = logreg_2.predict(preprocess(input_submission,year=2015))

In [None]:
# Submission formating
submission = pd.DataFrame()
submission['Id'] = input_submission.index.tolist()
submission['2014'] = sub_1[:]
submission['2015'] = sub_2[:]
submission = submission.set_index('Id')
submission.to_csv('../submissions/with_augmentation_aftersplit.csv',index=True,sep=';')

# With adaboost

In [38]:
 from sklearn.ensemble import AdaBoostClassifier

In [40]:
YEAR = 2014

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test = preprocess_output(output_test, year = YEAR)

rdm_1 = AdaBoostClassifier(n_estimators=100)
rdm_1.fit(X_train, Y_train)

y_pred = rdm_1.predict(X_test)

print(classification_report(Y_test,y_pred))

             precision    recall  f1-score   support

          0       0.66      0.92      0.77        59
          1       0.17      0.03      0.06        29

avg / total       0.50      0.62      0.53        88



In [41]:
YEAR = 2015

X_train = input_train
Y_train = preprocess_output(output_train, year = YEAR)
X_test = input_test
Y_test = preprocess_output(output_test, year = YEAR)

rdm_2 = AdaBoostClassifier(n_estimators=100)
rdm_2.fit(X_train, Y_train)

y_pred = rdm_1.predict(X_test)

print(classification_report(Y_test,y_pred))

             precision    recall  f1-score   support

          0       0.80      0.96      0.87        69
          1       0.50      0.16      0.24        19

avg / total       0.74      0.78      0.74        88



In [42]:
pred = np.array([y_pred,y_pred_2]).T
true = np.array([Y_test,Y_test_2]).T

In [43]:
score_function(pred,true)

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [44]:
YEAR = 2015
X = preprocess(input_train,year=YEAR)
Y = preprocess_output(output_train,year=YEAR)

X_train, X_test, y_train, y_test_1 = train_test_split(X, Y, test_size=0.4, random_state=0)

rdm_2015 = AdaBoostClassifier(n_estimators=1000)
rdm_2015.fit(X_train, y_train)
y_pred_1 = rdm_2015.predict(X_test)

print(classification_report(y_test_1,y_pred_1))


YEAR = 2014
X = preprocess(input_train,year=YEAR)
Y = preprocess_output(output_train,year=YEAR)

X_train, X_test, y_train, y_test_2 = train_test_split(X, Y, test_size=0.4, random_state=0)
rdm_2015.fit(X_train, y_train)

y_pred_2 = rdm_2015.predict(X_test)

print(classification_report(y_test_2,y_pred_2))



KeyError: 'YearConstruction'

In [76]:
pred = np.array([y_pred_1,y_pred_2])
pred = pred.T
true = np.array([y_test_2,y_test_2])
true = true.T
score_function(pred,true)

0.79625925560382138

In [297]:
# Predictions for the submission data
sub_1 = rdm_2015.predict_proba(preprocess(input_test,year=2014))
sub_2 = rdm_2015.predict_proba(preprocess(input_test,year=2015))

In [298]:
# Submission formating
submission = pd.DataFrame()
submission['Id'] = input_test.index.tolist()
submission['2014'] = sub_1[:,1]
submission['2015'] = sub_2[:,1]
submission = submission.set_index('Id')
submission.to_csv('../submissions/data_augmentation_ada.csv',index=True,sep=';')