In [230]:
%matplotlib inline

In [231]:
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
import sklearn


from sklearn.preprocessing import normalize
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [232]:
import warnings
warnings.filterwarnings('ignore')

In [291]:
DATA_PATH = '../data/'

INPUT_TRAIN = DATA_PATH+'input_train.csv'
OUTPUT_TRAIN = DATA_PATH+'output_train.csv'
INPUT_TEST = DATA_PATH+'input_test.csv'

In [292]:
input_train = pd.read_csv(INPUT_TRAIN,index_col='Id')
output_train = pd.read_csv(OUTPUT_TRAIN,sep=';',index_col='Id')
input_test = pd.read_csv(INPUT_TEST,index_col='Id')

In [293]:
input_train.shape

(19427, 7)

In [294]:
# Select the rows with a canalisation breaks
ID_2014 = output_train[output_train['2014']==1].index.tolist()
ID_2015 = output_train[output_train['2015']==1].index.tolist()

In [238]:
print("Dimension of breaks in 2014: {0}".format(input_train.iloc[ID_2014].shape))
print("Dimension of breaks in 2015: {0}".format(input_train.iloc[ID_2015].shape))

Dimension of breaks in 2014: (53, 7)
Dimension of breaks in 2015: (37, 7)


In [239]:
# Augment data with breaks to counter unbalanced dataset
REPETITIONS = 7
for k in range(0,REPETITIONS):
    input_train = pd.concat([input_train.loc[ID_2014],input_train.loc[ID_2015],input_train])
    output_train = pd.concat([output_train.loc[ID_2014],output_train.loc[ID_2015],output_train])

In [240]:
output_train.shape

(36653, 2)

In [241]:
input_train.shape

(36653, 7)

In [242]:
def preprocess(dataframe,year=2014):
    X = dataframe
    
    # The relevant value is the age of the pipes
    X['Age'] = year - X['YearConstruction']
    X = X.fillna(10000)
    
    # How long has it been since last failure
    X['YearsOldLastFailure'] = year - X['YearLastFailureObserved']

    # Categorical data
    X = pd.concat([X,pd.get_dummies(X['Feature1'])],axis=1)
    X = pd.concat([X,pd.get_dummies(X['Feature2'])],axis=1)
    X = pd.concat([X,pd.get_dummies(X['Feature4'])],axis=1)
    
    X = X.drop(["YearConstruction","YearLastFailureObserved","Feature1","Feature2","Feature4"],axis=1)
   
    X['Feature3'] = normalize(X['Feature3']).tolist()[0]
    X['Length'] = normalize(X['Length']).tolist()[0]
    X['Age'] = normalize(X['Age']).tolist()[0]
    X['YearsOldLastFailure'] = normalize(X['YearsOldLastFailure']).tolist()[0]
     
    return X

In [171]:
train = preprocess(input_train)
train.head()

Unnamed: 0_level_0,Feature3,Length,Age,YearsOldLastFailure,P,T,IAB,O,U,C,D,Dr,M
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,-0.001506,0.000494,0.003792,-0.007229,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.015677,0.000557,0.014292,-0.007229,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,-0.003116,0.000545,0.003208,-0.007229,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,-0.003653,0.000256,0.004083,-0.007229,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.010307,0.00821,0.003792,-0.007229,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [243]:
def preprocess_output(dataframe,year=2014):
    '''
    Selects the right colum for the year studied
    '''
    return dataframe[str(year)]

In [244]:
YEAR = 2014

In [255]:
X = preprocess(input_train, year = YEAR)
Y = preprocess_output(output_train, year = YEAR)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

logreg = LogisticRegression(class_weight='balanced')
logreg.fit(X_train, y_train)



y_pred = logreg.predict(X_test)

print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.92      0.85      0.88      7757
          1       0.69      0.83      0.75      3239

avg / total       0.85      0.84      0.84     10996



In [256]:
YEAR = 2015
X = preprocess(input_train,year=YEAR)
Y = preprocess_output(output_train,year=YEAR)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

logreg_2015 = LogisticRegression(class_weight='balanced')
logreg_2015.fit(X_train, y_train)

y_pred = logreg_2015.predict(X_test)

print(classification_report(y_test,y_pred))



             precision    recall  f1-score   support

          0       0.92      0.85      0.88      7757
          1       0.69      0.83      0.75      3239

avg / total       0.85      0.84      0.84     10996



# With adaboost

In [275]:
 from sklearn.ensemble import AdaBoostClassifier

In [281]:
YEAR = 2015
X = preprocess(input_train,year=YEAR)
Y = preprocess_output(output_train,year=YEAR)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)

rdm_2015 = AdaBoostClassifier(n_estimators=)
rdm_2015.fit(X_train, y_train)
y_pred = rdm_2015.predict(X_test)

print(classification_report(y_test,y_pred))


YEAR = 2014
X = preprocess(input_train,year=YEAR)
Y = preprocess_output(output_train,year=YEAR)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
rdm_2015.fit(X_train, y_train)

y_pred = rdm_2015.predict(X_test)

print(classification_report(y_test,y_pred))



             precision    recall  f1-score   support

          0       0.91      0.97      0.94     10344
          1       0.90      0.77      0.83      4318

avg / total       0.91      0.91      0.91     14662

             precision    recall  f1-score   support

          0       0.91      0.92      0.91      9555
          1       0.84      0.83      0.83      5107

avg / total       0.88      0.89      0.89     14662



In [297]:
# Predictions for the submission data
sub_1 = rdm_2015.predict_proba(preprocess(input_test,year=2014))
sub_2 = rdm_2015.predict_proba(preprocess(input_test,year=2015))

In [298]:
# Submission formating
submission = pd.DataFrame()
submission['Id'] = input_test.index.tolist()
submission['2014'] = sub_1[:,1]
submission['2015'] = sub_2[:,1]
submission = submission.set_index('Id')
submission.to_csv('../submissions/data_augmentation_ada.csv',index=True,sep=';')