In [1]:
%matplotlib inline

In [2]:
# In[3]:

import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
import sklearn


from sklearn.preprocessing import normalize
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report


# In[4]:

import warnings
warnings.filterwarnings('ignore')


# In[5]:

# Directory holding the CSV files, relative to this notebook
DATA_PATH = '../data/'

# Training features, training labels, and the features used for the submission
INPUT_TRAIN, OUTPUT_TRAIN, INPUT_SUBMISSION = (
    DATA_PATH + file_name
    for file_name in ('input_train.csv', 'output_train.csv', 'input_test.csv')
)


# In[6]:

# All three tables are indexed by their 'Id' column; the label file is read with ';' as separator
input_train = pd.read_csv(INPUT_TRAIN, index_col='Id')
output_train = pd.read_csv(OUTPUT_TRAIN, index_col='Id', sep=';')
input_submission = pd.read_csv(INPUT_SUBMISSION, index_col='Id')


# In[7]:

input_train.shape


# In[8]:

# Ids (index labels) of the rows flagged with a pipe break in each year
ID_2014 = output_train.index[output_train['2014'] == 1].tolist()
ID_2015 = output_train.index[output_train['2015'] == 1].tolist()


# In[9]:

# ID_2014 / ID_2015 contain index labels, so the rows must be selected by label
# (.loc); positional .iloc only gives the same rows when the Ids happen to be 0..n-1.
print("Dimension of breaks in 2014: {0}".format(input_train.loc[ID_2014].shape))
print("Dimension of breaks in 2015: {0}".format(input_train.loc[ID_2015].shape))


# We need to preprocess before splitting into train and test data: get_dummies only creates columns for the categories present in the frame it is given, so preprocessing after the split could leave the test set with fewer columns than the training set.

# In[10]:

def preprocess(dataframe,year=2014):
    """Build the model feature matrix from the raw pipe table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw features. Must contain the columns 'YearConstruction',
        'YearLastFailureObserved', 'Feature1', 'Feature2', 'Feature3',
        'Feature4' and 'Length'.
    year : int, default 2014
        Reference year used to turn the two year columns into durations.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the scaled 'Feature3', 'Length', 'Age' and
        'YearsOldLastFailure' columns, one indicator column per category of
        'Feature1', 'Feature2' and 'Feature4', and one product column per
        ordered pair of the columns after the first four. The input frame is
        left unmodified.
    """
    # Work on a copy: assigning 'Age' straight onto the argument used to add
    # that column to the caller's frame as a hidden side effect.
    X = dataframe.copy()

    # The relevant value is the age of the pipes
    X['Age'] = year - X['YearConstruction']
    # Every missing value (including an 'Age' computed from a missing
    # construction year) is replaced by the sentinel 10000.
    X = X.fillna(10000)

    # How long has it been since last failure. A missing failure year was just
    # filled with 10000, so those rows end up with year - 10000.
    X['YearsOldLastFailure'] = year - X['YearLastFailureObserved']

    # Categorical data: append one indicator column per observed category
    for categorical in ("Feature1", "Feature2", "Feature4"):
        X = pd.concat([X, pd.get_dummies(X[categorical])], axis=1)

    X = X.drop(["YearConstruction","YearLastFailureObserved","Feature1","Feature2","Feature4"],axis=1)

    # Scale each numeric column to unit L2 norm taken over all rows.
    # normalize() expects 2-D input; the column is passed explicitly as a
    # single (1, n_rows) sample, which is how older scikit-learn releases
    # implicitly treated a 1-D argument and which current releases require.
    for numeric in ("Feature3", "Length", "Age", "YearsOldLastFailure"):
        X[numeric] = normalize(X[numeric].values.reshape(1, -1))[0]

    # Pairwise products of every column from position 4 onwards.
    # NOTE(review): assumes the first four columns are exactly the numeric
    # ones scaled above, so that only indicator columns are crossed -- confirm
    # against the column order of the input file.
    col = X.columns[4:]
    for c in col:
        for u in col:
            X[c+u] = X[c]*X[u]
    return X


# In[11]:

# Feature-engineer the whole training table, measuring durations relative to 2015
input_train = preprocess(dataframe=input_train, year=2015)


# In[13]:

# Fraction of each year's break rows drawn for the held-out test set
perc = 0.5
# Seed of the generator used for the split, so the hold-out set is reproducible
SPLIT_SEED = 0


# In[14]:

def _sample_test_ids(break_ids, all_ids, fraction, rng):
    """Draw test ids for one year: sampled break ids followed by random ids.

    ``int(fraction * len(break_ids))`` ids are drawn from ``break_ids`` and the
    same number from ``all_ids``. Both draws are with replacement, so the
    returned list may contain repeated ids, and the random part may itself
    contain break ids. Returns an empty list when nothing is to be drawn.
    """
    n_draws = int(fraction * len(break_ids))
    if n_draws == 0:
        return []
    sampled_breaks = [break_ids[pos] for pos in rng.randint(0, high=len(break_ids), size=n_draws)]
    # Map drawn positions to real index labels instead of assuming the Ids are 0..n-1
    sampled_random = [all_ids[pos] for pos in rng.randint(0, high=len(all_ids), size=n_draws)]
    return sampled_breaks + sampled_random


# A local generator keeps the split reproducible without touching numpy's global RNG state
_split_rng = np.random.RandomState(SPLIT_SEED)
_all_ids = input_train.index.tolist()

test_ids_2014 = _sample_test_ids(ID_2014, _all_ids, perc, _split_rng)
test_ids_2015 = _sample_test_ids(ID_2015, _all_ids, perc, _split_rng)


# In[15]:

# Held-out rows: the 2014 sample followed by the 2015 sample, features and labels in the same order
_test_id_groups = (test_ids_2014, test_ids_2015)
input_test = pd.concat([input_train.loc[group] for group in _test_id_groups])
output_test = pd.concat([output_train.loc[group] for group in _test_id_groups])

print(input_test.shape,output_test.shape)


# In[16]:

# Every id that ended up in the test set, for either year. A set gives O(1)
# membership tests; the original rebuilt the concatenated list for each row.
_held_out_ids = set(test_ids_2014) | set(test_ids_2015)

# Exclude ALL held-out ids from every training list. Filtering each year's
# break list only against its own year's test ids let a pipe that was held out
# through the other year's sample be added back into the training data.
ID_2014_train = [w for w in ID_2014 if w not in _held_out_ids]
ID_2015_train = [w for w in ID_2015 if w not in _held_out_ids]
ID_train = [w for w in output_train.index if w not in _held_out_ids]


# In[17]:

# Augment the training data with extra copies of the break rows to counter the
# unbalanced dataset (the test frames built earlier are not touched here).
#
# Each pass overwrites input_train / output_train with the concatenation of
# [2014 break rows] + [2015 break rows] + [all rows listed in ID_train], using
# the same id lists for both frames so features and labels stay aligned.
#
# NOTE(review): ID_train already contains the training break ids, so after the
# first pass those labels occur more than once in the index, and a later .loc
# lookup on a repeated label presumably returns every copy. If so, break rows
# multiply geometrically per pass (x2, or x3 for a pipe listed in both years)
# rather than growing linearly with REPETITIONS -- confirm this is intended.
# NOTE(review): the cell reassigns its own inputs, so re-running it keeps
# growing the frames.
REPETITIONS = 7
for k in range(0,REPETITIONS):
    input_train = pd.concat([input_train.loc[ID_2014_train],input_train.loc[ID_2015_train],input_train.loc[ID_train]])
    output_train = pd.concat([output_train.loc[ID_2014_train],output_train.loc[ID_2015_train],output_train.loc[ID_train]])


# In[18]:

# Sanity check: each features/labels pair should report the same number of rows
print(input_train.shape, output_train.shape)
print(input_test.shape, output_test.shape)


# In[19]:

def preprocess_output(dataframe,year=2014):
    """Return the label column of ``dataframe`` for the requested year.

    The column is looked up under the string form of ``year`` (for example
    ``'2014'``).
    """
    column_name = str(year)
    year_labels = dataframe[column_name]
    return year_labels


# In[28]:

Dimension of breaks in 2014: (53, 7)
Dimension of breaks in 2015: (37, 7)
(88, 94) (88, 2)
(32769, 94) (32769, 2)
(88, 94) (88, 2)


In [3]:
from keras.models import Sequential
from keras.optimizers import SGD
from keras.layers import Dense, Activation

Using TensorFlow backend.


In [4]:
output_train.shape

(32769, 2)

In [5]:
input_train.shape

(32769, 94)

In [9]:
# Feed-forward classifier: one input per feature column, one output per label column.
model = Sequential()
# ReLU on the hidden layers: Dense layers stacked without any activation
# collapse into a single linear map, whatever their width.
# The input width is read from the training frame instead of hard-coding 94.
model.add(Dense(100, activation='relu', input_shape=(input_train.shape[1],)))
model.add(Dense(1000, activation='relu'))
model.add(Dense(1000, activation='relu'))

# The label columns ('2014', '2015') are separate per-year break flags, not a
# one-hot encoding of a single class: a row can be 0 in both. Softmax forces the
# outputs to sum to 1 and cannot represent that, so each output gets its own
# sigmoid and the loss is the matching per-output binary cross-entropy.
# NOTE(review): assumes the label values are 0/1 -- confirm against the CSV.
model.add(Dense(output_train.shape[1], activation='sigmoid'))
# NOTE(review): `sgd` is constructed but not passed to compile(), which trains
# with the 'rmsprop' optimizer; kept in case a later cell refers to it.
sgd = SGD(lr = 0.0001,momentum=0.9,decay=0.01)
model.compile(optimizer='rmsprop',
      loss='binary_crossentropy',
      metrics=['accuracy'])

In [10]:
input_train.shape

(32769, 94)

In [None]:
# Fit for 100 epochs on the training frames, using the held-out frames as validation data.
# NOTE(review): `nb_epoch` is the older Keras spelling of `epochs` -- check the installed version.
history = model.fit(
    np.array(input_train),
    np.array(output_train),
    validation_data=(np.array(input_test), np.array(output_test)),
    nb_epoch=100,
)

Train on 32769 samples, validate on 88 samples
Epoch 1/100