In [1]:
from __future__ import print_function, division
import time, os
import numpy as np
import matplotlib.pyplot as plt
import sys
import pandas
import math
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Imputer

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# load cleaned data
to_date = lambda x: pandas.to_datetime(x).value/1000000000

test = pandas.read_csv('test_snapshots.csv', converters={'Event DateTime':to_date})
train = pandas.read_csv('train_snapshots.csv', converters={'Event DateTime':to_date})

In [3]:
train_t = train.drop('Event Type Description',1)
train_t = train_t.drop('Unnamed: 0',1)
train_t = train_t[train_t['state'].isin([0,1,2])]
# train_01 = train_t[train_t['state'].isin([0,1])]
# train_02 = train_t[train_t['state'].isin([0,2])]
# train_12 = train_t[train_t['state'].isin([1,2])]

In [4]:
test_t = test.drop('Event Type Description',1)
test_t = test_t.drop('Unnamed: 0',1)
test_t = test_t[test_t['state'].isin([0,1,2])]

In [5]:
x_cols = ['Veh Ref ID',
 'Acc Pedal Position',
 'Ambient Air Temp',
 'Barometric Press',
 'Brake Switch',
 'Bus Utilization',
 'Cat Intake Gas Temp',
 'Cat Outlet Gas Temp',
 'Clutch Switch',
 'Cmd Eng Fuel Press',
 'Cruise Status',
 'Dpf Regen Inhibit Sw',
 'Dpf Thermal Mngmnt',
 'Drvr Demand Torque',
 'Eng Air Flow Rate',
 'Eng Avg Fuel Econ',
 'Eng Coolant Level',
 'Eng Coolant Temp',
 'Eng Demand Torque',
 'Eng DPF Intake Press',
 'Eng Egr Valve Pos',
 'Eng Exhaust Gas Temp',
 'Eng Fuel Del Press',
 'EngFuelTemp1',
 'Engine Speed',
 'Eng Man Abs Pressure',
 'Eng Oil Pressure',
 'EngInjRail1Press',
 'EngIntakeMan1Temp',
 'EngOilTemp1',
 'Eng Percent Torque',
 'EngTurbo1Boost',
 'EngTurbo1Pos',
 'EngTurbo1Speed',
 'Event - All Lamps On Time Hr',
 'Event - Amber Lamp Time Hr',
 'Event - Mil Lamp Time Hr',
 'Event - Red Lamp Time Hr',
 'Exhaust Tank Level',
 'Exhaust Tank Temp',
 'Fan Speed',
 'Keyswitch Bat Pot',
 'Part Trap Diff Press',
 'Part Trap Out Temp',
 'Scr Intake Gas Temp',
 'Scr Outlet Gas Temp',
 'Vehicle Speed',
 'Population',
 'DTCID',
 'Trip Distance',
 'Trip Idle Time',
 'Trip Run Time',
 'Altitude',
 'Engine Start Ambient',
 'Engine Start Coolant',
 'Latitude',
 'Longitude',
 'Lifetime Idle Hours',
 'Lifetime Idle Fuel',
 'Lifetime Fuel',
 'Lifetime Distance',
 'Lifetime Engine Hours']

In [55]:
X_0 = np.asarray(train_0[x_cols])
X_1 = np.asarray(train_1[x_cols])
X_2 = np.asarray(train_2[x_cols])

Y_0 = np.asarray(train_0[['state']])
Y_1 = np.asarray(train_1[['state']])
Y_2 = np.asarray(train_2[['state']])

In [63]:
reg_01 = linear_model.Lasso(alpha = 0.1)
reg_01.fit(X_01, Y_01)

reg_02 = linear_model.Lasso(alpha = 0.1)
reg_02.fit(X_02, Y_02)

reg_12 = linear_model.Lasso(alpha = 0.1)
reg_12.fit(X_12, Y_12)

error_01 = cross_val_score(reg_01, X_test, Y_test, scoring='explained_variance')
error_02 = cross_val_score(reg_02, X_test, Y_test, scoring='explained_variance')
error_12 = cross_val_score(reg_12, X_test, Y_test, scoring='explained_variance')



Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [6]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

In [7]:
X_dirty = np.asarray(train_t[x_cols])
imp.fit(X_dirty)
X = imp.transform(X_dirty)
Y = np.asarray(train_t[['state']]).reshape((49384,))

In [8]:
X_test = np.asarray(test_t[x_cols])
Y_test = np.asarray(test_t[['state']]).reshape((9264,))

In [17]:
sum(class_svm.predict(X_test) == Y_test)/len(Y_test)

0.48650690846286704

In [38]:
test_t.groupby(['state']).size().reset_index(name='counts')

Unnamed: 0,state,counts
0,0.0,6873
1,1.0,1160
2,2.0,1231


In [26]:
test_t_sampled = test_t[test_t['state'] == 0].sample(1200)

In [37]:
train_t.groupby(['state']).size().reset_index(name='counts')

Unnamed: 0,state,counts
0,0.0,38668
1,1.0,5193
2,2.0,5523


In [39]:
train_t_sampled = train_t[train_t['state'] == 0].sample(5000)

In [40]:
train_t_balanced = pandas.concat([train_t_sampled,train_t[train_t['state'] == 1],train_t[train_t['state'] == 2]])

In [29]:
test_t_balanced = pandas.concat([test_t_sampled,test_t[test_t['state'] == 1],test_t[test_t['state'] == 2]])

In [43]:
X_dirty = np.asarray(train_t_balanced[x_cols])
imp.fit(X_dirty)
X_bal = imp.transform(X_dirty)
Y_bal = np.asarray(train_t_balanced[['state']]).reshape((15716,))

In [33]:
X_bal_test = np.asarray(test_t_balanced[x_cols])
Y_bal_test = np.asarray(test_t_balanced[['state']]).reshape((3591,))

In [44]:
log_reg = linear_model.LogisticRegression(max_iter=1000)
log_reg.fit(X_bal, Y_bal)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [45]:
cross_val_score(log_reg, X_bal, Y_bal, scoring='neg_mean_squared_error') 

array([-1.6466883 , -1.7497614 , -1.50057274])

In [46]:
class_svm = linear_model.SGDClassifier(loss="hinge", penalty="l2", alpha=0.001, max_iter=1000)
class_svm.fit(X_bal, Y_bal)

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=1000, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [47]:
cross_val_score(class_svm, X_bal, Y_bal, scoring='neg_mean_squared_error') 

array([-1.69020805, -0.93682   , -1.59354716])

In [48]:
sum(log_reg.predict(X_bal_test) == Y_bal_test)/len(Y_bal_test)

0.34920634920634919

In [49]:
sum(class_svm.predict(X_bal_test) == Y_bal_test)/len(Y_bal_test)

0.3539404065719855