In [10]:
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import Imputer
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.metrics import silhouette_samples, silhouette_score

###############################################################################
# Data Load
###############################################################################
path = os.path.abspath(os.getcwd())
with open(os.path.normpath(os.path.join(os.path.dirname(path),'lin.csv'))) as in_data:
    skid_data = pd.DataFrame.from_csv(in_data, sep=',')

print (list(enumerate(skid_data.columns[1:],start = 1)))

#Loading into the numpy array
as_array = np.asfarray(skid_data[['Average Velocity (mph)','Max Velocity', 'Velocity Stdev','Average Acceleration (mph per s)', 'Max Acceleration (mph per s)', ' Acceleration Stdev','Displacement','Total Distance Traveled','Max Direction Change per sec', ' Direction Stdev','Time (s)', 'Turns', 'Aggressive Turns', 'Stops', 'Large Deceleration Events', 'Deceleration Events', 'Max Deceleration Event']])


# preprocessing tricks
imputer = Imputer(missing_values="NaN", strategy="mean")
patched = imputer.fit_transform(as_array)

###############################################################################
# K-Means
###############################################################################

n_clusters=3 
cluster = KMeans(n_clusters=n_clusters, n_init = 100)
cluster.fit_predict(patched)
classified_data = cluster.labels_

###############################################################################################
# Evaluate how good clusters are usng Silhouette Coefficient; lowest score is -1, highest is 1
###############################################################################################

silhouette_avg = silhouette_score(patched, classified_data)
print("For n_clusters =", n_clusters,
      "The average silhouette_score is :", silhouette_avg)

###############################################################################
# Ordinary Least Squares Report
###############################################################################

model = sm.OLS(classified_data, patched)
results = model.fit()
print (results.summary())

[(1, 'Average Velocity (mph)'), (2, 'Max Velocity'), (3, 'Velocity Stdev'), (4, 'Average Acceleration (mph per s)'), (5, 'Max Acceleration (mph per s)'), (6, ' Acceleration Stdev'), (7, 'Displacement'), (8, 'Total Distance Traveled'), (9, 'Max Direction Change per sec'), (10, ' Direction Stdev'), (11, 'Time (s)'), (12, 'Turns'), (13, 'Aggressive Turns'), (14, 'Stops'), (15, 'Large Deceleration Events'), (16, 'Deceleration Events'), (17, 'Max Deceleration Event')]
('For n_clusters =', 3, 'The average silhouette_score is :', 0.58512569314646201)
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.700
Model:                            OLS   Adj. R-squared:                  0.699
Method:                 Least Squares   F-statistic:                     629.5
Date:                Sun, 10 May 2015   Prob (F-statistic):               0.00
Time:                        00:31:48   Log-Likelihood: