In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

In [47]:
# Read both CSV files and discard the first column of each by position.
# NOTE(review): the first column is presumably a saved row id / index
# column -- confirm against the files before relying on this.
train = pd.read_csv("../train.csv").iloc[:, 1:]
test = pd.read_csv("../test.csv").iloc[:, 1:]

In [48]:
# Print column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856387 entries, 0 to 856386
Data columns (total 27 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   IntersectionId           856387 non-null  int64  
 1   Latitude                 856387 non-null  float64
 2   Longitude                856387 non-null  float64
 3   EntryStreetName          848239 non-null  object 
 4   ExitStreetName           850100 non-null  object 
 5   EntryHeading             856387 non-null  object 
 6   ExitHeading              856387 non-null  object 
 7   Hour                     856387 non-null  int64  
 8   Weekend                  856387 non-null  int64  
 9   Month                    856387 non-null  int64  
 10  Path                     856387 non-null  object 
 11  TotalTimeStopped_p20     856387 non-null  float64
 12  TotalTimeStopped_p40     856387 non-null  float64
 13  TotalTimeStopped_p50     856387 non-null  float64
 14  Tota

## One-hot encoding

In [49]:
# Work on copies so the frames loaded from disk stay untouched.
train_onehot, test_onehot = train.copy(), test.copy()

# Categorical columns that get one-hot encoded. The street-name columns and
# Path are deliberately left out of this list (they are not encoded).
str_col_onehot = ['City', 'EntryHeading', 'ExitHeading']

In [50]:
# One-hot encode the categorical columns listed in str_col_onehot.
X = pd.get_dummies(train_onehot, columns=str_col_onehot)

# Regression targets: the 20th/50th/80th percentile columns of the two
# metrics being predicted.
Y = train[['TotalTimeStopped_p20', 'TotalTimeStopped_p50', 'TotalTimeStopped_p80', 'DistanceToFirstStop_p20', 'DistanceToFirstStop_p50', 'DistanceToFirstStop_p80']]

# Remove every remaining text (object dtype) column and every column whose
# name contains 'Stop' (the target-like columns) from the feature matrix.
# The columns are collected first and dropped once: dropping inside the loop
# mutated X while walking its columns and would raise KeyError on a second
# drop for a column that matched both conditions.
train_cols_to_drop = [
    col for col in X.columns
    if X[col].dtype == 'object' or 'Stop' in col
]
X = X.drop(columns=train_cols_to_drop)

In [51]:
# Print the columns and dtypes of the feature matrix after encoding and dropping.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856387 entries, 0 to 856386
Data columns (total 26 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   IntersectionId     856387 non-null  int64  
 1   Latitude           856387 non-null  float64
 2   Longitude          856387 non-null  float64
 3   Hour               856387 non-null  int64  
 4   Weekend            856387 non-null  int64  
 5   Month              856387 non-null  int64  
 6   City_Atlanta       856387 non-null  uint8  
 7   City_Boston        856387 non-null  uint8  
 8   City_Chicago       856387 non-null  uint8  
 9   City_Philadelphia  856387 non-null  uint8  
 10  EntryHeading_E     856387 non-null  uint8  
 11  EntryHeading_N     856387 non-null  uint8  
 12  EntryHeading_NE    856387 non-null  uint8  
 13  EntryHeading_NW    856387 non-null  uint8  
 14  EntryHeading_S     856387 non-null  uint8  
 15  EntryHeading_SE    856387 non-null  uint8  
 16  En

In [52]:
# Show the final feature column names, in the order the model will see them.
X.columns

Index(['IntersectionId', 'Latitude', 'Longitude', 'Hour', 'Weekend', 'Month',
       'City_Atlanta', 'City_Boston', 'City_Chicago', 'City_Philadelphia',
       'EntryHeading_E', 'EntryHeading_N', 'EntryHeading_NE',
       'EntryHeading_NW', 'EntryHeading_S', 'EntryHeading_SE',
       'EntryHeading_SW', 'EntryHeading_W', 'ExitHeading_E', 'ExitHeading_N',
       'ExitHeading_NE', 'ExitHeading_NW', 'ExitHeading_S', 'ExitHeading_SE',
       'ExitHeading_SW', 'ExitHeading_W'],
      dtype='object')

In [53]:
# Fit a single linear regression on all target columns of Y at once
# (fit() returns the estimator itself, so the call can be chained).
regr = linear_model.LinearRegression().fit(X, Y)

# Show the fitted parameters: one intercept per target column and one row of
# coefficients per target column.
for label, fitted in (('Intercept: \n', regr.intercept_),
                      ('Coefficients: \n', regr.coef_)):
    print(label, fitted)

Intercept: 
 [  -406.41720098  -1380.79583328  -2439.86467992  -2168.90963314
  -7925.03865787 -20101.09449459]
Coefficients: 
 [[-1.78877374e-04  4.74749448e+00 -2.77653996e+00  3.38969572e-02
  -8.06420541e-01 -2.49937837e-02  1.39605296e+01  1.00061168e+01
  -3.37986333e+01  9.83198694e+00  1.84918444e-01 -6.19367109e-02
  -1.59479912e-01 -1.75509153e-01 -3.80301635e-02  3.42252857e-01
  -1.84109896e-01  9.18945337e-02 -2.78408228e-01  1.90968875e-02
  -1.54089912e-01  9.90325910e-02  1.43125859e-01  4.79351163e-01
  -8.87892203e-02 -2.19319141e-01]
 [-6.51640079e-04  1.17315549e+01 -1.16363836e+01  1.21156314e-01
  -3.50502175e+00 -4.82601958e-02  1.27333897e+01  6.56920095e+01
  -1.23283409e+02  4.48580097e+01  4.73288378e-01 -3.73904760e-01
  -5.57020599e-01 -4.77456963e-01 -1.01068281e-01  1.23637359e+00
  -4.59942353e-01  2.59730985e-01 -6.45112862e-01  9.32121348e-02
  -2.33965464e-01  5.05219176e-01  4.64639388e-01  9.07812072e-01
  -2.19611355e-01 -8.72193090e-01]
 [-1.18175

In [54]:
# One-hot encode the test frame with the same categorical column list that
# was used for the training frame.
predict_X = pd.get_dummies(test_onehot, columns=str_col_onehot)

# Remove remaining text (object dtype) columns and any column whose name
# contains 'Stop', mirroring the training feature preparation. Columns are
# collected first and dropped once instead of mutating the frame while
# looping over its columns.
test_cols_to_drop = [
    col for col in predict_X.columns
    if predict_X[col].dtype == 'object' or 'Stop' in col
]
predict_X = predict_X.drop(columns=test_cols_to_drop)

# Force the same columns in the same order as the training matrix X.
# get_dummies only creates columns for categories present in the data it is
# given, so a category missing from (or extra in) the test set would
# otherwise hand the model a matrix with different features. Dummy columns
# absent from the test set are filled with 0.
predict_X = predict_X.reindex(columns=X.columns, fill_value=0)

In [55]:
# Print the columns and dtypes of the test feature matrix for comparison with X.
predict_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1921357 entries, 0 to 1921356
Data columns (total 26 columns):
 #   Column             Dtype  
---  ------             -----  
 0   IntersectionId     int64  
 1   Latitude           float64
 2   Longitude          float64
 3   Hour               int64  
 4   Weekend            int64  
 5   Month              int64  
 6   City_Atlanta       uint8  
 7   City_Boston        uint8  
 8   City_Chicago       uint8  
 9   City_Philadelphia  uint8  
 10  EntryHeading_E     uint8  
 11  EntryHeading_N     uint8  
 12  EntryHeading_NE    uint8  
 13  EntryHeading_NW    uint8  
 14  EntryHeading_S     uint8  
 15  EntryHeading_SE    uint8  
 16  EntryHeading_SW    uint8  
 17  EntryHeading_W     uint8  
 18  ExitHeading_E      uint8  
 19  ExitHeading_N      uint8  
 20  ExitHeading_NE     uint8  
 21  ExitHeading_NW     uint8  
 22  ExitHeading_S      uint8  
 23  ExitHeading_SE     uint8  
 24  ExitHeading_SW     uint8  
 25  ExitHeading_W     

In [56]:
# Predict every target for every test row: the result has one row per row of
# predict_X and one column per target column the model was fitted on.
predict_Y = regr.predict(predict_X)
# Display the prediction array.
predict_Y

array([[  1.63264433,   8.83705881,  26.80111085,   4.88422133,
         25.91302562,  95.58485098],
       [  2.24145542,  10.07591459,  28.41244029,   7.10230977,
         30.03639902, 106.65391896],
       [  1.67331504,   8.94849116,  27.6654387 ,   5.39308829,
         27.90884488,  99.05839817],
       ...,
       [  0.67028056,   3.26326652,  13.74772737,   3.97198161,
         20.89303493,  66.64644132],
       [  1.03272556,   4.60009899,  15.75991775,   4.74714439,
         23.23641458,  72.05964461],
       [  1.06662252,   4.72125531,  16.05646112,   4.85755643,
         23.63562396,  73.10659386]])

In [57]:
# Total number of predicted values (rows x target columns).
predict_Y.size

11528142

In [58]:
# Print the columns and dtypes of the (un-encoded) test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1921357 entries, 0 to 1921356
Data columns (total 12 columns):
 #   Column           Dtype  
---  ------           -----  
 0   IntersectionId   int64  
 1   Latitude         float64
 2   Longitude        float64
 3   EntryStreetName  object 
 4   ExitStreetName   object 
 5   EntryHeading     object 
 6   ExitHeading      object 
 7   Hour             int64  
 8   Weekend          int64  
 9   Month            int64  
 10  Path             object 
 11  City             object 
dtypes: float64(2), int64(4), object(6)
memory usage: 175.9+ MB


In [59]:
# Flatten the prediction matrix row by row (np.ravel uses row-major order by
# default), so the 6 target values of each test row end up next to each other
# in one long single-column frame.
flat_predictions = np.ravel(predict_Y)
predict_Y = pd.DataFrame(flat_predictions)

# Build an id of the form "<row>_<target>" for every value: integer division
# of the flat position by 6 recovers the test row, the remainder recovers the
# position of the target within that row.
row_part = (predict_Y.index // 6).astype('string')
target_part = (predict_Y.index % 6).astype('string')
predict_Y_index = row_part + '_' + target_part
predict_Y = predict_Y.set_index(predict_Y_index)

In [60]:
# Keep everything from the start up to and including the id '1920334_5'
# (.loc label slices include the end label).
# NOTE(review): the test frame has 1,921,357 rows but only rows 0..1920334 are
# kept here -- presumably the number of rows the submission must contain;
# confirm where the extra rows come from.
submission = predict_Y.loc[:'1920334_5']

In [61]:
# Write the submission file with explicit header names. Without them the file
# starts with the header ",0" (unnamed index, column called 0).
# NOTE(review): "TargetId" / "Target" follow the competition's sample
# submission format -- confirm against sample_submission.csv.
submission.to_csv("submission.csv", index_label="TargetId", header=["Target"])