In [1]:
import pickle

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import linear_model

# Field delimiter passed to pd.read_csv when loading the traffic CSV.
SEP = ';'

In [2]:
def manipulate_data(df):
    """Derive calendar features from the ``fecha`` timestamp column.

    Adds five columns computed from ``df['fecha']``:

    * ``weekday``     - day of the week, Monday=0 ... Sunday=6
    * ``day``         - day of the month
    * ``month``       - month number
    * ``year``        - four-digit year
    * ``hour-minute`` - time of day formatted as ``'HH:MM'``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``fecha`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns appended; the frame passed in is
        left untouched, so calling this twice on the same input is safe.

    Raises
    ------
    KeyError
        If ``df`` has no ``fecha`` column.
    AttributeError
        If ``fecha`` is not datetime-like (the ``.dt`` accessor is unavailable).

    Notes
    -----
    Missing timestamps (NaT) produce NaN in the derived columns.
    """
    # The vectorized .dt accessor replaces one Python-level lambda call per row.
    fecha = df['fecha'].dt
    # 'hour-minute' is not a valid keyword name, hence the dict unpacking.
    return df.assign(**{
        'weekday': fecha.weekday,
        'day': fecha.day,
        'month': fecha.month,
        'year': fecha.year,
        'hour-minute': fecha.strftime('%H:%M'),
    })

In [3]:
def one_hot_column(df, column):
    """Replace ``column`` in ``df`` with its one-hot (dummy) encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with one indicator column per
        distinct value appended, named ``'<column>_<value>'``. The input
        frame is not modified.
    """
    # Build the indicator columns first, while the source column still exists.
    dummies = pd.get_dummies(df[column], prefix=column)
    # Remove the encoded column and attach the indicators, aligned on the index.
    return df.drop(columns=column).join(dummies)

In [4]:
def one_hot(df):
    """One-hot encode every calendar feature column of ``df``.

    The columns ``day``, ``month``, ``year``, ``hour-minute`` and ``weekday``
    are encoded one after another with ``one_hot_column``; each is removed and
    replaced by its indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all five calendar columns.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.
    """
    # The order here fixes the order of the indicator columns in the result.
    for column in ('day', 'month', 'year', 'hour-minute', 'weekday'):
        df = one_hot_column(df, column)
    return df

In [5]:
# Load the raw traffic records, derive the calendar features, then discard the
# raw timestamp. parse_dates=[0] parses the first CSV column as datetimes
# (presumably 'fecha', which manipulate_data reads -- verify against the file).
historical_data = (
    pd.read_csv('../data/traffic_data.csv', sep=SEP, parse_dates=[0])
    .pipe(manipulate_data)
    .drop(columns='fecha')
)

In [6]:
# One-hot encode the calendar features. This cell is not idempotent: one_hot
# drops the columns it encodes, so re-running it on the already-encoded frame
# raises KeyError -- re-run the loading cell first.
historical_data = historical_data.pipe(one_hot)

In [7]:
# Train one linear regression per district, persist each fitted model to disk
# and report the held-out R^2 score per district plus the average.
model_by_district = {}
total_score = 0
for distrito, grouped in historical_data.groupby('distrito'):
    # Target is 'carga'; every remaining column is used as a feature.
    # Split without mutating the groupby slice (avoids pop() on a group).
    y = grouped['carga']
    X = grouped.drop(columns='carga')
    # Fixed random_state keeps the 75/25 split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    model = linear_model.LinearRegression()
    model.fit(X_train, y_train)
    # LinearRegression.score returns the coefficient of determination (R^2).
    score_test = model.score(X_test, y_test)

    model_by_district[distrito] = model
    filename = 'lr_model_{}.sav'.format(distrito)
    # Context manager guarantees the file is flushed and closed even if
    # pickling fails; the original left the handle open.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    total_score += score_test

    print('Distrito {}, score = {}'.format(distrito, score_test))

# Average over the districts actually trained rather than a hard-coded count;
# NaN signals "no districts" instead of a misleading 0.0.
n_districts = len(model_by_district)
average_score = total_score / n_districts if n_districts else float('nan')
print('Score promedio = {}'.format(average_score))

Distrito 1, score = 0.6747545365434737
Distrito 2, score = 0.7673935406342304
Distrito 3, score = 0.7681768574547326
Distrito 4, score = 0.7437311708238212
Distrito 5, score = 0.7423412145581689
Distrito 6, score = 0.7421780173026371
Distrito 7, score = 0.7143135561290279
Distrito 8, score = 0.7591077410935851
Distrito 9, score = 0.756398186914588
Distrito 10, score = 0.7494589638057106
Distrito 11, score = 0.7652698837869083
Distrito 12, score = 0.7653335417719397
Distrito 13, score = 0.8073217847403038
Distrito 14, score = 0.772659085626749
Distrito 15, score = 0.7751606872030194
Distrito 16, score = 0.7554505544847308
Distrito 17, score = 0.764445362859506
Distrito 18, score = 0.8168045093396136
Distrito 19, score = 0.7845237831501087
Distrito 20, score = 0.7711449146050519
Distrito 21, score = 0.7779369990334675
Score promedio = 0.7606621377076845
