In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# 1. Data Preparation
# Read the raw requests, keep only rows whose outcome is 'pothole', and discard
# the columns that are not used anywhere below.
columns_to_drop = ['Unnamed: 0', 'request_number', 'address', 'ssa', 'avg_traffic', 'days_open']
df = (
    pd.read_csv('training.csv')
    .loc[lambda frame: frame['outcome'] == 'pothole']
    .drop(columns=columns_to_drop)
)

# Calendar features derived from creation_date. The month is mapped onto the
# unit circle (sin/cos of (month - 1) * 2*pi/12) so that December and January
# end up adjacent. Later lambdas read the columns assigned just above them.
df = df.assign(
    creation_date=lambda frame: pd.to_datetime(frame['creation_date']),
    month=lambda frame: frame['creation_date'].dt.month,
    month_sin=lambda frame: np.sin((frame['month']-1) * (2.*np.pi/12)),
    month_cos=lambda frame: np.cos((frame['month']-1) * (2.*np.pi/12)),
    year=lambda frame: frame['creation_date'].dt.year,
)

# Human-readable key, e.g. "<year>-<zero-padded month> at community <area>".
df['Year-Month-Community'] = (
    df['year'].astype(str)
    + '-'
    + df['month'].astype(str).str.zfill(2)
    + ' at community '
    + df['community_area_number'].astype(str)
)

# Regression target: n_potholes_on_block summed per (year, month, community).
group_keys = ['year', 'month', 'community_area_number']
agg_df = (
    df.groupby(group_keys)['n_potholes_on_block']
    .sum()
    .reset_index(name='Total_n_potholes')
)

# Attach the monthly total to every request, then keep only the first request
# of each (year, month, community) group so there is one row per group.
df = df.drop(columns='n_potholes_on_block')
result = df.merge(agg_df, on=group_keys, how='left').drop_duplicates(subset=group_keys)

# One-hot encode the community area; the dummy columns are appended last.
result = pd.get_dummies(result, columns=['community_area_number'])

# Build the 2017-18 prediction grid: one row for every
# (year, month, community area) combination, with the year varying slowest
# and the community area fastest.
years = [2017, 2018]
community_area_numbers = list(range(78))
prediction_grid = pd.MultiIndex.from_product(
    [years, range(1, 13), community_area_numbers],
    names=['Year', 'Month', 'community_area_number'],
).to_frame(index=False)

# Same cyclic month encoding as the training features.
prediction_grid['month_sin'] = np.sin((prediction_grid['Month']-1) * (2.*np.pi/12))
prediction_grid['month_cos'] = np.cos((prediction_grid['Month']-1) * (2.*np.pi/12))

# One-hot encode the community area, then drop the raw calendar columns so only
# month_sin, month_cos and the community dummies remain as model inputs.
test_set = pd.get_dummies(prediction_grid, columns=['community_area_number'])
test_set = test_set.drop(columns=['Year', 'Month'])

# Training window based on creation_date: 2011-01-01 up to, but not including,
# 2017-01-01. The upper bound is half-open on purpose: a string such as
# '2016-12-31' is compared as midnight of that day, so `<= '2016-12-31'` would
# silently drop any row stamped later on 31 Dec 2016.
in_training_window = (
    (result['creation_date'] >= '2011-01-01')
    & (result['creation_date'] < '2017-01-01')
)
train_data = result[in_training_window]
# NOTE: no hold-out set is carved out here. To evaluate the model, slice a
# second half-open window from `result` in the same way and drop the same
# non-feature columns as below.

# Features: everything except the target, the identifier/label columns and the
# raw date parts (the month is already represented by month_sin / month_cos).
non_feature_columns = [
    'Total_n_potholes', "Year-Month-Community", "outcome", "month",
    "creation_date", "completion_date", 'year',
]
X_train = train_data.drop(columns=non_feature_columns)

# Target: log1p of the monthly total; predictions are mapped back with expm1.
y_train_log = np.log1p(train_data['Total_n_potholes'])

# 2. Train a LightGBM model
# Gradient-boosted regression trees fitted on the log1p-transformed target.
lgb_dataset = lgb.Dataset(X_train, label=y_train_log)
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

num_round = 100
model = lgb.train(params, lgb_dataset, num_round)

# 3. Predict for the 2017-18 grid
# X_train and test_set are built by separate code paths, and the model consumes
# features by position. When both frames carry exactly the same column names,
# reorder the test features to the training order so a mere ordering difference
# cannot silently feed a value into the wrong feature. If the names differ, the
# frame is passed through unchanged (the original positional behaviour).
# NOTE(review): Booster.predict is assumed not to validate feature names by
# default - confirm for the installed LightGBM version.
if set(test_set.columns) == set(X_train.columns):
    test_features = test_set[list(X_train.columns)]
else:
    test_features = test_set

# NOTE(review): no early stopping is configured above, so best_iteration is
# presumably unset and LightGBM is expected to use every tree - confirm.
y_pred_log = model.predict(test_features, num_iteration=model.best_iteration)

# Undo the log1p transform to get predictions on the original count scale.
y_pred = np.expm1(y_pred_log)


# 4. Label the predictions
# (No RMSE is computed: there is no hold-out target for 2017-18.)
# Labels are generated year -> month -> community, the same nesting order used
# to build the prediction grid, so label i lines up with y_pred[i].
years = [2017, 2018]
community_area_numbers = list(range(78))
all_combinations = [
    f"{year}-{month:02} in community {community}"
    for year in years
    for month in range(1, 13)
    for community in community_area_numbers
]

# 5. Save results to CSV
final = pd.DataFrame({
    'Year-Month-Community': all_combinations,
    'Total_n_potholes': y_pred,
})
final.to_csv('predicted_potholes.csv', index=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000214 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 180
[LightGBM] [Info] Number of data points in the train set: 5589, number of used features: 80
[LightGBM] [Info] Start training from score 5.807824


In [3]:
# Inspect the back-transformed predictions (numpy abbreviates the long array).
y_pred

array([ 97.02664215, 498.93048095, 903.71423321, ..., 257.06997868,
        96.82718579, 175.22572548])