In [1]:
import pandas as pd

# Read the pickled NYC 311 call records into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df = pd.read_pickle('shared/Project-3_NYC_311_Calls.pkl')

# Pull 'Created Date' out of the columns and promote it to a DatetimeIndex
# in one step; pop() removes the column, so no separate delete is needed.
df.index = pd.DatetimeIndex(df.pop('Created Date'))


In [2]:
# Structural overview: column dtypes, row count and memory footprint.
# info() prints directly, so it does not need a display() wrapper.
df.info()

# A notebook cell only renders its final expression, so intermediate results
# must be passed to display() explicitly; otherwise they are computed and
# silently discarded. display() is provided by the IPython/Jupyter kernel.

# Preview the first few rows.
display(df.head())

# Count missing values per column.
display(df.isnull().sum())

# Summary statistics (final expression of the cell, rendered automatically).
df.describe()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 33780977 entries, 2011-04-06 00:00:00 to 2011-04-06 00:00:00
Data columns (total 11 columns):
 #   Column                  Dtype 
---  ------                  ----- 
 0   Unique Key              int64 
 1   Agency                  object
 2   Agency Name             object
 3   Complaint Type          object
 4   Descriptor              object
 5   Location Type           object
 6   Incident Zip            object
 7   City                    object
 8   Resolution Description  object
 9   Borough                 object
 10  Open Data Channel Type  object
dtypes: int64(1), object(10)
memory usage: 3.0+ GB


Unnamed: 0,Unique Key
count,33780980.0
mean,38026650.0
std,12095740.0
min,10564220.0
25%,27931210.0
50%,37983030.0
75%,48309510.0
max,58406820.0


In [3]:
# The DatetimeIndex holds each call's creation time, so its extremes give
# the period covered by the data set.
earliest_date, latest_date = df.index.min(), df.index.max()

# Report both ends of the range, one per line.
print(f'Earliest Date: {earliest_date}', f'Latest Date: {latest_date}', sep='\n')


Earliest Date: 2010-01-01 00:00:00
Latest Date: 2023-08-04 12:00:00


In [4]:
# Calls per calendar day. count() tallies non-null 'Unique Key' values, so it
# equals the number of calls only if the key is never missing.
daily_complaints = df['Unique Key'].resample('D').count()

# Partial-string indexing on the DatetimeIndex keeps every day of 2022.
complaints_2022 = daily_complaints.loc['2022']

# Mean of the daily counts over that year.
average_daily_complaints_2022 = complaints_2022.mean()

print(f'The average number of daily complaints received in 2022 is: {average_daily_complaints_2022}')


The average number of daily complaints received in 2022 is: 8684.320547945206


In [5]:
# Calendar day with the highest number of calls (daily bins of non-null keys).
max_calls_date = df['Unique Key'].resample('D').count().idxmax()

# Select that day's rows with a vectorized comparison: normalize() floors each
# timestamp to midnight, so equal values mean "same calendar day". Comparing
# df.index.date instead would first build one Python date object per row,
# which is very slow and memory-hungry on a frame this size.
on_max_day = df.index.normalize() == max_calls_date.normalize()

# Most frequent complaint type among the calls made on that day.
most_common_complaint_type = df.loc[on_max_day, 'Complaint Type'].value_counts().idxmax()

print(f"On the date with the maximum number of calls ({max_calls_date.date()}), the most important complaint type was: {most_common_complaint_type}")


On the date with the maximum number of calls (2020-08-04), the most important complaint type was: Damaged Tree


In [6]:
# Total calls in each calendar month of each year (month-end bins).
monthly_calls = df['Unique Key'].resample('M').count()

# Average those year-month totals by month of the year (1 = January ... 12 = December).
# NOTE(review): a month only partly covered at either end of the data is
# averaged like a full month — confirm that this is intended.
month_of_year = monthly_calls.index.month
monthly_average_calls = monthly_calls.groupby(month_of_year).mean()

# Month number with the lowest average call volume.
quietest_month = monthly_average_calls.idxmin()

print(f"The quietest month historically is: {quietest_month}")

The quietest month historically is: 2


In [7]:
import statsmodels.api as sm

# Daily call counts; this series is reused by the cells that follow.
daily_calls = df['Unique Key'].resample('D').count()

# Additive decomposition into trend, seasonal and residual parts. No period is
# passed, so statsmodels has to derive it from the frequency of the index.
result = sm.tsa.seasonal_decompose(daily_calls, model='additive')

# Look up the seasonal term for the single day of interest.
seasonal_component_on_date = result.seasonal.loc['2020-12-25']

print(f"The value of the seasonal component on 2020-12-25 is: {round(seasonal_component_on_date)}")


The value of the seasonal component on 2020-12-25 is: 183


In [8]:
# Lag-1 autocorrelation, spelled out: the Pearson correlation between each
# day's call count and the count of the day before it.
previous_day_calls = daily_calls.shift(1)
autocorrelation_lag_1 = daily_calls.corr(previous_day_calls)

print(f"The autocorrelation with a lag of 1 is: {autocorrelation_lag_1:.2f}")


The autocorrelation with a lag of 1 is: 0.75


In [9]:
from prophet import Prophet
from sklearn.metrics import mean_squared_error
import numpy as np

# Number of trailing days held out for evaluation; also the forecast horizon.
# Must be a positive integer (a value of 0 would leave the training set empty).
TEST_DAYS = 90

# Prophet expects a frame with the timestamps in 'ds' and the target in 'y'.
df_prophet = daily_calls.reset_index()
df_prophet.columns = ['ds', 'y']

# Chronological split: the last TEST_DAYS days are the test set, everything
# before them is used for training.
train = df_prophet.iloc[:-TEST_DAYS]
test = df_prophet.iloc[-TEST_DAYS:]

# Create and fit the Prophet model with its default settings.
model = Prophet()
model.fit(train)

# Training dates extended by TEST_DAYS further days, then forecast over all of them.
future = model.make_future_dataframe(periods=TEST_DAYS)
forecast = model.predict(future)

# Match predictions to the held-out days by date rather than by position, so a
# misaligned forecast raises a KeyError instead of silently distorting the RMSE.
y_pred = forecast.set_index('ds').loc[test['ds'].to_numpy(), 'yhat'].to_numpy()

# Observed call counts for the same days.
y_true = test['y'].to_numpy()

# Root mean squared error between the observed and predicted daily counts.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

print(f"The RMSE on the test set is approximately: {rmse:.0f}")


21:44:46 - cmdstanpy - INFO - Chain [1] start processing
21:44:47 - cmdstanpy - INFO - Chain [1] done processing


The RMSE on the test set is approximately: 1232
