In [2]:
import pandas as pd

# Read the pickled 311 call dataset and index it by call-creation timestamp.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
df = pd.read_pickle('shared/Project-3_NYC_311_Calls.pkl')
# Pop the timestamp column out of the frame so it ends up only in the index.
created_index = pd.DatetimeIndex(df.pop('Created Date'))
df = df.set_index(created_index)


In [3]:
# Data exploration: preview the leading rows of the frame as plain text.
first_rows = df.head()
print(first_rows)


                     Unique Key Agency  \
Created Date                             
2011-04-06 00:00:00    20184537    HPD   
2011-04-06 00:00:00    20184538    HPD   
2011-04-06 00:00:00    20184539    HPD   
2022-07-08 11:14:43    54732265   DSNY   
2011-04-06 00:00:00    20184540    HPD   

                                                           Agency Name  \
Created Date                                                             
2011-04-06 00:00:00  Department of Housing Preservation and Develop...   
2011-04-06 00:00:00  Department of Housing Preservation and Develop...   
2011-04-06 00:00:00  Department of Housing Preservation and Develop...   
2022-07-08 11:14:43                           Department of Sanitation   
2011-04-06 00:00:00  Department of Housing Preservation and Develop...   

                           Complaint Type Descriptor         Location Type  \
Created Date                                                                 
2011-04-06 00:00:00           

In [7]:
# Summary statistics for every column (include='all' covers non-numeric ones too).
summary_stats = df.describe(include='all')
print(summary_stats)

          Unique Key    Agency                      Agency Name  \
count   3.378098e+07  33780977                         33780977   
unique           NaN        36                             1888   
top              NaN      NYPD  New York City Police Department   
freq             NaN  10038478                         10036657   
mean    3.802665e+07       NaN                              NaN   
std     1.209574e+07       NaN                              NaN   
min     1.056422e+07       NaN                              NaN   
25%     2.793121e+07       NaN                              NaN   
50%     3.798303e+07       NaN                              NaN   
75%     4.830951e+07       NaN                              NaN   
max     5.840682e+07       NaN                              NaN   

             Complaint Type        Descriptor         Location Type  \
count              33780977          33194300              26640403   
unique                  485              1933        

In [8]:
# Show the dtype pandas holds for each column.
column_dtypes = df.dtypes
print(column_dtypes)

Unique Key                 int64
Agency                    object
Agency Name               object
Complaint Type            object
Descriptor                object
Location Type             object
Incident Zip              object
City                      object
Resolution Description    object
Borough                   object
Open Data Channel Type    object
dtype: object


In [9]:
# Tally how many rows carry each complaint type.
# NOTE: the tail of the printed tally contains one-off junk values
# (path / injection-probe style strings), so the raw category list is not clean.
complaint_type_column = df['Complaint Type']
complaint_types = complaint_type_column.value_counts()
print(complaint_types)


Complaint Type
Noise - Residential               3131834
Illegal Parking                   2110646
HEAT/HOT WATER                    1983520
Blocked Driveway                  1439795
Street Condition                  1212154
                                   ...   
c:\windows\win.ini                      1
idexf3mrb7)(!(objectClass=*)            1
%E5%98%8A%E5%98%8DX-Injecti...          1
() { :;}; /bin/sleep 11                 1
Misc. Comments{${sleep(20)}}            1
Name: count, Length: 485, dtype: int64


In [10]:
# Determine the time span covered by the timestamp index.
earliest_date, latest_date = df.index.min(), df.index.max()
print(f"Earliest Date: {earliest_date}, Latest Date: {latest_date}")


Earliest Date: 2010-01-01 00:00:00, Latest Date: 2023-08-04 12:00:00


In [11]:
# Q1: average number of complaints per day during 2022.
is_2022 = df.index.year == 2022
df_2022 = df[is_2022]
# Count rows per calendar day, then average those daily counts.
daily_complaints_2022 = df_2022['Unique Key'].resample('D').count()
average_daily_complaints_2022 = daily_complaints_2022.mean()
print(f"Average Daily Complaints in 2022: {average_daily_complaints_2022}")

Average Daily Complaints in 2022: 8684.320547945206


In [12]:
# Q2: the calendar day with the most calls over the whole dataset.
unique_keys = df['Unique Key']
daily_complaints = unique_keys.resample('D').count()
# idxmax gives the day label; look the count up again by that label.
max_complaints_date = daily_complaints.idxmax()
max_complaints_count = daily_complaints.loc[max_complaints_date]
print(f"Date with Maximum Calls: {max_complaints_date}")
print(f"Number of Calls: {max_complaints_count}")

Date with Maximum Calls: 2020-08-04 00:00:00
Number of Calls: 24415


In [17]:
# Sanity check: show the first five daily counts produced by the Q2 resample.
daily_complaints.head(n=5)

Created Date
2010-01-01    2942
2010-01-02    3958
2010-01-03    5676
2010-01-04    9763
2010-01-05    8735
Freq: D, Name: Unique Key, dtype: int64

In [16]:
# Q3: most common complaint type on the busiest day found in Q2.
# Build the row mask from midnight-normalized timestamps rather than
# `df.index.date`: `.date` materializes one Python `date` object per row
# (tens of millions here), while `normalize()` stays vectorized and selects
# exactly the same rows.
on_max_complaints_date = df.index.normalize() == max_complaints_date.normalize()
data_on_max_complaints_date = df[on_max_complaints_date]
# value_counts() tallies each complaint type; idxmax() picks the most frequent label.
most_common_complaint = data_on_max_complaints_date['Complaint Type'].value_counts().idxmax()
print(f"Most Common Complaint on {max_complaints_date}: {most_common_complaint}")

Most Common Complaint on 2020-08-04 00:00:00: Damaged Tree


In [28]:
# Q4: calendar month (month number, pooled across all years) with the fewest calls.
# NOTE(review): the data runs 2010-01-01 through 2023-08-04, so months after
# August are covered by one fewer year than January-July; pooled totals are
# therefore not strictly like-for-like — confirm this is the intended metric.
month_of_year = df.index.month
monthly_complaints_sum = df['Unique Key'].groupby(month_of_year).count()
quietest_month = monthly_complaints_sum.idxmin()
quietest_month_count = monthly_complaints_sum.loc[quietest_month]
print(f"Quietest Month: {quietest_month}")
print(f"Total Number of Calls: {quietest_month_count}")

Quietest Month: 12
Total Number of Calls: 2596986


In [21]:
# Q5: seasonal component of the daily call series on 2020-12-25.
import statsmodels.api as sm

unique_keys = df['Unique Key']
daily_series = unique_keys.resample('D').count()
# Additive decomposition. No explicit period is passed, so statsmodels has to
# derive it from the series' index frequency — presumably weekly for daily
# data; verify against the statsmodels docs.
decomposition = sm.tsa.seasonal_decompose(daily_series, model='additive')
seasonal_component = decomposition.seasonal
seasonal_value_on_20201225 = seasonal_component.loc['2020-12-25']
# Report the value rounded to the nearest integer.
rounded_seasonal_value = round(seasonal_value_on_20201225)
print(f"Seasonal Component on 2020-12-25: {rounded_seasonal_value}")

Seasonal Component on 2020-12-25: 183


In [22]:
# Q6: lag-1 autocorrelation of the daily call counts
# (how strongly each day's count tracks the previous day's).
unique_keys = df['Unique Key']
daily_series = unique_keys.resample('D').count()
autocorrelation_lag_1 = daily_series.autocorr(lag=1)
print(f"Autocorrelation with a lag of 1: {autocorrelation_lag_1}")

Autocorrelation with a lag of 1: 0.7517059728398577


In [24]:
# Q7: hold out the final 90 days, fit Prophet on the rest, and score the
# forecast on the hold-out window by RMSE.
import numpy as np
from prophet import Prophet
from sklearn.metrics import mean_squared_error

TEST_DAYS = 90  # size of the hold-out window, in days

# Reshape the daily series into the two-column 'ds' / 'y' frame fed to Prophet.
prophet_df = daily_series.reset_index()
prophet_df.columns = ['ds', 'y']
# NOTE(review): the dataset's latest timestamp is 2023-08-04 12:00, so the last
# test day is only partially observed — confirm whether it should be scored.
train_df = prophet_df.iloc[:-TEST_DAYS]
test_df = prophet_df.iloc[-TEST_DAYS:]

model = Prophet()
model.fit(train_df)
# Extend the training dates by the hold-out length and predict over all of them.
future = model.make_future_dataframe(periods=TEST_DAYS)
forecast = model.predict(future)

# The trailing TEST_DAYS predictions line up with the held-out rows.
y_pred = forecast['yhat'].iloc[-TEST_DAYS:]
rmse = np.sqrt(mean_squared_error(test_df['y'], y_pred))
print(f"RMSE on Test Set: {rmse}")


01:45:26 - cmdstanpy - INFO - Chain [1] start processing
01:45:27 - cmdstanpy - INFO - Chain [1] done processing


RMSE on Test Set: 1231.513760758433
