In [188]:
import pandas as pd
import numpy as np
from sodapy import Socrata

# Exploratory Data Analysis
from pandas_profiling import ProfileReport

# Visualization
# import seaborn as sns
# import matplotlib.pyplot as plt


## Notebook Options

In [408]:
np.set_printoptions(suppress=True) # Suppress scientific notation where possible

def start():
    options = {
        'display': {
            'max_columns': None,
            'max_colwidth': -1,
            'expand_frame_repr': False,  # Don't wrap to multiple pages
            'max_rows': 10,
            'max_seq_items': 25,         # Max length of printed sequence
            'precision': 5,
            'show_dimensions': False
        },
        'mode': {
            'chained_assignment': None   # Controls SettingWithCopyWarning
        }
    }

    for category, option in options.items():
        for op, value in option.items():
            pd.set_option(f'{category}.{op}', value)  # Python 3.6+

if __name__ == '__main__':
    start()

del start  # Clean up namespace in the interpreter

In [53]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata('data.sfgov.org', None)

# Authenticate the client
domain = 'data.sfgov.org'
app_token = 'tZuHs7ko1xQezRBAgUN10D0i9'
username = 'corraljrmiguel@gmail.com'
password = 'Xd9@rzwPhZjKsNg8cima'

client = Socrata(domain,
                 app_token,
                 username=username,
                 password=password)

# Returns as JSON from API
# converted to Python list of dictionaries by sodapy.
results = client.get('vw6y-z8j6', limit=10000)
# results = client.get('vw6y-z8j6', where='service_request_id=12167455')

# Convert to pandas dataframe
df = pd.DataFrame.from_records(results)



In [60]:
df.shape

(10000, 47)

In [430]:
# # Pickle dataframe
# df.to_pickle('data/df_raw_10k.pkl')

# Load dataframe
df = pd.read_pickle('data/df_raw_10k.pkl')

In [431]:
# Drop all 'computed_region' columns
df = df.loc[:, ~df.columns.str.contains('^:@')]

# Drop 'point' column
df.drop('point', axis=1, inplace=True)

df.shape

(10000, 19)

In [432]:
def reorder_time_column(df):
    """Reorder 'closed_date' column"""
    cols = df.columns.to_list()
    cols = cols[:3] + cols[-1:] + cols[3:-1]
    df = df[cols]
    return df

df = reorder_time_column(df)

In [433]:
# Create is_duplicate column
def create_is_duplicate(df):
    '''Create 'is_duplicate' column'''
    
    # Search for 'duplicate' in status_notes
    df['is_duplicate'] = df['status_notes'].str.lower().str.contains('duplicate')
    
    # Fill 'nan' values with False
    df['is_duplicate'].fillna(False, inplace=True)
    
    return df

create_is_duplicate(df)
df.head(5)

Unnamed: 0,service_request_id,requested_datetime,updated_datetime,closed_date,status_description,status_notes,agency_responsible,service_name,service_subtype,service_details,address,street,supervisor_district,neighborhoods_sffind_boundaries,police_district,lat,long,source,media_url,is_duplicate
0,12190946,2020-03-09T01:11:00.000,2020-03-09T01:13:08.000,,Open,,Noise Report Queue,Noise Report,other_excessive_noise,Noise Report - other_excessive_noise,"1005 MARKET ST, SAN FRANCISCO, CA, 94103",MARKET ST,6,South of Market,TENDERLOIN,37.781860351563,-122.410179138184,Phone,,False
1,12190944,2020-03-09T01:10:00.000,2020-03-09T03:34:13.000,,Open,closed,Parking Enforcement Dispatch Queue,Parking Enforcement,Other_Illegal_Parking,Unknown - Mercedes Benz - Unknown,"37 PHELAN AVE, SAN FRANCISCO, CA, 94112 (Virtual)",PHELAN AVE,7,Sunnyside,INGLESIDE,37.724251469399,-122.452882728836,Web,,False
2,12190940,2020-03-09T01:08:03.000,2020-03-09T01:08:04.000,,Open,open,Entertainment Commission - G,Noise Report,entertainment,Noise Report - entertainment,"752 LARKIN ST, SAN FRANCISCO, CA, 94109",LARKIN ST,6,Tenderloin,TENDERLOIN,37.78490415,-122.41786379,Mobile/Open311,,False
3,12190935,2020-03-09T01:02:48.000,2020-03-09T01:02:51.000,,Open,,DPT Abandoned Vehicles Work Queue,Abandoned Vehicle,Abandoned Vehicles,DPT Abandoned Vehicles Low,"790 43RD AVE, SAN FRANCISCO, CA, 94121",43RD AVE,1,Outer Richmond,RICHMOND,37.7737818,-122.5034043,Web,{'url': 'http://mobile311.sfgov.org/reports/12190935/photos'},False
4,12190918,2020-03-09T00:41:23.000,2020-03-09T02:10:01.000,,Open,accepted,DPW Ops Queue,Litter Receptacles,Add_remove_garbage_can,,Intersection of WAGNER ALY and END (000 BLOCK OF),WAGNER ALY,6,Tenderloin,TENDERLOIN,37.783569,-122.413326,Web,,False


In [434]:
profile = ProfileReport(df, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile.to_notebook_iframe()

(using `df.profile_report(correlations={"cramers": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/pandas-profiling/pandas-profiling/issues
(include the error message: 'The internally computed table of expected frequencies has a zero element at (0, 4).')
  correlation_name=correlation_name, error=error


- [x] Find all duplicates
- [x] Create column for duplicates
- [ ] Apply dates to datetime columns
- [ ] Create base model
- [ ] Feature selection
- [ ] Feature engineering
    * Cosine similarity of service_subtype
    * Time of day
    * Date
    * Block type / building permit
    

In [446]:
for col in df.columns:
    print(col)

service_request_id
requested_datetime
updated_datetime
closed_date
status_description
status_notes
agency_responsible
service_name
service_subtype
service_details
address
street
supervisor_district
neighborhoods_sffind_boundaries
police_district
lat
long
source
media_url
is_duplicate


In [317]:
# Convert timestamps (strings) to date format
df.iloc[:, 1:4].to_timedelta()

AttributeError: 'DataFrame' object has no attribute 'to_timedelta'