In [1]:
import pandas as pd
import numpy as np
import requests
import json
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from IPython.display import display

In [2]:
# Base address of the Firebase Realtime Database; the helper functions below
# append a node path to it when building request URLs.
URL = "https://dsci551-final-project-fd95f-default-rtdb.firebaseio.com/"
# Suffix appended after the node path in every request URL built below.
json_suffix = '.json'

## Function Definition

In [3]:
# GET MONTH DATA
def get_json(category, year, month):
    """Fetch the records stored for one month of ``category``.

    Args:
        category: Top-level node name (the notebook uses 'crime' and 'covid').
        year: Year component of the path; converted with ``str``.
        month: Month number as str or int; leading zeros are stripped
            ('01' -> '1') before it is put into the path.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: ``month`` is not an integer, or the body is not valid JSON.
    """
    getURL = URL + category + '/' + str(year) + '/' + str(int(month)) + json_suffix
    # Without a timeout a stalled connection would block the kernel indefinitely.
    response = requests.get(getURL, timeout=30)
    # Surface HTTP errors here instead of handing an error payload to the caller.
    response.raise_for_status()
    return json.loads(response.text)

In [4]:
# GET YEAR DATA
def get_year_json(category, year):
    """Fetch everything stored under one year of ``category``.

    Args:
        category: Top-level node name (the notebook uses 'crime' and 'covid').
        year: Year component of the path; converted with ``str``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: the body is not valid JSON.
    """
    getURL = URL + category + '/' + str(year) + '/' + json_suffix
    # Without a timeout a stalled connection would block the kernel indefinitely.
    response = requests.get(getURL, timeout=30)
    # Surface HTTP errors here instead of handing an error payload to the caller.
    response.raise_for_status()
    return json.loads(response.text)

In [5]:
def transform_to_json(df):
    """Serialize ``df`` to a JSON string with one object per row."""
    # orient='records' produces a JSON array of {column: value} objects.
    return df.to_json(orient = 'records')

In [6]:
def upload_data(url, index, year, month, date, data):
    """PUT ``data`` at ``<url><index>/<year>/<month>/<date>.json``.

    The upload is best-effort: any failure is reported on stdout and the
    function returns normally.

    Args:
        url: Base URL of the database (expected to end with '/').
        index: Top-level node name.
        year, month, date: Path components; each is converted with ``str``.
        data: Request body passed to ``requests.put``.

    Returns:
        None.
    """
    try:
        database_URL = (url + index + '/' + str(year) + '/' + str(month) + '/'
                        + str(date) + json_suffix)
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.put(database_URL, data, timeout=30)
        # A non-2xx answer means the write was rejected; report it as a failure.
        response.raise_for_status()
    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts.
        print("Upload Failed: " + str(exc))

In [7]:
# Load Firebase data To DataFrame
def load_from_firebase(dataset):
    """Flatten a ``{year: months}`` export into a single DataFrame.

    Each month node maps a day key to that day's records; one DataFrame is
    built per day and all of them are concatenated row-wise.

    Args:
        dataset: Mapping of year -> month container. The container may be a
            list (entries that are ``None`` are skipped) or a dict keyed by
            month.

    Returns:
        A DataFrame with a fresh 0..n-1 index, or an empty DataFrame when
        ``dataset`` holds no day records.
    """
    all_df = []
    for year in dataset:
        months = dataset[year]
        if months is None:
            continue
        # Iterating a dict yields its keys, not the month nodes, so take the
        # values explicitly; a list already yields the nodes themselves.
        month_nodes = months.values() if isinstance(months, dict) else months
        for month in month_nodes:
            if month is None:
                continue
            for day in month:
                # create dataframe for every day
                all_df.append(pd.DataFrame.from_dict(month[day]))

    # pd.concat raises on an empty list; return an empty frame instead.
    if not all_df:
        return pd.DataFrame()
    return pd.concat(all_df).reset_index(drop=True)

In [8]:
def create_dataframe(query_data, is_month):
    """Stack the per-day record sets of ``query_data`` into one DataFrame.

    ``query_data`` maps a day key to that day's records. One DataFrame is
    built per day, and the frames are concatenated row-wise with a fresh
    0..n-1 index. When ``is_month`` is true, a one-line summary (total rows
    and rows without any missing value) is printed.
    """
    # One frame per day, in the iteration order of query_data.
    daily_frames = [pd.DataFrame.from_dict(query_data[day_key]) for day_key in query_data]

    # Row-wise concatenation; the per-day indices are discarded.
    stacked = pd.concat(daily_frames).reset_index(drop=True)

    if is_month:
        # Both counts are taken per day, before concatenation.
        case_total = sum(len(frame) for frame in daily_frames)
        rows_with_na = sum(frame.isnull().any(axis=1).sum() for frame in daily_frames)
        print('There are ' + str(case_total)+ ' cases in total this month'+ ', and ' + 
              str((case_total-rows_with_na)) + ' among the data are without NA.')

    return stacked

### Crime Plot Functions

In [9]:
def plot_by_time(df):
    """Bar-plot the number of rows of ``df`` per hour of the day.

    Side effect: stores ``crime_time // 100`` in a ``crime_time_byhour``
    column of ``df``.
    """
    # convert crime_time into "hour"(0-24)
    hour_frame = (df['crime_time'] // 100).to_frame()
    df['crime_time_byhour'] = hour_frame

    # Count once and reuse for both axes.
    per_hour = df['crime_time_byhour'].value_counts()

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=per_hour.index, y=per_hour)
    ax.set_title("Total Crimes Reported by Time")
    ax.set_xlabel("Time(hour) of the day")
    ax.set_ylabel("Total Crimes Reported")
    sns.despine()

In [10]:
def plot_by_area(df):
    """Bar-plot the number of rows of ``df`` per ``area_name``, labelling each bar with its count."""
    # Count once and reuse for both axes.
    area_counts = df["area_name"].value_counts()

    fig, ax = plt.subplots(figsize=(15, 5))
    sns.barplot(x = area_counts.index, y = area_counts, ax=ax)

    ax.set_title("Crimes by Area")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=-45)
    ax.set_xlabel("Area Name")
    ax.set_ylabel("Total Crimes Occurring")

    # Write each bar's height just above its top edge, centred horizontally.
    for bar in ax.patches:
        bar_height = bar.get_height()
        label_x = bar.get_x() + bar.get_width()/2.
        ax.text(label_x, bar_height, '%d' % round(int(bar_height)),
                fontsize=9, color='black', ha='center', va='bottom')
    sns.despine()

In [11]:
def plot_by_age(df):
    """Bar-plot the distribution of ``vict_age`` in 5-year bins.

    Rows whose age is 0 are left out of the plot. ``df`` itself is not
    modified, so the other plots drawn from the same frame still see every
    row.

    Args:
        df: DataFrame with a ``vict_age`` column convertible by
            ``pd.to_numeric``.
    """
    # Convert first so the zero test also works on numeric strings.
    ages = pd.to_numeric(df['vict_age'])
    # A boolean mask filters by value; dropping by index label would also
    # remove unrelated rows that share a label.
    ages = ages[ages != 0]

    # Lower edge of the 5-year bin each age falls into (e.g. 37 -> 35).
    age_bins = 5 * (ages // 5)
    bin_counts = age_bins.value_counts()

    fig, ax = plt.subplots(figsize=(15, 5))
    sns.barplot(x = bin_counts.index, y = bin_counts, ax=ax)
    ax.set_title("Distribution of Vict Age")
    ax.set_xlabel("Victim Age")
    ax.set_ylabel("Total Crimes Occurring")
    sns.despine()

### Records Tracking Functions

In [12]:
def upload_plot(purpose, category, year, month, plot):
    """Build a one-row record describing a plot request and pass it to ``upload_record``."""
    record = {
        'purpose': purpose,
        'category': category,
        'year': year,
        'month': month,
        'plot_by': plot,
    }

    # A single-row frame goes through the shared JSON helper, giving a
    # one-element records array.
    record_json = transform_to_json(pd.DataFrame.from_dict([record]))

    upload_record(record_json)

In [13]:
def upload_pred(purpose, year, month, date, algorithms):
    """Build a one-row record describing a prediction request and pass it to ``upload_record``."""
    record = {
        'purpose': purpose,
        'year': year,
        'month': month,
        'date': date,
        'algorithms': algorithms,
    }

    # A single-row frame goes through the shared JSON helper, giving a
    # one-element records array.
    record_json = transform_to_json(pd.DataFrame.from_dict([record]))

    upload_record(record_json)

In [14]:
def upload_record(data):
    """PUT ``data`` at the ``records`` node of the database.

    The upload is best-effort: any failure is reported on stdout and the
    function returns normally.

    Args:
        data: Request body passed to ``requests.put``.

    Returns:
        None.
    """
    try:
        database_URL = URL + 'records' + '/' + json_suffix
        # NOTE(review): every call PUTs to the same path, so each upload
        # presumably replaces the previous record rather than adding to a
        # history - confirm whether POST (append) was intended.
        response = requests.put(database_URL, data, timeout=30)
        # A non-2xx answer means the write was rejected; report it as a failure.
        response.raise_for_status()
    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts.
        print("Upload Failed: " + str(exc))

### Covid Plot Functions

In [15]:
def plot_by_test_pos(df, is_month):
    """Bar-plot ``tests_pos`` per day (one month) or summed per month (one year).

    Args:
        df: DataFrame with a ``tests_pos`` column, plus ``day`` when
            ``is_month`` is true or ``month`` otherwise.
        is_month: True to plot each row against its ``day``; False to plot
            the per-month sum of ``tests_pos``.
    """
    fig, ax = plt.subplots(figsize=(15, 5))

    if is_month:
        sns.barplot(x = df["day"], y = df["tests_pos"], ax=ax)
        ax.set_title("Tests of the Month")
        ax.set_xticklabels(ax.get_xticklabels(), rotation=-45)
        ax.set_xlabel("Date")
        ax.set_ylabel("Total Tests")
    else:
        # Selecting the column keeps a Series even when only one month is
        # present; .squeeze() on a 1x1 frame would collapse to a scalar.
        monthly_pos = df.groupby('month')['tests_pos'].sum()
        sns.barplot(x=monthly_pos.index, y=monthly_pos, ax=ax)
        ax.set_title("Tests of Month")
        ax.set_xlabel("Month of the Year")
        ax.set_ylabel("Total Tests Reported")

    # Write each bar's height just above its top edge, centred horizontally.
    for p in ax.patches:
        ax.text(p.get_x() + p.get_width()/2., p.get_height(), '%d' % round(int(p.get_height())),
                fontsize=9, color='black', ha='center', va='bottom')
    sns.despine()

In [16]:
def plot_by_cumu_test_pos(df):
    """Bar-plot the largest ``cumulative_tests_pos`` value of each month.

    Args:
        df: DataFrame with ``month`` and ``cumulative_tests_pos`` columns.
    """
    fig, ax = plt.subplots(figsize=(15, 5))
    # Per-month maximum of the cumulative column. Selecting the column
    # explicitly replaces .max('day'), where 'day' was silently taken as the
    # numeric_only argument rather than as a column name.
    monthly_max = df.groupby('month')['cumulative_tests_pos'].max()
    sns.barplot(x = monthly_max.index, y = monthly_max, ax=ax)
    # Axes
    ax.set_title("Tests of Month")
    ax.set_xlabel("Month of the Year")
    ax.set_ylabel("Total Tests Reported")
    # Write each bar's height just above its top edge, centred horizontally.
    for p in ax.patches:
        ax.text(p.get_x() + p.get_width()/2., p.get_height(), '%d' % round(int(p.get_height())),
                fontsize=9, color='black', ha='center', va='bottom')
    sns.despine()

In [17]:
def plot_by_pos_test_percent(df):
    """Bar-plot the per-month sum of ``percent_positive_tests_cum``.

    Args:
        df: DataFrame with ``month`` and ``percent_positive_tests_cum``
            columns.
    """
    fig, ax = plt.subplots(figsize=(15, 5))
    # Selecting the column keeps a Series even when only one month is
    # present; .squeeze() on a 1x1 frame would collapse to a scalar.
    # NOTE(review): this adds up the daily values of a column whose name
    # suggests a cumulative percentage - confirm that a sum (rather than a
    # mean or the last value of the month) is what should be shown.
    monthly_percent = df.groupby('month')['percent_positive_tests_cum'].sum()
    sns.barplot(x=monthly_percent.index, y=monthly_percent, ax=ax)
    # Axes
    ax.set_title("Tests of Month")
    ax.set_xlabel("Month of the Year")
    ax.set_ylabel("Total Tests Reported")
    # Write each bar's height just above its top edge, centred horizontally.
    for p in ax.patches:
        ax.text(p.get_x() + p.get_width()/2., p.get_height(), '%f' % p.get_height(),
                fontsize=9, color='black', ha='center', va='bottom')
    sns.despine()

## User Searching

### Plotting

## Data preprocessing for prediction

### Load data from firebase

In [18]:
# load all crime data from firebase
crime_data_2020 = get_year_json('crime', '2020')
crime_data_2021= get_year_json('crime', '2021')

# Build every monthly frame first and concatenate once: growing a frame with
# DataFrame.append inside a loop copies all rows on each pass, and the method
# no longer exists in pandas >= 2.0. Entries that are None are skipped.
crime_frames = [
    create_dataframe(month_data, False)
    for year_data in (crime_data_2020, crime_data_2021)
    for month_data in year_data
    if month_data is not None
]
crime_data = pd.concat(crime_frames, ignore_index=True)

# to set the datetime
crime_data['date'] = pd.to_datetime(crime_data[['month', 'day', 'year']])
crime_data['weekday'] = crime_data['date'].dt.dayofweek

# drop rows with age ==0
crime_data = crime_data.loc[crime_data["vict_age"] != 0].reset_index(drop=True)

# number of remaining crime rows per calendar date
crime_count = crime_data.groupby('date').size().reset_index(name='Crime_Count')

In [19]:
# load all covid data from firebase
covid_data_2020 = get_year_json('covid', '2020')
covid_data_2021= get_year_json('covid', '2021')

# Build every monthly frame first and concatenate once: growing a frame with
# DataFrame.append inside a loop copies all rows on each pass, and the method
# no longer exists in pandas >= 2.0. Entries that are None are skipped.
covid_frames = [
    create_dataframe(month_data, False)
    for year_data in (covid_data_2020, covid_data_2021)
    for month_data in year_data
    if month_data is not None
]

# Keep only rows without any missing value.
covid_data = pd.concat(covid_frames, ignore_index=True).dropna()
covid_data['date'] = pd.to_datetime(covid_data[['month', 'day', 'year']])
covid_data['weekday'] = covid_data['date'].dt.dayofweek

### combine dataset and split into training and testing

In [20]:
from sklearn.model_selection import train_test_split

# Keep only the dates present in both the covid table and the daily crime counts.
combined_data = covid_data.merge(crime_count, how='inner', on='date')

# prepare train, test data for modeling
# Features: every column except the target, the date fields and the percentage columns.
x_data = combined_data.drop(
    columns=['weekday', 'date', 'Crime_Count', 'percent_positive_tests_cum',
             'percent_positive_tests', 'percent_positive_avg_tests']
).values
# Target: number of crimes on that date.
y_data = combined_data['Crime_Count'].values

# 80/20 split with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, random_state=0
)

In [21]:
def LR(x_train, data, y_train, accurate_crime_count):
    """Fit a linear regression and print its prediction next to the recorded count.

    Args:
        x_train: Training feature matrix.
        data: Feature rows to predict for; only the first prediction is printed.
        y_train: Training targets.
        accurate_crime_count: Recorded counts; only the first entry is printed.

    Returns:
        None. Both values are printed, truncated to int.
    """
    from sklearn.linear_model import LinearRegression
    reg = LinearRegression()
    reg.fit(x_train, y_train)
    y_linear_pred = reg.predict(data)
    # The unused test-set score was removed: it read the notebook globals
    # x_test / y_test instead of the arguments, and its value was discarded.
    print("Predict Count: "+ str(int(y_linear_pred[0])))
    print("Accurate Count: "+ str(int(accurate_crime_count[0])))

In [22]:
def SVM(x_train, data, y_train, accurate_crime_count):
    """Fit a support-vector classifier and print its prediction next to the recorded count.

    Args:
        x_train: Training feature matrix.
        data: Feature rows to predict for; only the first prediction is printed.
        y_train: Training targets.
        accurate_crime_count: Recorded counts; only the first entry is printed.

    Returns:
        None. Both values are printed, truncated to int.
    """
    from sklearn.svm import SVC
    # NOTE(review): SVC is a classifier, so every distinct count in y_train is
    # treated as its own class label; a regressor (e.g. SVR) may have been
    # intended - confirm before relying on these predictions.
    svm = SVC()
    svm.fit(x_train, y_train)
    y_svm_pred = svm.predict(data)
    # The unused test-set score was removed: it read the notebook globals
    # x_test / y_test instead of the arguments, and its value was discarded.
    print("Predict Count: "+ str(int(y_svm_pred[0])))
    print("Accurate Count: "+ str(int(accurate_crime_count[0])))

In [23]:
def RDF(x_train, data, y_train, accurate_crime_count):
    """Fit a random-forest regressor and print its prediction next to the recorded count.

    Args:
        x_train: Training feature matrix.
        data: Feature rows to predict for; only the first prediction is printed.
        y_train: Training targets.
        accurate_crime_count: Recorded counts; only the first entry is printed.

    Returns:
        None. Both values are printed, truncated to int.
    """
    from sklearn.ensemble import RandomForestRegressor
    # Fixed random_state so repeated runs give the same forest.
    regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
    regressor.fit(x_train, y_train)
    y_rdm_pred = regressor.predict(data)
    # The unused test-set score was removed: it read the notebook globals
    # x_test / y_test instead of the arguments, and its value was discarded.
    print("Predict Count: "+ str(int(y_rdm_pred[0])))
    print("Accurate Count: "+ str(int(accurate_crime_count[0])))

In [24]:
@interact
def user_searching_select(purpose = ['plot','predict crime amount','correlation']):
    """Interactive entry point: plot a dataset, predict a day's crime count, or print correlations.

    ``purpose`` selects one of three modes:

    * 'plot' - pick a category / year / month, load the matching records
      into the globals ``query_data`` and ``select_df``, then pick a chart.
      Each chart selection is logged through ``upload_plot``.
    * 'predict crime amount' - pick a date and an algorithm; the model is
      trained on ``x_train`` / ``y_train`` and its prediction is printed
      next to the recorded count. Each run is logged through ``upload_pred``.
    * 'correlation' - print the correlation of every column of
      ``combined_data`` with ``Crime_Count``.
    """
    if purpose == 'plot':
        # Plotting
        @interact
        def user_searching_plotting(category = ['crime', 'covid'], year = ['2020', '2021'], 
                                 month = ['01','02','03','04','05','06','07','08','09','10','11','12','whole year']):
            global query_data
            global select_df

            # loading data
            try:
                # whole year data
                if month == 'whole year':
                    query_data = get_year_json(category, year)
                    # One frame per month (None entries skipped), concatenated once.
                    monthly_frames = [create_dataframe(month_data, False)
                                      for month_data in query_data
                                      if month_data is not None]
                    select_df = pd.concat(monthly_frames, ignore_index=True)
                    # Crime frames are not expected to carry a 'tests' column
                    # (presumably covid-only - confirm). Filtering on it
                    # unconditionally raised KeyError for crime data, which
                    # was then reported as "no data".
                    if 'tests' in select_df.columns:
                        select_df = select_df.dropna(subset=['tests'], how='all')
                    print("There are " + str(len(select_df)) + " data this year.")
                # specific month data
                else:
                    query_data = get_json(category, year, month)
                    print('In this month, we found relevant information for ' + str(len(query_data)) + " days")
                    select_df = create_dataframe(query_data, True)

            except Exception:
                print('There are no data matches in the dataset')
                # Stop here: continuing would plot whatever an earlier query
                # left in select_df.
                return

            if category == 'crime':
                select_df['crime_time'] = pd.to_numeric(select_df['crime_time'])
                @interact
                def plot_crime(plot = ['Time', 'Area Name', 'Vict Age']):
                    if plot == 'Time':
                        plot_by_time(select_df)
                    elif plot == 'Area Name':
                        plot_by_area(select_df)
                    elif plot == 'Vict Age':
                        plot_by_age(select_df)

                    upload_plot(purpose, category, year, month, plot)

            elif (category == 'covid') and (month == 'whole year'):
                @interact
                def plot_covid_year(plot = ['Positive Test', 'Positive Cumulative Test', 'Positive Tests(%)']):
                    if plot == 'Positive Test':
                        plot_by_test_pos(select_df, False)
                    elif plot == 'Positive Cumulative Test':
                        plot_by_cumu_test_pos(select_df)
                    elif plot == 'Positive Tests(%)':
                        plot_by_pos_test_percent(select_df)

                    upload_plot(purpose, category, year, month, plot)

            elif (category == 'covid') and (month != 'whole year'):
                @interact
                def plot_covid_month(plot = ['Positive Test']):
                    try:
                        if plot == 'Positive Test':
                            plot_by_test_pos(select_df, True)
                    except Exception:
                        print('There are no data matches in the dataset')

                    upload_plot(purpose, category, year, month, plot)

    if purpose == 'predict crime amount':
        @interact
        def choose_algo(year = ['2020', '2021'], month = ['01','02','03','04','05','06','07','08','09','10','11','12'],
                   date = ['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16',
                          '17','18','19','20','21','22','23','24','25','26','27','28','29','30','31'],
                        algorithms = ['Random Forest', 'Linear Regression', 'SVM']):

            # Widget label -> training/prediction helper.
            predictors = {
                'Linear Regression': LR,
                'SVM': SVM,
                'Random Forest': RDF,
            }

            try:
                # Rows of the merged table for the requested day (YYYYMMDD).
                day_rows = combined_data[combined_data['date'] == year+month+date]
                # Check before training so a missing date costs no model fit.
                if day_rows.empty:
                    print("no data on that date!")
                    return

                data = day_rows.drop(['weekday', 'date', 'Crime_Count', 'percent_positive_tests_cum', 'percent_positive_tests', 'percent_positive_avg_tests'], axis=1).values
                accurate_crime_count = day_rows['Crime_Count'].values

                predictors[algorithms](x_train, data, y_train, accurate_crime_count)
                upload_pred(purpose, year, month, date, algorithms)

            except Exception:
                print("no data on that date!")

    if purpose == 'correlation':
        print("Correlation between crime amounts and other factors: \n")
        print(combined_data.corr()['Crime_Count'])

interactive(children=(Dropdown(description='purpose', options=('plot', 'predict crime amount', 'correlation'),…