## Problem Statement
# Investigate the data however you like and discuss any interesting insights you can find in the data

## Approach
<p style="font-size: 16px"> Attempt to look at the dataset from different angles. Four experiments with data are listed below. </p>

In [1]:
# Executes load_files.ipynb inside this kernel. The cells below use `pd`,
# `main_df`, `load_disposition` and `load_diagnosis` without defining them, so
# they are presumably provided by that notebook -- TODO confirm.
%run 'load_files.ipynb'

In [2]:
# Load packages and configuration
# NOTE: `py` is an alias for the top-level plotly package in this notebook.
import plotly as py
import plotly.graph_objs as go
# NOTE(review): `np` and `chi2_contingency` are not referenced by any cell
# visible in this section -- confirm whether a later cell still needs them.
import numpy as np
from scipy.stats import chi2_contingency
# Initialise plotly's offline notebook mode; the plotting cells below render
# their figures through py.offline.iplot().
py.offline.init_notebook_mode(connected=True)

<b style="font-size: 16px">1. It seems that incidents increase during the first half of the year and gradually decrease in the second half. Since the data covers only one year, trend and seasonality cannot be evaluated.</b>

In [3]:
# Count how many records fall on each `trmt_date` value.
# Result columns: 'trmt_date' (parsed datetime) and 'Frequency' (row count).
#
# rename_axis(...).reset_index(name=...) names both columns explicitly rather
# than relying on the default labels produced by value_counts()/reset_index(),
# which changed in pandas 2.0 ('index'/<series name> became <series name>/'count').
# The deprecated `infer_datetime_format` flag is dropped; format inference is
# the default behaviour in current pandas.
time_freq = (
    pd.to_datetime(main_df["trmt_date"])
    .value_counts()
    .rename_axis("trmt_date")
    .reset_index(name="Frequency")
)

In [4]:
# Order the per-date counts from the earliest to the latest date so the line
# chart is drawn left to right in time order.
time_sorted = time_freq.sort_values("trmt_date")

In [5]:
# Line chart of the number of records per date.
data = [go.Scatter(x=time_sorted["trmt_date"], y=time_sorted["Frequency"])]
# Give the figure a title and axis labels so it can be read on its own, like
# the other charts in this notebook.
layout = go.Layout(
    title="Incidents count over time",
    xaxis=dict(title="Date (trmt_date)"),
    yaxis=dict(title="Count"),
)
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig, filename="time_analysis")

<b style="font-size: 16px">2. Fire involvement for infants under one year old is quite high. Males are generally involved in more fire incidents than females.</b>

In [6]:
# Load the extra frames needed for this section.
disposition_df = load_disposition()
diagnosis_df = load_diagnosis()

# Attach both frames to the main records with left joins (on 'disposition'
# and on 'diag' respectively), so every row of main_df is retained.
merged_df = (
    main_df
    .merge(disposition_df, how='left', on='disposition')
    .merge(diagnosis_df, how='left', on='diag')
)

DataFrame disposition_df loaded
DataFrame diagnosis_df loaded


In [7]:
# Keep only the rows with fmv > 0 (treated as fire involvement in this section).
# The mask is built from merged_df itself: a mask taken from main_df is only
# valid if both frames share exactly the same index, which a merge does not
# guarantee (row counts and index labels can differ).
fire_df = merged_df[merged_df["fmv"] > 0]

In [8]:
# Number of fire_df rows per `age_man` value, ordered by age_man.
# Result columns: 'age_man' and 'Frequency'.
# Columns are named explicitly via rename_axis/reset_index(name=...) because
# the default labels produced by value_counts().reset_index() changed in
# pandas 2.0, which broke the previous rename({'index': ...}) approach.
fire_all = (
    fire_df['age_man']
    .value_counts()
    .rename_axis('age_man')
    .reset_index(name='Frequency')
    .sort_values(by=['age_man'])
)

In [9]:
# Number of fire_df rows with sex == 'Male' per `age_man` value, ordered by
# age_man. Result columns: 'age_man' and 'Frequency'.
# Columns are named explicitly via rename_axis/reset_index(name=...) because
# the default labels produced by value_counts().reset_index() changed in
# pandas 2.0, which broke the previous rename({'index': ...}) approach.
fire_male = (
    fire_df.loc[fire_df["sex"] == 'Male', 'age_man']
    .value_counts()
    .rename_axis('age_man')
    .reset_index(name='Frequency')
    .sort_values(by=['age_man'])
)

In [10]:
# Number of fire_df rows with sex == 'Female' per `age_man` value, ordered by
# age_man. Result columns: 'age_man' and 'Frequency'.
# Columns are named explicitly via rename_axis/reset_index(name=...) because
# the default labels produced by value_counts().reset_index() changed in
# pandas 2.0, which broke the previous rename({'index': ...}) approach.
fire_female = (
    fire_df.loc[fire_df["sex"] == 'Female', 'age_man']
    .value_counts()
    .rename_axis('age_man')
    .reset_index(name='Frequency')
    .sort_values(by=['age_man'])
)

In [11]:
# Create traces
# NOTE: trace0 (all records) is defined but not added to the figure below;
# only the Male and Female traces are stacked.
trace0 = go.Bar(
    x = fire_all['age_man'],
    y = fire_all['Frequency'],
    name = 'All'
)
trace1 = go.Bar(
    x = fire_male['age_man'],
    y = fire_male['Frequency'],
    name = 'Male'
)
trace2 = go.Bar(
    x = fire_female['age_man'],
    y = fire_female['Frequency'],
    name = 'Female'
)

# Shared font for both axis titles.
axis_title_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)

data = [trace1, trace2]
# Axis titles use the nested title=dict(text=..., font=...) form; the flat
# `titlefont` axis property is deprecated and rejected by newer plotly releases.
layout = go.Layout(
    barmode='stack',
    title='Fire involvements count vs Age',
    xaxis=dict(
        title=dict(text='Age', font=axis_title_font)
    ),
    yaxis=dict(
        title=dict(text='Count', font=axis_title_font)
    )
)

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig, filename='grouped-bar')

<b style="font-size: 16px">3. Females undergo more incidents at home (code 1) and at other public properties (code 5).</b>

In [12]:
# Number of merged_df rows per `location` value, separately for each sex.
# Result columns: 'location' and 'Frequency' (most frequent location first).
# Columns are named explicitly via rename_axis/reset_index(name=...) because
# the default labels produced by value_counts().reset_index() changed in
# pandas 2.0, which broke the previous rename({'index': ...}) approach.
loc_male = (
    merged_df.loc[merged_df["sex"] == 'Male', 'location']
    .value_counts()
    .rename_axis('location')
    .reset_index(name='Frequency')
)
loc_female = (
    merged_df.loc[merged_df["sex"] == 'Female', 'location']
    .value_counts()
    .rename_axis('location')
    .reset_index(name='Frequency')
)

In [13]:
# Stacked bar chart: record count per location, split into Male and Female.
trace1 = go.Bar(
    x = loc_male['location'],
    y = loc_male['Frequency'],
    name = 'Male'
)
trace2 = go.Bar(
    x = loc_female['location'],
    y = loc_female['Frequency'],
    name = 'Female'
)

# Shared font for both axis titles.
axis_title_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)

data = [trace1, trace2]
# Axis titles use the nested title=dict(text=..., font=...) form; the flat
# `titlefont` axis property is deprecated and rejected by newer plotly releases.
layout = go.Layout(
    barmode='stack',
    title='Incidents count at different locations.',
    xaxis=dict(
        title=dict(text='Location', font=axis_title_font)
    ),
    yaxis=dict(
        title=dict(text='Count', font=axis_title_font)
    )
)

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig, filename='grouped-bar-loc')

<b style="font-size: 16px">4. A roughly linear decrease in count with stratum can be observed.
The difference between male and female incidents is 5507; therefore, the probability with which they visit a particular stratum is almost equal.</b>

In [14]:
# Number of merged_df rows per `stratum` value, separately for each sex.
# Result columns: 'stratum' and 'Frequency' (most frequent stratum first).
# Columns are named explicitly via rename_axis/reset_index(name=...) because
# the default labels produced by value_counts().reset_index() changed in
# pandas 2.0, which broke the previous rename({'index': ...}) approach.
stratum_male = (
    merged_df.loc[merged_df["sex"] == 'Male', 'stratum']
    .value_counts()
    .rename_axis('stratum')
    .reset_index(name='Frequency')
)
stratum_female = (
    merged_df.loc[merged_df["sex"] == 'Female', 'stratum']
    .value_counts()
    .rename_axis('stratum')
    .reset_index(name='Frequency')
)

In [15]:
# Stacked bar chart: record count per stratum, split into Male and Female.
trace1 = go.Bar(
    x = stratum_male['stratum'],
    y = stratum_male['Frequency'],
    name = 'Male'
)
trace2 = go.Bar(
    x = stratum_female['stratum'],
    y = stratum_female['Frequency'],
    name = 'Female'
)

# Shared font for both axis titles.
axis_title_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)

data = [trace1, trace2]
# Axis titles use the nested title=dict(text=..., font=...) form; the flat
# `titlefont` axis property is deprecated and rejected by newer plotly releases.
layout = go.Layout(
    barmode='stack',
    title='Incidents count reported at different stratums.',
    xaxis=dict(
        title=dict(text='Stratum', font=axis_title_font)
    ),
    yaxis=dict(
        title=dict(text='Count', font=axis_title_font)
    )
)

fig = go.Figure(data=data, layout=layout)
# Use a filename specific to this figure; it previously reused the location
# chart's 'grouped-bar-loc', so exported images of the two charts would collide.
py.offline.iplot(fig, filename='grouped-bar-stratum')

In [16]:
# Row count per sex in the merged frame; backs the male/female difference
# quoted in finding 4.
merged_df.loc[:, "sex"].value_counts()

Male      35503
Female    29996
Name: sex, dtype: int64