# EDA on DACA Data

This is an exploratory data analysis of the DACA March 2017 data that was given to us in our Data Mining class at George Washington University. Below is a high-level overview of the dataset:

In [1]:
import plotly as py
import plotly.figure_factory as ff
import pandas as pd
# NOTE(review): the Plotly API key below is committed in plain text. Only the
# offline renderer is used in this cell, so the credentials may be unnecessary
# -- confirm, then rotate the key and load it from outside the notebook.
py.tools.set_credentials_file(username='mwilchek', api_key='ZoEYFqeYICM56uCTBJVh')
py.offline.init_notebook_mode(connected=True)

# Load the DACA March 2017 extract directly from the course repository.
DACA_CSV_URL = "https://raw.githubusercontent.com/mwilchek/Data-Mining/master/DACA%20March%202017%20V2.csv"
daca_v2 = pd.read_csv(DACA_CSV_URL)

# Show the leading rows as a Plotly table rendered inline in the notebook.
preview_rows = daca_v2.head()
table = ff.create_table(preview_rows)
py.offline.iplot(table, filename='jupyter/summary_table')

# Must stay the final expression so the notebook displays the summary stats.
daca_v2.describe()

Unnamed: 0,Accepted Initial Applications,Accepted Renewals,Accepted Total,Approved Initial Applications,Approved Renewals,Approved Total,Population (2016),Population Percentage,Rejected Initial Applications Percentage
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,16163.666667,15046.740741,31213.740741,14449.62963,13789.740741,28239.37037,6050828.0,0.309236,14.786685
std,37941.805411,33930.883164,71780.887243,34484.021618,31511.64558,65925.234507,7144772.0,0.233924,8.270255
min,56.0,186.0,248.0,42.0,162.0,204.0,102951.0,0.017312,6.373973
25%,1072.25,1456.0,2931.0,880.25,1348.75,2420.0,1492203.0,0.118266,9.056009
50%,6303.0,5798.5,12691.5,5494.0,5309.0,11299.5,4008513.0,0.278098,12.017177
75%,11915.0,12426.0,23461.0,10920.75,10901.25,20843.75,6901248.0,0.441229,17.519654
max,242339.0,217023.0,459362.0,222795.0,202200.0,424995.0,39250020.0,1.082789,40.880503


Based on the summary above, it is interesting to note that of the roughly 16k initial applications accepted on average per state, only about 14.5k on average were actually approved. Perhaps by mapping the data by state we will see which states are responsible for the most approved applications.

In [1]:
import plotly as py
import pandas as pd

# NOTE(review): the Plotly API key below is committed in plain text. Only the
# offline renderer is used in this cell, so the credentials may be unnecessary
# -- confirm, then rotate the key and load it from outside the notebook.
py.tools.set_credentials_file(username='mwilchek', api_key='ZoEYFqeYICM56uCTBJVh')
py.offline.init_notebook_mode(connected=True)

daca_v2 = pd.read_csv('https://raw.githubusercontent.com/mwilchek/Data-Mining/master/DACA%20March%202017%20V2.csv')

# String view used only to assemble hover labels. daca_v2 itself keeps the
# dtypes pandas parsed, so the numeric columns never have to be re-parsed
# from text (which fails on values such as '424995.0').
daca_text = daca_v2.astype(str)

# Colour scale as [position, colour] stops, light yellow (low) to dark blue (high).
scl = [
    [0.0, 'rgb(255,255,204)'],
    [0.2, 'rgb(161,218,180)'],
    [0.4, 'rgb(127,205,187)'],
    [0.6, 'rgb(65,182,196)'],
    [0.8, 'rgb(44,127,184)'],
    [1.0, 'rgb(37,52,148)'],
]

# Columns listed in the hover label, in display order.
hover_fields = [
    'Accepted Initial Applications',
    'Accepted Renewals',
    'Accepted Total',
    'Approved Initial Applications',
    'Approved Renewals',
    'Approved Total',
]

# Hover label per row: state name, then one "<br> <field> <value>" segment
# for each field above.
hover_text = daca_text['State']
for field in hover_fields:
    hover_text = hover_text + '<br>' + ' ' + field + ' ' + daca_text[field]
daca_v2['text'] = hover_text

# Single choropleth trace: states are located by the 'code' column and shaded
# by their total number of approved applications.
data = [dict(
    type='choropleth',
    colorscale=scl,
    autocolorscale=False,
    locations=daca_v2['code'],
    z=daca_v2['Approved Total'].astype(int),
    locationmode='USA-states',
    text=daca_v2['text'],
    marker=dict(
        line=dict(
            color='rgb(255,255,255)',  # white state borders
            width=3
        )),
    colorbar=dict(
        title="Total Approved Applications")
)]

layout = dict(
    title='2017 DACA Application Data<br>(Hover for breakdown)',
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showlakes=True,
        lakecolor='rgb(255, 255, 255)'),
)

fig = dict(data=data, layout=layout)
py.offline.iplot(fig, filename='d3-cloropleth-map')

In [None]:
import plotly as py
import plotly.graph_objs as go
import numpy as np
import pandas as pd

# NOTE(review): the Plotly API key below is committed in plain text. Only the
# offline renderer is used in this cell, so the credentials may be unnecessary
# -- confirm, then rotate the key and load it from outside the notebook.
py.tools.set_credentials_file(username='mwilchek', api_key='ZoEYFqeYICM56uCTBJVh')
py.offline.init_notebook_mode(connected=True)

daca_v2 = pd.read_csv('https://raw.githubusercontent.com/mwilchek/Data-Mining/master/DACA%20March%202017%20V2.csv')

# One hue per row of the data, sized from the frame rather than a hard-coded
# count. endpoint=False because hue 360 is the same colour as hue 0.
point_colors = [
    'hsl(' + str(hue) + ',50%' + ',50%)'
    for hue in np.linspace(0, 360, len(daca_v2), endpoint=False)
]

# A single trace holding every state. The previous version appended dozens of
# identical traces (each containing all states), which stacked the same
# markers on top of each other and defeated the per-point colour and opacity.
states_trace = go.Scatter(
    x=daca_v2['Approved Total'],
    y=daca_v2['Population (2016)'],
    mode='markers',
    marker=dict(
        size=14,
        line=dict(width=1),
        color=point_colors,  # per-point colours, aligned with the rows
        opacity=0.3,
    ),
    name='States',
    text=daca_v2['State'],  # hover text: the state name
)

layout = go.Layout(
    title='Approved DACA Applications vs. State Population',
    hovermode='closest',
    xaxis=dict(
        title='Approved Total Applications',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
    ),
    yaxis=dict(
        title='State Population',
        ticklen=5,
        gridwidth=2,
    ),
    showlegend=False
)
fig = go.Figure(data=[states_trace], layout=layout)
py.offline.iplot(fig)

Based on the scatterplot above, we can see that California, which has the largest population, also approves more applications than any other state. Texas, New York, Florida and Illinois appear to show their support for DACA as well, while the other states do not seem to care as much.