In [1]:
# Dependencies and Setup
import pandas as pd
import seaborn as sns
import numpy as np
from datetime import datetime,date

# File to Load
file_to_load = "Resources/US_Unemployment-state.csv"

headers =['state','filed_week_ended','initial_claim','reflecting_week_ended','continued_claims','covered_employment','insured_unemployment_rate']
dtypes = { 'state':np.str_, 'initial_claim':np.float32,'continued_claims':np.float32,'covered_employment':np.float32,'insured_unemployment_rate':np.float32}
parse_dates = ['filed_week_ended','reflecting_week_ended']

# Read file and store into Pandas data frame
df = pd.read_csv(file_to_load,names=headers, dtype=dtypes,parse_dates=parse_dates,skiprows=1)

# Output File (CSV)
output_data_file = "Resources/cleaned_data.csv"

# Preview of the DataFrame
df.head()

Unnamed: 0,state,filed_week_ended,initial_claim,reflecting_week_ended,continued_claims,covered_employment,insured_unemployment_rate
0,Alabama,2015-01-03,7785.0,2014-12-27,30194.0,1794067.0,1.68
1,Alabama,2015-01-10,9139.0,2015-01-03,35246.0,1798083.0,1.96
2,Alabama,2015-01-17,4654.0,2015-01-10,28200.0,1798083.0,1.57
3,Alabama,2015-01-24,3284.0,2015-01-17,26222.0,1798083.0,1.46
4,Alabama,2015-01-31,3203.0,2015-01-24,25391.0,1798083.0,1.41


In [2]:
# Check for correct data types - need to convert
df.dtypes

state                                object
filed_week_ended             datetime64[ns]
initial_claim                       float32
reflecting_week_ended        datetime64[ns]
continued_claims                    float32
covered_employment                  float32
insured_unemployment_rate           float32
dtype: object

In [3]:
# # Remove commas from columns to be able to cast as a number
# df["Initial Claims"] = df["Initial Claims"].str.replace(',', '')
# df["Continued Claims"] = df["Continued Claims"].str.replace(',', '')
# df["Covered Employment"] = df["Covered Employment"].str.replace(',', '')

In [4]:
# # Convert data types
# df['Initial Claims'] = pd.to_numeric(df['Initial Claims'])
# df['Continued Claims'] = pd.to_numeric(df['Continued Claims'])
# df['Covered Employment'] = pd.to_numeric(df['Covered Employment'])

In [5]:
# Verify adjusted data types
df.dtypes

state                                object
filed_week_ended             datetime64[ns]
initial_claim                       float32
reflecting_week_ended        datetime64[ns]
continued_claims                    float32
covered_employment                  float32
insured_unemployment_rate           float32
dtype: object

In [6]:
# Check column names for spaces - looks good
df.columns

Index(['state', 'filed_week_ended', 'initial_claim', 'reflecting_week_ended',
       'continued_claims', 'covered_employment', 'insured_unemployment_rate'],
      dtype='object')

In [7]:
# Identify incomplete rows
df.count()

state                        14946
filed_week_ended             14946
initial_claim                14946
reflecting_week_ended        14946
continued_claims             14946
covered_employment           14946
insured_unemployment_rate    14946
dtype: int64

In [8]:
# Drop all rows with missing information
# df = unemployment_df.dropna(how='any')

In [9]:
# Verify dropped rows
df.count()

state                        14946
filed_week_ended             14946
initial_claim                14946
reflecting_week_ended        14946
continued_claims             14946
covered_employment           14946
insured_unemployment_rate    14946
dtype: int64

In [10]:
# Check states values
df['state'].value_counts()

Illinois                282
Colorado                282
Florida                 282
Massachusetts           282
Mississippi             282
Hawaii                  282
Georgia                 282
South Carolina          282
Oklahoma                282
New Jersey              282
Missouri                282
North Carolina          282
Maryland                282
Texas                   282
Utah                    282
Michigan                282
Ohio                    282
Indiana                 282
Idaho                   282
District of Columbia    282
Arkansas                282
Virgin Islands          282
South Dakota            282
Arizona                 282
Rhode Island            282
Minnesota               282
Nevada                  282
Pennsylvania            282
Washington              282
Montana                 282
Alabama                 282
Connecticut             282
Virginia                282
Delaware                282
Puerto Rico             282
Kansas              

In [11]:
# Display a statistical overview for a high level insight into the data
df.describe()

Unnamed: 0,initial_claim,continued_claims,covered_employment,insured_unemployment_rate
count,14946.0,14946.0,14946.0,14946.0
mean,6943.299,45642.89,2642790.0,1.667128
std,23802.65,108843.6,2999577.0,2.032472
min,0.0,0.0,32446.0,0.0
25%,1094.0,8666.0,652934.0,0.85
50%,2550.0,19919.5,1797624.0,1.24
75%,5608.75,40080.0,3266242.0,1.83
max,1058325.0,4808361.0,17330010.0,31.200001
