In [1]:
# Dependencies and Setup
import pandas as pd
import seaborn as sns
import numpy as np
from datetime import datetime,date

# Input: raw state-level unemployment CSV
file_to_load = "Resources/US_Unemployment-state.csv"

# Output: destination path for the cleaned CSV
output_data_file = "Resources/cleaned_data.csv"

# Column labels applied to the file; the file's own first row is skipped on read (skiprows=1)
headers = [
    'state',
    'filed_week_ended',
    'initial_claim',
    'reflecting_week_ended',
    'continued_claims',
    'covered_employment',
    'insured_unemployment_rate',
]

# The two week-ending columns are parsed as datetimes
parse_dates = ['filed_week_ended', 'reflecting_week_ended']

# `state` is read as a string; every remaining non-date column is read as float32.
# NOTE(review): float32 represents integers exactly only up to 2**24 (16,777,216);
# confirm no count column exceeds that, or widen to float64.
dtypes = {'state': np.str_}
dtypes.update(
    {column: np.float32 for column in headers if column != 'state' and column not in parse_dates}
)

# Read the file into a pandas DataFrame with explicit names, dtypes and date parsing
df = pd.read_csv(
    file_to_load,
    skiprows=1,
    names=headers,
    dtype=dtypes,
    parse_dates=parse_dates,
)

# Preview of the DataFrame
df.head()

Unnamed: 0,state,filed_week_ended,initial_claim,reflecting_week_ended,continued_claims,covered_employment,insured_unemployment_rate
0,Alabama,2015-01-03,7785.0,2014-12-27,30194.0,1794067.0,1.68
1,Alabama,2015-01-10,9139.0,2015-01-03,35246.0,1798083.0,1.96
2,Alabama,2015-01-17,4654.0,2015-01-10,28200.0,1798083.0,1.57
3,Alabama,2015-01-24,3284.0,2015-01-17,26222.0,1798083.0,1.46
4,Alabama,2015-01-31,3203.0,2015-01-24,25391.0,1798083.0,1.41


In [2]:
# Check the column data types. The `dtype=` and `parse_dates=` arguments given to
# read_csv in the load cell already produce float32 and datetime64 columns, so no
# further conversion is needed.
df.dtypes

state                                object
filed_week_ended             datetime64[ns]
initial_claim                       float32
reflecting_week_ended        datetime64[ns]
continued_claims                    float32
covered_employment                  float32
insured_unemployment_rate           float32
dtype: object

In [3]:
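# # Remove commas from columns to be able to cast as a number
# NOTE: disabled. The load cell already reads these columns as float32 via `dtype=`, and `df`
# uses snake_case names (e.g. `initial_claim`), so the labels below do not exist in `df`.
# df["Initial Claims"] = df["Initial Claims"].str.replace(',', '')
# df["Continued Claims"] = df["Continued Claims"].str.replace(',', '')
# df["Covered Employment"] = df["Covered Employment"].str.replace(',', '')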

In [4]:
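# # Convert data types
# NOTE: disabled. The numeric columns are already float32 from the `dtype=` mapping passed to
# read_csv, and the column labels below do not match the snake_case names used in `df`.
# df['Initial Claims'] = pd.to_numeric(df['Initial Claims'])
# df['Continued Claims'] = pd.to_numeric(df['Continued Claims'])
# df['Covered Employment'] = pd.to_numeric(df['Covered Employment'])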

In [5]:
# Re-check the data types. The conversion cells above are commented out, so this
# is expected to match the dtypes reported right after loading.
df.dtypes

state                                object
filed_week_ended             datetime64[ns]
initial_claim                       float32
reflecting_week_ended        datetime64[ns]
continued_claims                    float32
covered_employment                  float32
insured_unemployment_rate           float32
dtype: object

In [6]:
# Check column names for stray spaces - looks good (the names come from the
# `headers` list passed to read_csv)
df.columns

Index(['state', 'filed_week_ended', 'initial_claim', 'reflecting_week_ended',
       'continued_claims', 'covered_employment', 'insured_unemployment_rate'],
      dtype='object')

In [7]:
# Identify incomplete rows: count() reports the number of non-null values per
# column, so unequal counts would indicate rows with missing data
df.count()

state                        14893
filed_week_ended             14893
initial_claim                14893
reflecting_week_ended        14893
continued_claims             14893
covered_employment           14893
insured_unemployment_rate    14893
dtype: int64

In [8]:
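# Drop all rows with missing information
# NOTE(review): disabled. `unemployment_df` is not defined in the cells shown here, so this
# line would need to use `df` if re-enabled; the counts above are equal for every column.
# df = unemployment_df.dropna(how='any')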

In [9]:
# Re-check non-null counts per column. The dropna step above is commented out,
# so the counts are expected to be unchanged.
df.count()

state                        14893
filed_week_ended             14893
initial_claim                14893
reflecting_week_ended        14893
continued_claims             14893
covered_employment           14893
insured_unemployment_rate    14893
dtype: int64

In [11]:
# Check the distinct values in the `state` column and how many rows each one has
df['state'].value_counts()

Illinois                281
Hawaii                  281
New Jersey              281
South Dakota            281
Arizona                 281
North Dakota            281
North Carolina          281
Louisiana               281
Maine                   281
Virgin Islands          281
Washington              281
Colorado                281
Virginia                281
New Mexico              281
Wyoming                 281
Arkansas                281
Ohio                    281
Michigan                281
California              281
Delaware                281
Indiana                 281
Puerto Rico             281
Idaho                   281
Massachusetts           281
Pennsylvania            281
Wisconsin               281
New Hampshire           281
Tennessee               281
West Virginia           281
Nebraska                281
Maryland                281
South Carolina          281
Texas                   281
Mississippi             281
Nevada                  281
Vermont             

In [None]:
# Display summary statistics for a high-level overview of the data
df.describe()