In [1]:
# Dependencies and Setup
import pandas as pd
import seaborn as sns
import numpy as np
import datetime as dt

# Input CSV location, plus the path reserved for the cleaned output CSV
# (nothing is written to it in the cells shown here).
file_to_load = "../Resources/US_Unemployment-state.csv"
# NOTE(review): unlike the input path, this one has no "../" prefix - confirm it is intended.
output_data_file = "Resources/cleaned_data.csv"

# Column names applied to the file, in file order.
headers = [
    'state',
    'filed_week_ended',
    'initial_claim',
    'reflecting_week_ended',
    'continued_claims',
    'covered_employment',
    'insured_unemployment_rate',
]

# The state column is read as text; every measurement column as float32.
dtypes = {'state': np.str_}
dtypes.update(dict.fromkeys(
    ['initial_claim', 'continued_claims', 'covered_employment', 'insured_unemployment_rate'],
    np.float32,
))

# The two "*_week_ended" columns are parsed as dates.
parse_dates = [name for name in headers if name.endswith('_week_ended')]

# Load the CSV. skiprows=1 discards the file's first line (presumably its own
# header row) so that the names listed above are used instead.
df = pd.read_csv(
    file_to_load,
    skiprows=1,
    names=headers,
    dtype=dtypes,
    parse_dates=parse_dates,
)

# Show the first five rows.
df.head()

  import pandas.util.testing as tm


Unnamed: 0,state,filed_week_ended,initial_claim,reflecting_week_ended,continued_claims,covered_employment,insured_unemployment_rate
0,Alabama,2015-01-03,7785.0,2014-12-27,30194.0,1794067.0,1.68
1,Alabama,2015-01-10,9139.0,2015-01-03,35246.0,1798083.0,1.96
2,Alabama,2015-01-17,4654.0,2015-01-10,28200.0,1798083.0,1.57
3,Alabama,2015-01-24,3284.0,2015-01-17,26222.0,1798083.0,1.46
4,Alabama,2015-01-31,3203.0,2015-01-24,25391.0,1798083.0,1.41


In [2]:
# Inspect the column dtypes produced by read_csv: the two week-ended columns
# should be datetime64 and the measurement columns float32, in which case no
# further conversion is needed.
df.dtypes

state                                object
filed_week_ended             datetime64[ns]
initial_claim                       float32
reflecting_week_ended        datetime64[ns]
continued_claims                    float32
covered_employment                  float32
insured_unemployment_rate           float32
dtype: object

In [3]:
# # (Disabled: read_csv already loads these columns as float32 under snake_case names.)
# # Remove commas from columns to be able to cast as a number
# df["Initial Claims"] = df["Initial Claims"].str.replace(',', '')
# df["Continued Claims"] = df["Continued Claims"].str.replace(',', '')
# df["Covered Employment"] = df["Covered Employment"].str.replace(',', '')

In [4]:
# # (Disabled: the numeric dtypes are already set when the CSV is read.)
# # Convert data types
# df['Initial Claims'] = pd.to_numeric(df['Initial Claims'])
# df['Continued Claims'] = pd.to_numeric(df['Continued Claims'])
# df['Covered Employment'] = pd.to_numeric(df['Covered Employment'])

In [5]:
# Re-check the dtypes. The conversion cells above are commented out, so this
# is expected to match the earlier dtype listing.
df.dtypes

state                                object
filed_week_ended             datetime64[ns]
initial_claim                       float32
reflecting_week_ended        datetime64[ns]
continued_claims                    float32
covered_employment                  float32
insured_unemployment_rate           float32
dtype: object

In [6]:
# List the column names to confirm none of them contain spaces.
df.columns

Index(['state', 'filed_week_ended', 'initial_claim', 'reflecting_week_ended',
       'continued_claims', 'covered_employment', 'insured_unemployment_rate'],
      dtype='object')

In [7]:
# Count non-null values per column; a column whose count is lower than the
# others has missing entries.
df.count()

state                        14893
filed_week_ended             14893
initial_claim                14893
reflecting_week_ended        14893
continued_claims             14893
covered_employment           14893
insured_unemployment_rate    14893
dtype: int64

In [8]:
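# Drop all rows with missing information
# (Disabled: every column reports the same non-null count, so there is nothing to drop.
#  NOTE(review): `unemployment_df` is not defined in the cells shown; use `df` if this is re-enabled.)
# df = unemployment_df.dropna(how='any')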

In [9]:
# Re-count non-null values per column. The dropna cell above is commented out,
# so no rows have been removed at this point.
df.count()

state                        14893
filed_week_ended             14893
initial_claim                14893
reflecting_week_ended        14893
continued_claims             14893
covered_employment           14893
insured_unemployment_rate    14893
dtype: int64

In [10]:
# Count the rows for each distinct state value, to spot misspelled names or
# states with an unexpected number of rows.
df['state'].value_counts()

Arizona                 281
Delaware                281
Louisiana               281
Alabama                 281
Mississippi             281
Kentucky                281
Hawaii                  281
Rhode Island            281
South Carolina          281
Colorado                281
Nevada                  281
Virgin Islands          281
Montana                 281
Oklahoma                281
Vermont                 281
Oregon                  281
Iowa                    281
Maryland                281
Indiana                 281
Missouri                281
Wisconsin               281
Utah                    281
Washington              281
Florida                 281
New Jersey              281
South Dakota            281
New Mexico              281
New York                281
Arkansas                281
District of Columbia    281
Alaska                  281
West Virginia           281
Puerto Rico             281
Kansas                  281
Texas                   281
Virginia            

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, as a high-level look at the data.
df.describe()

Unnamed: 0,initial_claim,continued_claims,covered_employment,insured_unemployment_rate
count,14893.0,14893.0,14893.0,14893.0
mean,6839.409,44538.86,2642390.0,1.626655
std,23610.19,104460.9,2998995.75,1.898211
min,0.0,0.0,32446.0,0.0
25%,1090.0,8645.0,652934.0,0.85
50%,2538.0,19830.0,1797624.0,1.24
75%,5572.0,39682.0,3266242.0,1.82
max,1058325.0,4808361.0,17330010.0,31.200001


## ANALYZE

In [12]:
# Break the filing date into month / day / year columns. All three values are
# computed from filed_week_ended first, then assigned left to right, so the
# new columns are appended in the order File_Month, File_Day, File_Year.
df['File_Month'], df['File_Day'], df['File_Year'] = (
    getattr(df['filed_week_ended'].dt, part) for part in ('month', 'day', 'year')
)

In [13]:
# Preview the frame, now including the File_Month / File_Day / File_Year columns.
df.head()

Unnamed: 0,state,filed_week_ended,initial_claim,reflecting_week_ended,continued_claims,covered_employment,insured_unemployment_rate,File_Month,File_Day,File_Year
0,Alabama,2015-01-03,7785.0,2014-12-27,30194.0,1794067.0,1.68,1,3,2015
1,Alabama,2015-01-10,9139.0,2015-01-03,35246.0,1798083.0,1.96,1,10,2015
2,Alabama,2015-01-17,4654.0,2015-01-10,28200.0,1798083.0,1.57,1,17,2015
3,Alabama,2015-01-24,3284.0,2015-01-17,26222.0,1798083.0,1.46,1,24,2015
4,Alabama,2015-01-31,3203.0,2015-01-24,25391.0,1798083.0,1.41,1,31,2015


In [17]:
# Group the rows by filing year. Note: despite its name, year_df is a pandas
# GroupBy object, not a DataFrame.
year_df = df.groupby(['File_Year'])
# Non-null row count per column for each year (at most the first ten years).
year_df.count().head(10)

Unnamed: 0_level_0,state,filed_week_ended,initial_claim,reflecting_week_ended,continued_claims,covered_employment,insured_unemployment_rate,File_Month,File_Day
File_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015,2756,2756,2756,2756,2756,2756,2756,2756,2756
2016,2809,2809,2809,2809,2809,2809,2809,2809,2809
2017,2756,2756,2756,2756,2756,2756,2756,2756,2756
2018,2756,2756,2756,2756,2756,2756,2756,2756,2756
2019,2756,2756,2756,2756,2756,2756,2756,2756,2756
2020,1060,1060,1060,1060,1060,1060,1060,1060,1060


In [None]:
# Total initial claims filed in each calendar year (Series indexed by File_Year).
#
# initial_claim is loaded as float32, whose 24-bit significand cannot represent
# every whole number above 16,777,216. Yearly totals can exceed that, so the
# column is widened to float64 before summing to keep the totals exact.
claims_sum = df['initial_claim'].astype('float64').groupby(df['File_Year']).sum()

# Bare last expression: let the notebook display the result instead of print().
claims_sum

In [None]:
bar_df = 