In [1]:
import os
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
BIGQUERY_PROJECT = 'ironhacks-data'
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [2]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.unemployment_data`
"""
# QUERY THE DATA ONCE
query_job = bigquery_client.query(query)
unemp_data = query_job.to_dataframe()
unemp_data.head(5)

Unnamed: 0,uu_id,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,...,gender_female,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white
0,f013068de98db1470bd986137a0c6d23,20220416,16,18003,900,"Census Tract 9, Allen County, Indiana",22,0,,14.0,...,,,0,0,0.0,11.0,0.0,0.0,,
1,21957d5517323845818d87623589e1ba,20220319,12,18089,10400,"Census Tract 104, Lake County, Indiana",111,0,,108.0,...,76.0,35.0,0,0,0.0,,,0.0,0.0,0.0
2,6a5609f385912113b6f1014b958ed748,20220326,13,18089,11500,"Census Tract 115, Lake County, Indiana",39,0,,,...,,,0,0,0.0,39.0,0.0,0.0,0.0,0.0
3,46b2882ec4c373527ec33f7bd4f1388d,20220716,29,18089,20700,"Census Tract 207, Lake County, Indiana",14,0,,,...,,,0,0,,10.0,0.0,0.0,,
4,37495d17e82f7df326bfc2c4c090f7b7,20220409,15,18089,21900,"Census Tract 219, Lake County, Indiana",155,0,,69.0,...,90.0,65.0,0,0,0.0,135.0,,,,


In [3]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.wage_data`
"""
# QUERY THE DATA ONCE
query_job = bigquery_client.query(query)
wage_data = query_job.to_dataframe()
wage_data.head()

Unnamed: 0,uu_id,countyfips,tract,tract_name,average_wage
0,585f8731c2255d6b3f817a31180848b9,18177,200,"Census Tract 2, Wayne County, Indiana",6612.0
1,8c9d2aa90948679972a9382aadcc6001,18177,900,"Census Tract 9, Wayne County, Indiana",9883.25
2,0f3d45341a5b113b813ffb7be7f58bab,18183,50300,"Census Tract 503, Whitley County, Indiana",13992.25
3,fb55464f8e34af6d750d06968bf719b8,18183,50400,"Census Tract 504, Whitley County, Indiana",13613.5
4,983badfd7b568728e39a2344a9006078,18001,30200,"Census Tract 302, Adams County, Indiana",11816.666667


In [4]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""
# QUERY THE DATA ONCE
query_job = bigquery_client.query(query)
pred_data = query_job.to_dataframe()
pred_data.head()

Unnamed: 0,uu_id,week_number
0,5bf51fc2e162d6faf9e3cf79e4198378,44
1,420b44cc7e3f55d738df565421e59941,44
2,e39c66ecceec76ee8f9f811fa4a2d246,44
3,a90462cd11ae4e43144239bf7c4828a4,44
4,8b20a6749088c7ff1237983076ebfeaa,44


In [5]:
unemp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16833 entries, 0 to 16832
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   uu_id                   16833 non-null  object
 1   timeperiod              16833 non-null  Int64 
 2   week_number             16833 non-null  Int64 
 3   countyfips              16833 non-null  Int64 
 4   tract                   16833 non-null  Int64 
 5   tract_name              16833 non-null  object
 6   total_claims            16833 non-null  Int64 
 7   edu_8th_or_less         13748 non-null  Int64 
 8   edu_grades_9_11         5942 non-null   Int64 
 9   edu_hs_grad_equiv       6036 non-null   Int64 
 10  edu_post_hs             3246 non-null   Int64 
 11  edu_unknown             12031 non-null  Int64 
 12  top_category_employer1  16833 non-null  object
 13  top_category_employer2  16833 non-null  object
 14  top_category_employer3  16833 non-null  object
 15  ge

In [6]:
wage_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525 entries, 0 to 524
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   uu_id         525 non-null    object 
 1   countyfips    525 non-null    Int64  
 2   tract         525 non-null    Int64  
 3   tract_name    525 non-null    object 
 4   average_wage  525 non-null    float64
dtypes: Int64(2), float64(1), object(2)
memory usage: 21.7+ KB


In [7]:
unemp_data.isna().sum()

uu_id                         0
timeperiod                    0
week_number                   0
countyfips                    0
tract                         0
tract_name                    0
total_claims                  0
edu_8th_or_less            3085
edu_grades_9_11           10891
edu_hs_grad_equiv         10797
edu_post_hs               13587
edu_unknown                4802
top_category_employer1        0
top_category_employer2        0
top_category_employer3        0
gender_female             12998
gender_male               12902
gender_na                   965
race_amerindian            1548
race_asian                 1894
race_black                 9709
race_noanswer              6744
race_hawaiiannative         407
race_other                 8758
race_white                10441
dtype: int64

In [8]:
set1 = set(list(unemp_data['top_category_employer1'].unique()))
set1

{'11',
 '21',
 '22',
 '23',
 '31-33',
 '42',
 '44-45',
 '48-49',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '61',
 '62',
 '71',
 '72',
 '81',
 '92',
 '99'}

In [9]:
set2 = set(list(unemp_data['top_category_employer2'].unique()))
set2

{'11',
 '21',
 '22',
 '23',
 '31-33',
 '42',
 '44-45',
 '48-49',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '61',
 '62',
 '71',
 '72',
 '81',
 '92',
 '99',
 'N/A'}

In [10]:
set3 = set(list(unemp_data['top_category_employer3'].unique()))
set3

{'11',
 '21',
 '22',
 '23',
 '31-33',
 '42',
 '44-45',
 '48-49',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '61',
 '62',
 '71',
 '72',
 '81',
 '92',
 '99',
 'N/A'}

In [11]:
unemp_data['top_category_employer2'] = unemp_data['top_category_employer2'].replace('N/A',np.NaN)
unemp_data['top_category_employer3'] = unemp_data['top_category_employer3'].replace('N/A',np.NaN)

In [12]:
unemp_data['countyfips'].unique()

<IntegerArray>
[18003, 18089, 18067, 18039, 18095, 18033, 18035, 18103, 18065, 18173, 18097,
 18091, 18109, 18149, 18099, 18019, 18141, 18157, 18127, 18163, 18093, 18087,
 18085, 18081, 18057, 18063, 18053, 18167, 18177, 18021, 18105, 18001, 18043,
 18017, 18083, 18159, 18011, 18143, 18113, 18073, 18169, 18183, 18049, 18133,
 18059, 18153, 18061, 18023, 18041, 18015, 18135, 18047, 18165, 18077, 18009,
 18005, 18069, 18071, 18079, 18151]
Length: 60, dtype: Int64

In [13]:
(unemp_data.isna().sum()/len(unemp_data))*100

uu_id                      0.000000
timeperiod                 0.000000
week_number                0.000000
countyfips                 0.000000
tract                      0.000000
tract_name                 0.000000
total_claims               0.000000
edu_8th_or_less           18.327096
edu_grades_9_11           64.700291
edu_hs_grad_equiv         64.141864
edu_post_hs               80.716450
edu_unknown               28.527298
top_category_employer1     0.000000
top_category_employer2     0.291095
top_category_employer3     1.627755
gender_female             77.217371
gender_male               76.647062
gender_na                  5.732787
race_amerindian            9.196222
race_asian                11.251708
race_black                57.678370
race_noanswer             40.064160
race_hawaiiannative        2.417870
race_other                52.028753
race_white                62.026971
dtype: float64

In [14]:
(unemp_data.isna().sum()/len(unemp_data))*100

uu_id                      0.000000
timeperiod                 0.000000
week_number                0.000000
countyfips                 0.000000
tract                      0.000000
tract_name                 0.000000
total_claims               0.000000
edu_8th_or_less           18.327096
edu_grades_9_11           64.700291
edu_hs_grad_equiv         64.141864
edu_post_hs               80.716450
edu_unknown               28.527298
top_category_employer1     0.000000
top_category_employer2     0.291095
top_category_employer3     1.627755
gender_female             77.217371
gender_male               76.647062
gender_na                  5.732787
race_amerindian            9.196222
race_asian                11.251708
race_black                57.678370
race_noanswer             40.064160
race_hawaiiannative        2.417870
race_other                52.028753
race_white                62.026971
dtype: float64

In [15]:
unemp_data['edu_8th_or_less'] = unemp_data.groupby('countyfips')['edu_8th_or_less'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['edu_grades_9_11'] = unemp_data.groupby('countyfips')['edu_grades_9_11'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['edu_hs_grad_equiv'] = unemp_data.groupby('countyfips')['edu_hs_grad_equiv'].transform(lambda x: x.fillna(int(x.mean())))

In [16]:
unemp_data['race_asian'] = unemp_data.groupby('countyfips')['race_asian'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['race_amerindian'] = unemp_data.groupby('countyfips')['race_amerindian'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['race_black'] = unemp_data.groupby('countyfips')['race_black'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['race_hawaiiannative'] = unemp_data.groupby('countyfips')['race_hawaiiannative'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['race_other'] = unemp_data.groupby('countyfips')['race_other'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['race_white'] = unemp_data.groupby('countyfips')['race_white'].transform(lambda x: x.fillna(int(x.mean())))

In [17]:
unemp_data['race_noanswer'] = unemp_data['race_noanswer'].fillna(0)
unemp_data['gender_na'] = unemp_data['gender_na'].fillna(0)
unemp_data['edu_unknown'] = unemp_data['edu_unknown'].fillna(0)

In [18]:
unemp_data['top_category_employer2'] = unemp_data.groupby('countyfips')['top_category_employer2'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'unknown'))
unemp_data['top_category_employer3'] = unemp_data.groupby('countyfips')['top_category_employer3'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'unknown'))

In [19]:
unemp_data['gender_male'] = unemp_data['gender_male'].astype(float)
unemp_data['gender_female'] = unemp_data['gender_female'].astype(float)
unemp_data['edu_post_hs'] = unemp_data['edu_post_hs'].astype(float)

In [20]:
unemp_data['gender_male'] = unemp_data['gender_male'].fillna(unemp_data.groupby('countyfips')['gender_male'].transform('mean'))
unemp_data['gender_female'] = unemp_data['gender_female'].fillna(unemp_data.groupby('countyfips')['gender_female'].transform('mean'))
unemp_data['edu_post_hs'] = unemp_data['edu_post_hs'].fillna(unemp_data.groupby('countyfips')['edu_post_hs'].transform('mean'))

In [21]:
unemp_data.isna().sum()

uu_id                       0
timeperiod                  0
week_number                 0
countyfips                  0
tract                       0
tract_name                  0
total_claims                0
edu_8th_or_less             0
edu_grades_9_11             0
edu_hs_grad_equiv           0
edu_post_hs               124
edu_unknown                 0
top_category_employer1      0
top_category_employer2      0
top_category_employer3      0
gender_female             113
gender_male               113
gender_na                   0
race_amerindian             0
race_asian                  0
race_black                  0
race_noanswer               0
race_hawaiiannative         0
race_other                  0
race_white                  0
dtype: int64

In [22]:
unemp_data[unemp_data.gender_male.isnull()]

Unnamed: 0,uu_id,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,...,gender_female,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white
540,420b44cc7e3f55d738df565421e59941,20220108,2,18043,70400,"Census Tract 704, Floyd County, Indiana",15,0,3,11,...,,,0,0,0,0,0,0,0,12
541,420b44cc7e3f55d738df565421e59941,20220115,3,18043,70400,"Census Tract 704, Floyd County, Indiana",11,0,3,11,...,,,0,0,0,0,0,0,0,13
558,d7a8af51ca8eb58392ab833bc0ae516b,20220312,11,18083,955200,"Census Tract 9552, Knox County, Indiana",16,0,0,11,...,,,0,0,0,0,0,0,0,16
559,d7a8af51ca8eb58392ab833bc0ae516b,20220212,7,18083,955200,"Census Tract 9552, Knox County, Indiana",15,0,0,11,...,,,0,0,0,0,0,0,0,15
766,4a6ae3b5be1b9b7c10c177e1b9fded82,20220205,6,18169,102800,"Census Tract 1028, Wabash County, Indiana",16,0,0,11,...,,,0,0,0,0,0,0,0,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16347,ce8b2c0e11c538921c39c06b298d2393,20220730,31,18043,70901,"Census Tract 709.01, Floyd County, Indiana",13,0,0,11,...,,,0,0,0,0,0,0,0,13
16348,ce8b2c0e11c538921c39c06b298d2393,20220716,29,18043,70901,"Census Tract 709.01, Floyd County, Indiana",12,0,3,11,...,,,0,0,0,0,0,0,0,13
16431,6b22ecccaddde4516a88c6981373daf3,20220319,12,18135,951900,"Census Tract 9519, Randolph County, Indiana",10,0,0,10,...,,,0,0,0,0,0,0,0,10
16432,6b22ecccaddde4516a88c6981373daf3,20220219,8,18135,951900,"Census Tract 9519, Randolph County, Indiana",11,0,0,10,...,,,0,0,0,0,0,0,0,11


In [23]:
unemp_data['gender_male'] = unemp_data['gender_male'].fillna(unemp_data.groupby('countyfips')['gender_male'].transform('mean'))
unemp_data['gender_female'] = unemp_data['gender_female'].fillna(unemp_data.groupby('countyfips')['gender_female'].transform('mean'))
unemp_data['edu_post_hs'] = unemp_data['edu_post_hs'].fillna(unemp_data.groupby('countyfips')['edu_post_hs'].transform('mean'))

In [24]:
unemp_data.isna().sum()

uu_id                       0
timeperiod                  0
week_number                 0
countyfips                  0
tract                       0
tract_name                  0
total_claims                0
edu_8th_or_less             0
edu_grades_9_11             0
edu_hs_grad_equiv           0
edu_post_hs               124
edu_unknown                 0
top_category_employer1      0
top_category_employer2      0
top_category_employer3      0
gender_female             113
gender_male               113
gender_na                   0
race_amerindian             0
race_asian                  0
race_black                  0
race_noanswer               0
race_hawaiiannative         0
race_other                  0
race_white                  0
dtype: int64

In [25]:
unemp_data['gender_male'] = unemp_data.groupby('countyfips')['gender_male'].transform(lambda x: x.fillna(x.mean()))
unemp_data['gender_female'] = unemp_data.groupby('countyfips')['gender_female'].transform(lambda x: x.fillna(x.mean()))
unemp_data['edu_post_hs'] = unemp_data.groupby('countyfips')['edu_post_hs'].transform(lambda x: x.fillna(x.mean()))

In [26]:
unemp_data.isna().sum()

uu_id                       0
timeperiod                  0
week_number                 0
countyfips                  0
tract                       0
tract_name                  0
total_claims                0
edu_8th_or_less             0
edu_grades_9_11             0
edu_hs_grad_equiv           0
edu_post_hs               124
edu_unknown                 0
top_category_employer1      0
top_category_employer2      0
top_category_employer3      0
gender_female             113
gender_male               113
gender_na                   0
race_amerindian             0
race_asian                  0
race_black                  0
race_noanswer               0
race_hawaiiannative         0
race_other                  0
race_white                  0
dtype: int64

In [27]:
unemp_data.isnull()

Unnamed: 0,uu_id,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,...,gender_female,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16828,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
16829,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
16830,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
16831,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [28]:
unemp_data[unemp_data['gender_male'].isnull()]

Unnamed: 0,uu_id,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,...,gender_female,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white
540,420b44cc7e3f55d738df565421e59941,20220108,2,18043,70400,"Census Tract 704, Floyd County, Indiana",15,0,3,11,...,,,0,0,0,0,0,0,0,12
541,420b44cc7e3f55d738df565421e59941,20220115,3,18043,70400,"Census Tract 704, Floyd County, Indiana",11,0,3,11,...,,,0,0,0,0,0,0,0,13
558,d7a8af51ca8eb58392ab833bc0ae516b,20220312,11,18083,955200,"Census Tract 9552, Knox County, Indiana",16,0,0,11,...,,,0,0,0,0,0,0,0,16
559,d7a8af51ca8eb58392ab833bc0ae516b,20220212,7,18083,955200,"Census Tract 9552, Knox County, Indiana",15,0,0,11,...,,,0,0,0,0,0,0,0,15
766,4a6ae3b5be1b9b7c10c177e1b9fded82,20220205,6,18169,102800,"Census Tract 1028, Wabash County, Indiana",16,0,0,11,...,,,0,0,0,0,0,0,0,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16347,ce8b2c0e11c538921c39c06b298d2393,20220730,31,18043,70901,"Census Tract 709.01, Floyd County, Indiana",13,0,0,11,...,,,0,0,0,0,0,0,0,13
16348,ce8b2c0e11c538921c39c06b298d2393,20220716,29,18043,70901,"Census Tract 709.01, Floyd County, Indiana",12,0,3,11,...,,,0,0,0,0,0,0,0,13
16431,6b22ecccaddde4516a88c6981373daf3,20220319,12,18135,951900,"Census Tract 9519, Randolph County, Indiana",10,0,0,10,...,,,0,0,0,0,0,0,0,10
16432,6b22ecccaddde4516a88c6981373daf3,20220219,8,18135,951900,"Census Tract 9519, Randolph County, Indiana",11,0,0,10,...,,,0,0,0,0,0,0,0,11


In [29]:
unemp_data['gender_male'] = unemp_data['gender_male'].fillna(0)
unemp_data['gender_female'] = unemp_data['gender_female'].fillna(0)
unemp_data['edu_post_hs'] = unemp_data['edu_post_hs'].fillna(0)

In [30]:
unemp_data.isna().sum()

uu_id                     0
timeperiod                0
week_number               0
countyfips                0
tract                     0
tract_name                0
total_claims              0
edu_8th_or_less           0
edu_grades_9_11           0
edu_hs_grad_equiv         0
edu_post_hs               0
edu_unknown               0
top_category_employer1    0
top_category_employer2    0
top_category_employer3    0
gender_female             0
gender_male               0
gender_na                 0
race_amerindian           0
race_asian                0
race_black                0
race_noanswer             0
race_hawaiiannative       0
race_other                0
race_white                0
dtype: int64

In [31]:
unemp_data.isna().sum()

uu_id                     0
timeperiod                0
week_number               0
countyfips                0
tract                     0
tract_name                0
total_claims              0
edu_8th_or_less           0
edu_grades_9_11           0
edu_hs_grad_equiv         0
edu_post_hs               0
edu_unknown               0
top_category_employer1    0
top_category_employer2    0
top_category_employer3    0
gender_female             0
gender_male               0
gender_na                 0
race_amerindian           0
race_asian                0
race_black                0
race_noanswer             0
race_hawaiiannative       0
race_other                0
race_white                0
dtype: int64

In [32]:
wage_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525 entries, 0 to 524
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   uu_id         525 non-null    object 
 1   countyfips    525 non-null    Int64  
 2   tract         525 non-null    Int64  
 3   tract_name    525 non-null    object 
 4   average_wage  525 non-null    float64
dtypes: Int64(2), float64(1), object(2)
memory usage: 21.7+ KB


In [33]:
wage_data.describe()

Unnamed: 0,countyfips,tract,average_wage
count,525.0,525.0,525.0
mean,18087.15619,236688.508571,11864.620446
std,41.969982,323355.706854,4334.564054
min,18001.0,100.0,3992.5
25%,18059.0,10200.0,8797.6
50%,18093.0,42902.0,11140.333333
75%,18103.0,353600.0,14241.727273
max,18183.0,976400.0,41267.0


In [34]:
wage_data['average_wage'] = wage_data['average_wage'].fillna(wage_data.groupby('countyfips')['average_wage'].transform('mean'))

In [35]:
wage_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525 entries, 0 to 524
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   uu_id         525 non-null    object 
 1   countyfips    525 non-null    Int64  
 2   tract         525 non-null    Int64  
 3   tract_name    525 non-null    object 
 4   average_wage  525 non-null    float64
dtypes: Int64(2), float64(1), object(2)
memory usage: 21.7+ KB


In [36]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.wage_data`
"""
# QUERY THE DATA ONCE
query_job = bigquery_client.query(query)
wage_data = query_job.to_dataframe()
wage_data.head()

Unnamed: 0,uu_id,countyfips,tract,tract_name,average_wage
0,585f8731c2255d6b3f817a31180848b9,18177,200,"Census Tract 2, Wayne County, Indiana",6612.0
1,8c9d2aa90948679972a9382aadcc6001,18177,900,"Census Tract 9, Wayne County, Indiana",9883.25
2,0f3d45341a5b113b813ffb7be7f58bab,18183,50300,"Census Tract 503, Whitley County, Indiana",13992.25
3,fb55464f8e34af6d750d06968bf719b8,18183,50400,"Census Tract 504, Whitley County, Indiana",13613.5
4,983badfd7b568728e39a2344a9006078,18001,30200,"Census Tract 302, Adams County, Indiana",11816.666667


In [37]:
wage_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525 entries, 0 to 524
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   uu_id         525 non-null    object 
 1   countyfips    525 non-null    Int64  
 2   tract         525 non-null    Int64  
 3   tract_name    525 non-null    object 
 4   average_wage  525 non-null    float64
dtypes: Int64(2), float64(1), object(2)
memory usage: 21.7+ KB


In [38]:
wage_data['average_wage'] = wage_data['average_wage'].fillna(wage_data.groupby('countyfips')['average_wage'].transform('mean'))

In [39]:
final_data = unemp_data.merge(wage_data,how='left', on = 'uu_id)
final_data

SyntaxError: EOL while scanning string literal (1578303965.py, line 1)

In [40]:
final_data = unemp_data.merge(wage_data,how='left', on = 'uu_id')
final_data

Unnamed: 0,uu_id,timeperiod,week_number,countyfips_x,tract_x,tract_name_x,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,...,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white,countyfips_y,tract_y,tract_name_y,average_wage
0,f013068de98db1470bd986137a0c6d23,20220416,16,18003,900,"Census Tract 9, Allen County, Indiana",22,0,1,14,...,0,11,0,0,0,11,18003,900,"Census Tract 9, Allen County, Indiana",8347.125000
1,21957d5517323845818d87623589e1ba,20220319,12,18089,10400,"Census Tract 104, Lake County, Indiana",111,0,0,108,...,0,20,0,0,0,0,18089,10400,"Census Tract 104, Lake County, Indiana",7036.636364
2,6a5609f385912113b6f1014b958ed748,20220326,13,18089,11500,"Census Tract 115, Lake County, Indiana",39,0,0,22,...,0,39,0,0,0,0,18089,11500,"Census Tract 115, Lake County, Indiana",7890.142857
3,46b2882ec4c373527ec33f7bd4f1388d,20220716,29,18089,20700,"Census Tract 207, Lake County, Indiana",14,0,0,22,...,0,10,0,0,1,11,18089,20700,"Census Tract 207, Lake County, Indiana",7534.375000
4,37495d17e82f7df326bfc2c4c090f7b7,20220409,15,18089,21900,"Census Tract 219, Lake County, Indiana",155,0,0,69,...,0,135,0,0,1,11,18089,21900,"Census Tract 219, Lake County, Indiana",11825.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16828,f35320206f3dd1ffc411e4ac127caf92,20220618,25,18163,3804,"Census Tract 38.04, Vanderburgh County, Indiana",21,0,1,18,...,0,6,0,0,0,14,18163,3804,"Census Tract 38.04, Vanderburgh County, Indiana",13163.083333
16829,f35320206f3dd1ffc411e4ac127caf92,20220827,35,18163,3804,"Census Tract 38.04, Vanderburgh County, Indiana",22,0,1,13,...,0,6,0,0,0,15,18163,3804,"Census Tract 38.04, Vanderburgh County, Indiana",13163.083333
16830,f35320206f3dd1ffc411e4ac127caf92,20220827,35,18163,3804,"Census Tract 38.04, Vanderburgh County, Indiana",22,0,1,13,...,0,6,0,0,0,15,18163,3804,"Census Tract 38.04, Vanderburgh County, Indiana",13163.083333
16831,bbcb018f0e5e49e13636f6e78ce9f60f,20220326,13,18163,10203,"Census Tract 102.03, Vanderburgh County, Indiana",53,0,1,48,...,0,6,0,0,0,15,18163,10203,"Census Tract 102.03, Vanderburgh County, Indiana",10040.111111


In [41]:
final_data = unemp_data.merge(wage_data,how='left', on = 'uu_id')

In [42]:
final_data1 = final_data.drop(['timeperiod','tract','tract_name'],axis=1)

KeyError: "['tract', 'tract_name'] not found in axis"

In [43]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16833 entries, 0 to 16832
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uu_id                   16833 non-null  object 
 1   timeperiod              16833 non-null  Int64  
 2   week_number             16833 non-null  Int64  
 3   countyfips_x            16833 non-null  Int64  
 4   tract_x                 16833 non-null  Int64  
 5   tract_name_x            16833 non-null  object 
 6   total_claims            16833 non-null  Int64  
 7   edu_8th_or_less         16833 non-null  Int64  
 8   edu_grades_9_11         16833 non-null  Int64  
 9   edu_hs_grad_equiv       16833 non-null  Int64  
 10  edu_post_hs             16833 non-null  float64
 11  edu_unknown             16833 non-null  Int64  
 12  top_category_employer1  16833 non-null  object 
 13  top_category_employer2  16833 non-null  object 
 14  top_category_employer3  16833 non-null

In [44]:
final_data1 = final_data.drop(['timeperiod','tract_x','tract_name_x','tract_y','tract_name_y','countyfips_y'],axis=1)

#Modelling

In [45]:
final_data1 = final_data.drop(['timeperiod','tract_x','tract_name_x','tract_y','tract_name_y','countyfips_y','uu_id'],axis=1)

In [46]:
fin = pd.get_dummies(final_data1,columns = ['week_number','countyfips_x','top_category_employer1','top_category_employer2','top_category_employer3'])

In [47]:
fin.shape()

TypeError: 'tuple' object is not callable

In [48]:
fin.shape

(16833, 175)

In [49]:
X = fin.drop(['total_claims'],axis=1)
Y = fin['total_claims']

In [50]:
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
principalComponents = pca.fit_transform(X)

In [51]:
principalComponents

array([[-3.34822986e+03, -2.64007018e+00, -3.16860568e+00, ...,
         3.23264050e-01, -8.69879363e-02,  1.23030381e-01],
       [-4.65874176e+03,  6.55827964e+01,  3.57040468e+01, ...,
        -3.41617600e-01, -1.15859843e-02, -1.30081905e-02],
       [-3.80523585e+03,  2.39696662e+01, -1.79858245e+01, ...,
        -5.18290323e-02,  1.21413854e-01,  8.34402708e-03],
       ...,
       [ 1.46773188e+03, -7.22179003e+00, -3.28672487e+00, ...,
        -1.36275043e-01,  3.15814289e-03, -1.31928155e-01],
       [-1.65524425e+03,  5.45629141e+00,  1.17897046e+01, ...,
        -1.63652576e-01, -1.97897868e-01,  1.04909590e-02],
       [-1.65524023e+03, -9.07100561e+00, -1.50430472e+00, ...,
        -8.34723233e-02,  3.77386132e-03,  6.10821776e-02]])

In [52]:
df_X = pd.DataFrame(principalComponents)

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=28)

In [54]:
lm = LinearRegression().fit(X_train,Y_train)
prediction = model.predict(X_test)

NameError: name 'model' is not defined

In [55]:
lm = LinearRegression().fit(X_train,Y_train)
prediction = lm.predict(X_test)

In [56]:
lm.score(X_test, Y_test)

0.7275411262156158

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test,Y_train, Y_test = train_test_split(df_X,Y,test_size = 0.2, random_state=28)

In [58]:
lm = LinearRegression().fit(X_train,Y_train)
prediction = lm.predict(X_test)

In [59]:
lm.score(X_test, Y_test)

0.68873167227177

In [60]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
principalComponents = pca.fit_transform(X)

In [61]:
df_X = pd.DataFrame(principalComponents)

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test,Y_train, Y_test = train_test_split(df_X,Y,test_size = 0.2, random_state=28)

In [63]:
lm = LinearRegression().fit(X_train,Y_train)
prediction = lm.predict(X_test)

In [64]:
lm.score(X_test, Y_test)

0.6544127729762637

In [65]:
lm = LinearRegression().fit(X_train,Y_train)
prediction = lm.predict(X_test)

In [66]:
lm = LinearRegression().fit(X_train,Y_train)
prediction = lm.predict(X_test)

In [67]:
lm.score(X_test, Y_test)

0.6544127729762637

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=28)

In [69]:
lm = LinearRegression().fit(X_train,Y_train)
prediction = lm.predict(X_test)

In [70]:
lm.score(X_test, Y_test)

0.7275411262156158

In [71]:
mean_squared_error(Y_train, prediction)

NameError: name 'mean_squared_error' is not defined

In [72]:
lm.mean_squared_error(Y_train, prediction)

AttributeError: 'LinearRegression' object has no attribute 'mean_squared_error'

In [73]:
model = XGBRegressor(n_estimators=500, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

NameError: name 'XGBRegressor' is not defined

In [74]:
import xgboost as xgb

In [75]:
import xgboost as xgb

In [76]:
get_ipython().system('pip install xgboost')





In [77]:
import xgboost as xgb

In [78]:
model = XGBRegressor(n_estimators=500, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

NameError: name 'XGBRegressor' is not defined

In [79]:
model = xgb.XGBRegressor(n_estimators=500, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

In [80]:
model.score(X_test, Y_test)

0.9108049625244061

In [81]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=500, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

In [82]:
model.score(X_test, Y_test)

0.9108049625244061

In [83]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=400, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

In [84]:
model.score(X_test, Y_test)

0.9071078207301947

In [85]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=600, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

In [86]:
model.score(X_test, Y_test)

0.913941243290783

In [87]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=28)

In [88]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=600, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

In [89]:
model.score(X_test, Y_test)

0.913941243290783

In [90]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=32)

In [91]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=600, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

In [92]:
model.score(X_test, Y_test)

0.9102761242741739

In [93]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=500, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

In [94]:
model.score(X_test, Y_test)

0.9066691665646758

In [95]:
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

In [96]:
ttt = model.fit(pred_data)

TypeError: fit() missing 1 required positional argument: 'y'

In [97]:
pred = model.predict(pred_data)

ValueError: Feature shape mismatch, expected: 174, got 2

In [98]:
pred_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525 entries, 0 to 524
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   uu_id        525 non-null    object
 1   week_number  525 non-null    Int64 
dtypes: Int64(1), object(1)
memory usage: 8.8+ KB


In [99]:
unemp_data.describe()

Unnamed: 0,timeperiod,week_number,countyfips,tract,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,edu_unknown,gender_female,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white
count,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0
mean,20220540.093329,21.329531,18088.919682,211605.900315,22.952712,0.065348,1.340462,18.376879,16.171548,0.094339,17.669458,18.869913,0.005228,0.009921,0.012475,11.101705,0.124161,0.0,0.446801,16.257233
std,268.366832,11.618554,41.019467,288715.02492,14.583655,1.03561,3.451816,8.139188,5.892916,1.499568,6.761315,6.507586,0.441002,0.469947,0.535979,12.039616,1.947116,0.0,2.40954,7.670217
min,20220101.0,1.0,18001.0,100.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20220312.0,11.0,18063.0,10100.0,14.0,0.0,0.0,14.0,14.539683,0.0,15.032258,16.550898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0
50%,20220528.0,22.0,18095.0,42801.0,18.0,0.0,0.0,18.0,17.219048,0.0,17.381818,18.784314,0.0,0.0,0.0,7.0,0.0,0.0,0.0,15.0
75%,20220806.0,32.0,18103.0,342600.0,27.0,0.0,2.0,21.0,17.775814,0.0,20.487537,20.147059,0.0,0.0,0.0,20.0,0.0,0.0,0.0,18.0
max,20220910.0,37.0,18183.0,976400.0,170.0,41.0,78.0,137.0,154.0,60.0,126.0,139.0,49.0,36.0,41.0,163.0,110.0,0.0,74.0,155.0


In [100]:
tt = pred_data.merge(fin,how = 'left',on= 'uu_id')
tt                     

KeyError: 'uu_id'

In [101]:
tt = pred_data.merge(fin,how = 'left', final_data= 'uu_id')
tt                     

TypeError: merge() got an unexpected keyword argument 'final_data'

In [102]:
tt = pred_data.merge(final_data,how = 'left', on= 'uu_id')
tt                     

Unnamed: 0,uu_id,week_number_x,timeperiod,week_number_y,countyfips_x,tract_x,tract_name_x,total_claims,edu_8th_or_less,edu_grades_9_11,...,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white,countyfips_y,tract_y,tract_name_y,average_wage
0,5bf51fc2e162d6faf9e3cf79e4198378,44,20220625,26,18103,952800,"Census Tract 9528, Miami County, Indiana",20,0,0,...,0,0,0,0,0,15,18103,952800,"Census Tract 9528, Miami County, Indiana",5008.5
1,5bf51fc2e162d6faf9e3cf79e4198378,44,20220820,34,18103,952800,"Census Tract 9528, Miami County, Indiana",11,0,0,...,0,0,0,0,0,15,18103,952800,"Census Tract 9528, Miami County, Indiana",5008.5
2,5bf51fc2e162d6faf9e3cf79e4198378,44,20220820,34,18103,952800,"Census Tract 9528, Miami County, Indiana",11,0,0,...,0,0,0,0,0,15,18103,952800,"Census Tract 9528, Miami County, Indiana",5008.5
3,5bf51fc2e162d6faf9e3cf79e4198378,44,20220827,35,18103,952800,"Census Tract 9528, Miami County, Indiana",20,0,0,...,0,0,0,0,0,15,18103,952800,"Census Tract 9528, Miami County, Indiana",5008.5
4,5bf51fc2e162d6faf9e3cf79e4198378,44,20220827,35,18103,952800,"Census Tract 9528, Miami County, Indiana",20,0,0,...,0,0,0,0,0,15,18103,952800,"Census Tract 9528, Miami County, Indiana",5008.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16828,932a58530abff8a48558e2a15639d453,44,20220813,33,18091,41500,"Census Tract 415, LaPorte County, Indiana",17,0,0,...,0,0,0,0,0,17,18091,41500,"Census Tract 415, LaPorte County, Indiana",8306.0
16829,932a58530abff8a48558e2a15639d453,44,20220507,19,18091,41500,"Census Tract 415, LaPorte County, Indiana",11,0,0,...,0,0,0,0,0,17,18091,41500,"Census Tract 415, LaPorte County, Indiana",8306.0
16830,932a58530abff8a48558e2a15639d453,44,20220716,29,18091,41500,"Census Tract 415, LaPorte County, Indiana",25,0,15,...,0,0,0,0,0,17,18091,41500,"Census Tract 415, LaPorte County, Indiana",8306.0
16831,932a58530abff8a48558e2a15639d453,44,20220409,15,18091,41500,"Census Tract 415, LaPorte County, Indiana",13,0,0,...,0,7,0,0,0,17,18091,41500,"Census Tract 415, LaPorte County, Indiana",8306.0


In [103]:
final_data1 = final_data.drop(['timeperiod','tract_x','tract_name_x','tract_y','tract_name_y','countyfips_y'],axis=1)

In [104]:
fin = final_data1.set_index('uu_id')

In [105]:
fin = pd.get_dummies(final_data1,columns = ['week_number','countyfips_x','top_category_employer1','top_category_employer2','top_category_employer3'])

In [106]:
fin.shape

(16833, 176)

In [107]:
X = fin.drop(['total_claims'],axis=1)
Y = fin['total_claims']

In [108]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=32)

In [109]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=500, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:uu_id: object

In [110]:
final_data1 = final_data.drop(['timeperiod','tract_x','tract_name_x','tract_y','tract_name_y','countyfips_y'],axis=1)

In [111]:
fin = pd.get_dummies(final_data1,columns = ['week_number','countyfips_x','top_category_employer1','top_category_employer2','top_category_employer3'])

In [112]:
X = fin.drop(['total_claims'],axis=1)
Y = fin['total_claims']

In [113]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=32)

In [114]:
X_train_data = X_train.drop(['uu_id'],axis=1)
X_test_data = X_test.drop(['uu_id'],axis=1)

In [115]:
lm = LinearRegression().fit(X_train_data,Y_train)
prediction = lm.predict(X_test_data)

In [116]:
lm.score(X_test_data, Y_test)

0.7231720964615613

In [117]:
fin = fin.set_index('uu_id')
X_train, X_test, y_train, y_test = test_train_split(fin.ix[:, ~fin.columns.isin(['total_claims'])], fin.total_claims)

NameError: name 'test_train_split' is not defined

In [118]:
fin = fin.set_index('uu_id')
X_train, X_test, y_train, y_test = train_test_split(fin.ix[:, ~fin.columns.isin(['total_claims'])], fin.total_claims)

KeyError: "None of ['uu_id'] are in the columns"

In [119]:
final_data1 = final_data.drop(['timeperiod','tract_x','tract_name_x','tract_y','tract_name_y','countyfips_y'],axis=1)

In [120]:
fin = pd.get_dummies(final_data1,columns = ['week_number','countyfips_x','top_category_employer1','top_category_employer2','top_category_employer3'])

In [121]:
fin = fin.set_index('uu_id')
X_train, X_test, y_train, y_test = train_test_split(fin.ix[:, ~fin.columns.isin(['total_claims'])], fin.total_claims)

AttributeError: 'DataFrame' object has no attribute 'ix'

In [122]:
fin = fin.set_index('uu_id')
X = fin.drop(['total_claims'],axis=1)
Y=fin['total_claims']
#X_train, X_test, y_train, y_test = train_test_split(fin.ix[:, ~fin.columns.isin(['total_claims'])], fin.total_claims)

KeyError: "None of ['uu_id'] are in the columns"

In [123]:
fin = pd.get_dummies(final_data1,columns = ['week_number','countyfips_x','top_category_employer1','top_category_employer2','top_category_employer3'])

In [124]:
fin = fin.set_index('uu_id')
X = fin.drop(['total_claims'],axis=1)
Y=fin['total_claims']
#X_train, X_test, y_train, y_test = train_test_split(fin.ix[:, ~fin.columns.isin(['total_claims'])], fin.total_claims)

In [125]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=32)

In [126]:
lm = LinearRegression().fit(X_train_data,Y_train)
prediction = lm.predict(X_test_data)

In [127]:
lm.score(X_test_data, Y_test)

0.7231720964615613

In [128]:
lm = LinearRegression().fit(X_train,Y_train)
prediction = lm.predict(X_test)

In [129]:
lm.score(X_test, Y_test)

0.7231720964615613

In [130]:
final_data1 = final_data.drop(['timeperiod','tract_x','tract_name_x','tract_y','tract_name_y','countyfips_y'],axis=1)

In [131]:
fin = pd.get_dummies(final_data1,columns = ['week_number','countyfips_x','top_category_employer1','top_category_employer2','top_category_employer3'])

In [132]:
fin = fin.set_index('uu_id')
X = fin.drop(['total_claims'],axis=1)
Y=fin['total_claims']
#X_train, X_test, y_train, y_test = train_test_split(fin.ix[:, ~fin.columns.isin(['total_claims'])], fin.total_claims)

In [133]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=32)

In [134]:
lm = LinearRegression().fit(X_train,Y_train)
prediction = lm.predict(X_test)

In [135]:
lm.score(X_test, Y_test)

0.7231720964615613

In [136]:
final_data1 = final_data.drop(['timeperiod','tract_x','tract_name_x','tract_y','tract_name_y','countyfips_y'],axis=1)

In [137]:
fin = pd.get_dummies(final_data1,columns = ['week_number','countyfips_x','top_category_employer1','top_category_employer2','top_category_employer3'])

In [138]:
X = fin.drop(['uu_id','total_claims'],axis=1)
Y=fin['total_claims']

In [139]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=32)

In [140]:
lm = LinearRegression().fit(X_train,Y_train)
prediction = lm.predict(X_test)

In [141]:
lm.score(X_test, Y_test)

0.7231720964615613

In [142]:
final_data = unemp_data.merge(wage_data,how='left', on = 'uu_id')

In [143]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16833 entries, 0 to 16832
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uu_id                   16833 non-null  object 
 1   timeperiod              16833 non-null  Int64  
 2   week_number             16833 non-null  Int64  
 3   countyfips_x            16833 non-null  Int64  
 4   tract_x                 16833 non-null  Int64  
 5   tract_name_x            16833 non-null  object 
 6   total_claims            16833 non-null  Int64  
 7   edu_8th_or_less         16833 non-null  Int64  
 8   edu_grades_9_11         16833 non-null  Int64  
 9   edu_hs_grad_equiv       16833 non-null  Int64  
 10  edu_post_hs             16833 non-null  float64
 11  edu_unknown             16833 non-null  Int64  
 12  top_category_employer1  16833 non-null  object 
 13  top_category_employer2  16833 non-null  object 
 14  top_category_employer3  16833 non-null

In [144]:
final_data1 = final_data.drop(['timeperiod','tract_x','tract_name_x','tract_y','tract_name_y','countyfips_y'],axis=1)

In [145]:
fin = pd.get_dummies(final_data1,columns = ['week_number','countyfips_x','top_category_employer1','top_category_employer2','top_category_employer3'])

In [146]:
X = fin.drop(['uu_id','total_claims'],axis=1)
Y=fin['total_claims']

In [147]:
X = fin.drop(['uu_id','total_claims'],axis=1)
Y=fin[['total_claims']]

In [148]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=32)

In [149]:
lm = LinearRegression().fit(X_train,Y_train)
prediction = lm.predict(X_test)

In [150]:
lm.score(X_test, Y_test)

0.7231720964615613

In [151]:
X = fin.drop(['uu_id','total_claims'],axis=1)
Y=fin['total_claims']

In [152]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=32)

In [153]:
lm = LinearRegression().fit(X_train,Y_train)
prediction = lm.predict(X_test)

In [154]:
lm.score(X_test, Y_test)

0.7231720964615613

In [155]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=500, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

In [156]:
model.score(X_test, Y_test)

0.9066691665646758

In [157]:
df_pred_final = pred_data[['uu_id']]
df_pred_final['week_number'] = pred_data[['week_number']]
df_pred_final["total_claims"] = y_pred

ValueError: Length of values (3367) does not match length of index (525)

In [158]:
Y_test_final = pred_data.merge(fin,how='left',on= 'uu_id')

In [159]:
Y_test_final

Unnamed: 0,uu_id,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,edu_unknown,gender_female,gender_male,...,top_category_employer3_54,top_category_employer3_55,top_category_employer3_56,top_category_employer3_61,top_category_employer3_62,top_category_employer3_71,top_category_employer3_72,top_category_employer3_81,top_category_employer3_92,top_category_employer3_99
0,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,11,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
1,5bf51fc2e162d6faf9e3cf79e4198378,44,11,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
2,5bf51fc2e162d6faf9e3cf79e4198378,44,11,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
3,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
4,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16828,932a58530abff8a48558e2a15639d453,44,17,0,0,17,0.000000,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16829,932a58530abff8a48558e2a15639d453,44,11,0,0,17,14.539683,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16830,932a58530abff8a48558e2a15639d453,44,25,0,15,10,0.000000,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16831,932a58530abff8a48558e2a15639d453,44,13,0,0,17,14.539683,0,17.859649,17.985075,...,0,0,0,0,0,1,0,0,0,0


In [160]:
Y_test_final = pred_data.merge(fin,how='inner',on= 'uu_id')

In [161]:
Y_test_final

Unnamed: 0,uu_id,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,edu_unknown,gender_female,gender_male,...,top_category_employer3_54,top_category_employer3_55,top_category_employer3_56,top_category_employer3_61,top_category_employer3_62,top_category_employer3_71,top_category_employer3_72,top_category_employer3_81,top_category_employer3_92,top_category_employer3_99
0,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,11,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
1,5bf51fc2e162d6faf9e3cf79e4198378,44,11,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
2,5bf51fc2e162d6faf9e3cf79e4198378,44,11,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
3,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
4,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16828,932a58530abff8a48558e2a15639d453,44,17,0,0,17,0.000000,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16829,932a58530abff8a48558e2a15639d453,44,11,0,0,17,14.539683,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16830,932a58530abff8a48558e2a15639d453,44,25,0,15,10,0.000000,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16831,932a58530abff8a48558e2a15639d453,44,13,0,0,17,14.539683,0,17.859649,17.985075,...,0,0,0,0,0,1,0,0,0,0


In [162]:
Y_test_final

Unnamed: 0,uu_id,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,edu_unknown,gender_female,gender_male,...,top_category_employer3_54,top_category_employer3_55,top_category_employer3_56,top_category_employer3_61,top_category_employer3_62,top_category_employer3_71,top_category_employer3_72,top_category_employer3_81,top_category_employer3_92,top_category_employer3_99
0,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,11,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
1,5bf51fc2e162d6faf9e3cf79e4198378,44,11,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
2,5bf51fc2e162d6faf9e3cf79e4198378,44,11,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
3,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
4,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16828,932a58530abff8a48558e2a15639d453,44,17,0,0,17,0.000000,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16829,932a58530abff8a48558e2a15639d453,44,11,0,0,17,14.539683,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16830,932a58530abff8a48558e2a15639d453,44,25,0,15,10,0.000000,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16831,932a58530abff8a48558e2a15639d453,44,13,0,0,17,14.539683,0,17.859649,17.985075,...,0,0,0,0,0,1,0,0,0,0


In [163]:
Y_test_final = pred_data.merge(fin,how='right',on= 'uu_id')

In [164]:
Y_test_final

Unnamed: 0,uu_id,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,edu_unknown,gender_female,gender_male,...,top_category_employer3_54,top_category_employer3_55,top_category_employer3_56,top_category_employer3_61,top_category_employer3_62,top_category_employer3_71,top_category_employer3_72,top_category_employer3_81,top_category_employer3_92,top_category_employer3_99
0,f013068de98db1470bd986137a0c6d23,44,22,0,1,14,17.321101,0,19.757282,20.147059,...,0,0,0,0,1,0,0,0,0,0
1,21957d5517323845818d87623589e1ba,44,111,0,0,108,19.138743,0,76.000000,35.000000,...,0,0,1,0,0,0,0,0,0,0
2,6a5609f385912113b6f1014b958ed748,44,39,0,0,22,19.138743,0,22.363248,20.787276,...,0,0,1,0,0,0,0,0,0,0
3,46b2882ec4c373527ec33f7bd4f1388d,44,14,0,0,22,19.138743,0,22.363248,20.787276,...,0,0,0,0,0,0,0,0,0,0
4,37495d17e82f7df326bfc2c4c090f7b7,44,155,0,0,69,83.000000,0,90.000000,65.000000,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16828,f35320206f3dd1ffc411e4ac127caf92,44,21,0,1,18,12.000000,0,14.853333,16.133333,...,0,0,0,0,0,0,0,1,0,0
16829,f35320206f3dd1ffc411e4ac127caf92,44,22,0,1,13,14.921053,0,14.853333,16.133333,...,0,0,0,0,0,0,0,0,0,0
16830,f35320206f3dd1ffc411e4ac127caf92,44,22,0,1,13,14.921053,0,14.853333,16.133333,...,0,0,0,0,0,0,0,0,0,0
16831,bbcb018f0e5e49e13636f6e78ce9f60f,44,53,0,1,48,14.921053,0,14.853333,16.133333,...,0,0,0,0,0,0,0,0,0,0


In [165]:
Y_test_final = pred_data.merge(fin,how='left',on= 'uu_id')

In [166]:
Y_test_final

Unnamed: 0,uu_id,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,edu_unknown,gender_female,gender_male,...,top_category_employer3_54,top_category_employer3_55,top_category_employer3_56,top_category_employer3_61,top_category_employer3_62,top_category_employer3_71,top_category_employer3_72,top_category_employer3_81,top_category_employer3_92,top_category_employer3_99
0,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,11,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
1,5bf51fc2e162d6faf9e3cf79e4198378,44,11,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
2,5bf51fc2e162d6faf9e3cf79e4198378,44,11,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
3,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
4,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16828,932a58530abff8a48558e2a15639d453,44,17,0,0,17,0.000000,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16829,932a58530abff8a48558e2a15639d453,44,11,0,0,17,14.539683,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16830,932a58530abff8a48558e2a15639d453,44,25,0,15,10,0.000000,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16831,932a58530abff8a48558e2a15639d453,44,13,0,0,17,14.539683,0,17.859649,17.985075,...,0,0,0,0,0,1,0,0,0,0


In [167]:
df_test = fin[fin.uu_id == i for i in pred_data['uu_id']]
df_test

SyntaxError: invalid syntax (953381775.py, line 1)

In [168]:
Y_test_final = pred_data.join(fin)

ValueError: columns overlap but no suffix specified: Index(['uu_id'], dtype='object')

In [169]:
final_data1 = final_data.drop(['timeperiod','tract_x','tract_name_x','tract_y','tract_name_y','countyfips_y'],axis=1)

In [170]:
fin = pd.get_dummies(final_data1,columns = ['week_number','countyfips_x','top_category_employer1','top_category_employer2','top_category_employer3'])

In [171]:
Y_test_final = pred_data.join(fin)

ValueError: columns overlap but no suffix specified: Index(['uu_id'], dtype='object')

In [172]:
Y_test_final = pred_data.merge(fin,how='left',on = 'uu_id')

In [173]:
Y_test_final

Unnamed: 0,uu_id,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,edu_unknown,gender_female,gender_male,...,top_category_employer3_54,top_category_employer3_55,top_category_employer3_56,top_category_employer3_61,top_category_employer3_62,top_category_employer3_71,top_category_employer3_72,top_category_employer3_81,top_category_employer3_92,top_category_employer3_99
0,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,11,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
1,5bf51fc2e162d6faf9e3cf79e4198378,44,11,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
2,5bf51fc2e162d6faf9e3cf79e4198378,44,11,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
3,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
4,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16828,932a58530abff8a48558e2a15639d453,44,17,0,0,17,0.000000,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16829,932a58530abff8a48558e2a15639d453,44,11,0,0,17,14.539683,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16830,932a58530abff8a48558e2a15639d453,44,25,0,15,10,0.000000,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16831,932a58530abff8a48558e2a15639d453,44,13,0,0,17,14.539683,0,17.859649,17.985075,...,0,0,0,0,0,1,0,0,0,0


In [174]:
pred_data = pred_data.set_index('uu_id')
fin = fin.set_index('uu_id)
Y_test_final = pred_data.join(fin)

SyntaxError: EOL while scanning string literal (2086564235.py, line 2)

In [175]:
pred_data = pred_data.set_index('uu_id')
fin = fin.set_index('uu_id')
Y_test_final = pred_data.join(fin)

In [176]:
Y_test_final

Unnamed: 0_level_0,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,edu_unknown,gender_female,gender_male,gender_na,...,top_category_employer3_54,top_category_employer3_55,top_category_employer3_56,top_category_employer3_61,top_category_employer3_62,top_category_employer3_71,top_category_employer3_72,top_category_employer3_81,top_category_employer3_92,top_category_employer3_99
uu_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001cd9ae23064d7f0fd3cd327c873d8d,44,13,0,0,14,12.166667,0,7.500000,19.187500,0,...,0,0,0,0,0,0,0,1,0,0
001cd9ae23064d7f0fd3cd327c873d8d,44,10,0,0,14,12.166667,0,7.500000,19.187500,0,...,0,0,0,0,0,0,0,1,0,0
001cd9ae23064d7f0fd3cd327c873d8d,44,10,0,0,14,12.166667,0,7.500000,19.187500,0,...,0,0,0,0,0,0,0,1,0,0
001cd9ae23064d7f0fd3cd327c873d8d,44,13,0,0,14,12.166667,0,7.500000,19.187500,0,...,0,0,0,0,0,0,0,1,0,0
001cd9ae23064d7f0fd3cd327c873d8d,44,10,0,0,14,12.166667,0,7.500000,19.187500,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fec479d0202d6e1e3f051a9ee902ff5d,44,26,0,0,16,19.138743,0,22.363248,20.787276,0,...,0,0,0,0,0,0,0,0,0,0
fec479d0202d6e1e3f051a9ee902ff5d,44,30,0,0,19,19.138743,0,22.363248,20.787276,0,...,0,0,1,0,0,0,0,0,0,0
fec479d0202d6e1e3f051a9ee902ff5d,44,34,0,0,18,12.000000,0,10.000000,24.000000,0,...,0,0,0,0,0,0,0,0,0,0
fec479d0202d6e1e3f051a9ee902ff5d,44,21,0,0,22,19.138743,0,22.363248,20.787276,0,...,0,0,0,0,0,0,0,0,0,0


In [177]:
X = fin.drop(['uu_id','total_claims'],axis=1)
Y=fin['total_claims']

KeyError: "['uu_id'] not found in axis"

In [178]:
final_data1 = final_data.drop(['timeperiod','tract_x','tract_name_x','tract_y','tract_name_y','countyfips_y'],axis=1)

In [179]:
fin = pd.get_dummies(final_data1,columns = ['week_number','countyfips_x','top_category_employer1','top_category_employer2','top_category_employer3'])

In [180]:
X = fin.drop(['uu_id','total_claims'],axis=1)
Y=fin['total_claims']

In [181]:
from sklearn.model_selection import train_test_split
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=32)

In [182]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=500, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

In [183]:
df_test = pred_data.merge(fin,how='left',on= 'uu_id')

In [184]:
X_df_test = df_test.drop(['uu_id','total_claims'],axis=1)
Y_df_test = df_test['total_claims']

In [185]:
prediction = model.predict(X_df_test)

ValueError: Feature shape mismatch, expected: 174, got 175

In [186]:
df_test

Unnamed: 0,uu_id,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,edu_unknown,gender_female,gender_male,...,top_category_employer3_54,top_category_employer3_55,top_category_employer3_56,top_category_employer3_61,top_category_employer3_62,top_category_employer3_71,top_category_employer3_72,top_category_employer3_81,top_category_employer3_92,top_category_employer3_99
0,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,11,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
1,5bf51fc2e162d6faf9e3cf79e4198378,44,11,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
2,5bf51fc2e162d6faf9e3cf79e4198378,44,11,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
3,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
4,5bf51fc2e162d6faf9e3cf79e4198378,44,20,0,0,13,9.166667,0,17.416667,14.800000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16828,932a58530abff8a48558e2a15639d453,44,17,0,0,17,0.000000,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16829,932a58530abff8a48558e2a15639d453,44,11,0,0,17,14.539683,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16830,932a58530abff8a48558e2a15639d453,44,25,0,15,10,0.000000,0,17.859649,17.985075,...,0,0,0,0,0,0,0,1,0,0
16831,932a58530abff8a48558e2a15639d453,44,13,0,0,17,14.539683,0,17.859649,17.985075,...,0,0,0,0,0,1,0,0,0,0


In [187]:
final_data1 = final_data.drop(['timeperiod','tract_x','tract_name_x','tract_y','tract_name_y','countyfips_y','week_number'],axis=1)

In [188]:
fin = pd.get_dummies(final_data1,columns = ['countyfips_x','top_category_employer1','top_category_employer2','top_category_employer3'])

In [189]:
X = fin.drop(['uu_id','total_claims'],axis=1)
Y=fin['total_claims']

In [190]:
from sklearn.model_selection import train_test_split
X_train, X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=32)

In [191]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=500, max_depth=4,eta=0.1).fit(X_train,Y_train)
y_pred = model.predict(X_test)

In [192]:
model.score(X_test, Y_test)

0.9108679506131394

In [193]:
df_test = pred_data.merge(fin,how='left',on= 'uu_id')

In [194]:
X_df_test = df_test.drop(['uu_id','week_number','total_claims'],axis=1)
Y_df_test = df_test['total_claims']

In [195]:
prediction = model.predict(X_df_test)

In [196]:
model.score(X_df_test,Y_df_test)

0.933632854698625

In [197]:
df_pred_final = pred_data[['uu_id']]
df_pred_final['week_number'] = pred_data[['week_number']]
df_pred_final["total_claims"] = prediction

KeyError: "None of [Index(['uu_id'], dtype='object')] are in the [columns]"

In [198]:
df_pred_final = pred_data[['uu_id']]

KeyError: "None of [Index(['uu_id'], dtype='object')] are in the [columns]"

In [199]:
df_pred_final = pred_data['uu_id']

KeyError: 'uu_id'

In [200]:
df_pred_final['uu_id'] = pred_data['uu_id']

KeyError: 'uu_id'

In [201]:
df_pred_final['uu_id'] = pred_data[['uu_id']]

KeyError: "None of [Index(['uu_id'], dtype='object')] are in the [columns]"

In [202]:
df_pred_final1['uu_id'] = pred_data[['uu_id']]

KeyError: "None of [Index(['uu_id'], dtype='object')] are in the [columns]"

In [203]:
df_pred_final2 = pred_data[['uu_id']]

KeyError: "None of [Index(['uu_id'], dtype='object')] are in the [columns]"

In [204]:
df_pred_final['week_number'] = pred_data[['week_number']]
df_pred_final["total_claims"] = prediction

ValueError: Length of values (16833) does not match length of index (525)

In [205]:
df_pred_final_yams = pred_data[['uu_id']]

KeyError: "None of [Index(['uu_id'], dtype='object')] are in the [columns]"

In [206]:
df_pred_final_yams == pred_data[['uu_id']]

NameError: name 'df_pred_final_yams' is not defined

In [207]:
df_pred_final_yams=[]
df_pred_final_yams == pred_data[['uu_id']]

KeyError: "None of [Index(['uu_id'], dtype='object')] are in the [columns]"

In [208]:
df_pred_final_1 = df_test[['uu_id']]

In [209]:
df_pred_final_1['week_number'] = df_test[['week_number']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred_final_1['week_number'] = df_test[['week_number']]


In [210]:
df_pred_final_1['total_claims'] = prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred_final_1['total_claims'] = prediction


In [211]:
df_pred_final_1

Unnamed: 0,uu_id,week_number,total_claims
0,5bf51fc2e162d6faf9e3cf79e4198378,44,16.201878
1,5bf51fc2e162d6faf9e3cf79e4198378,44,16.052597
2,5bf51fc2e162d6faf9e3cf79e4198378,44,16.052597
3,5bf51fc2e162d6faf9e3cf79e4198378,44,16.083286
4,5bf51fc2e162d6faf9e3cf79e4198378,44,16.083286
...,...,...,...
16828,932a58530abff8a48558e2a15639d453,44,14.932419
16829,932a58530abff8a48558e2a15639d453,44,15.301217
16830,932a58530abff8a48558e2a15639d453,44,24.040972
16831,932a58530abff8a48558e2a15639d453,44,15.256616


In [212]:
df_final = df_pred_final_1.drop_duplicates(subset=['uu_id'])

In [213]:
df_final

Unnamed: 0,uu_id,week_number,total_claims
0,5bf51fc2e162d6faf9e3cf79e4198378,44,16.201878
10,420b44cc7e3f55d738df565421e59941,44,11.360290
18,e39c66ecceec76ee8f9f811fa4a2d246,44,14.550637
32,a90462cd11ae4e43144239bf7c4828a4,44,31.378025
58,8b20a6749088c7ff1237983076ebfeaa,44,14.060472
...,...,...,...
16684,46c4f6c75e663b1ca82ea7994e6d83d3,44,14.884317
16722,1deebda501712e7595b531b8337bc31a,44,17.802004
16757,5a9758f65f001b6432ff31ff64a459d7,44,24.659317
16781,e8b3b95e93a6dc7dbb90f4e72e7ac065,44,16.755768


In [214]:
fin.shape

(16833, 141)

In [215]:
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

In [216]:
df_final.to_csv('submission_prediction_output.csv')

In [217]:
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")