In [1]:
import db_dtypes
import matplotlib.pyplot as plt
import os
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import numpy as np
import seaborn as sns

In [2]:
BIGQUERY_PROJECT = 'ironhacks-data'
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [3]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.unemployment_data`
"""

In [4]:
query_job = bigquery_client.query(query)
unemployment_data = query_job.to_dataframe()

In [5]:
unemployment_data = unemployment_data.drop_duplicates()

In [6]:
unemployment_data.fillna(0, inplace=True)

In [7]:
unemployment_data = unemployment_data.filter(['uu_id', 'week_number','total_claims','edu_8th_or_less', 'edu_grades_9_11',
       'edu_hs_grad_equiv', 'edu_post_hs','race_amerindian', 'race_asian', 'race_black','race_white'])

In [8]:
unemployment_data.isnull().sum()

uu_id                0
week_number          0
total_claims         0
edu_8th_or_less      0
edu_grades_9_11      0
edu_hs_grad_equiv    0
edu_post_hs          0
race_amerindian      0
race_asian           0
race_black           0
race_white           0
dtype: int64

In [9]:
unemployment_data = unemployment_data.sort_values(by=['uu_id','week_number']).reset_index()

In [10]:
unemployment_data = unemployment_data.drop('index', axis=1)

In [11]:
unemployment_data.head()

Unnamed: 0,uu_id,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,race_amerindian,race_asian,race_black,race_white
0,001cd9ae23064d7f0fd3cd327c873d8d,31,34,0,0,30,0,0,0,0,34
1,001cd9ae23064d7f0fd3cd327c873d8d,33,10,0,0,0,0,0,0,0,10
2,001cd9ae23064d7f0fd3cd327c873d8d,34,25,0,0,0,0,0,0,0,0
3,001cd9ae23064d7f0fd3cd327c873d8d,35,10,0,0,0,0,0,0,0,0
4,001cd9ae23064d7f0fd3cd327c873d8d,36,10,0,0,0,0,0,0,0,0


In [12]:
def predict_claims(uuid, week):
    data = unemployment_data[unemployment_data.uu_id == uuid].copy()

    # plt.plot(data.week_number, data.total_claims)
    # plt.show()
    
    X = data.drop(['uu_id','total_claims'], axis = 1)
    y = data[['total_claims']]
    
    
    
    # Splitting data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8, random_state = 101)
    # Train the Model

    regr = RandomForestRegressor(n_estimators = 10, max_depth = 10, random_state = 101)
    regr.fit(X_train, y_train.values.ravel())
    
    predictions = regr.predict(X_test)
    
    result = X_test
    result['total_claims'] = y_test
    result['prediction'] = predictions.tolist()
    
    x_axis = X_test.week_number

    plt.scatter(x_axis, y_test, c = 'b', alpha = 0.5, marker = '.', label = 'Real')
    plt.scatter(x_axis, predictions, c = 'r', alpha = 0.5, marker = '.', label = 'Predicted')

    plt.xlabel('Week Number')
    plt.ylabel('Total Claims')
    plt.title('Tract: '+uuid)

    plt.grid(color = '#D3D3D3', linestyle = 'solid')

    plt.legend(loc = 'lower right')

    plt.show()

In [13]:
    result = result.sort_values(by = 'week_number')
    
    return result.prediction.iloc[-1].round()

NameError: name 'result' is not defined

In [14]:
predict_claims('0392ee82d61e6b95e117d22d8f732b12',39)

ValueError: With n_samples=0, test_size=0.8 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [15]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""

In [16]:
query_job = bigquery_client.query(query)
prediction_list = query_job.to_dataframe()

In [17]:
prediction_list

Unnamed: 0,uu_id,week_number
0,5bf51fc2e162d6faf9e3cf79e4198378,44
1,420b44cc7e3f55d738df565421e59941,44
2,e39c66ecceec76ee8f9f811fa4a2d246,44
3,a90462cd11ae4e43144239bf7c4828a4,44
4,8b20a6749088c7ff1237983076ebfeaa,44
...,...,...
520,46c4f6c75e663b1ca82ea7994e6d83d3,44
521,1deebda501712e7595b531b8337bc31a,44
522,5a9758f65f001b6432ff31ff64a459d7,44
523,e8b3b95e93a6dc7dbb90f4e72e7ac065,44


In [18]:
uuids = prediction_list.uu_id.tolist()

In [19]:
import csv 

In [20]:
fields = ['uu_id', 'week_number', 'total_claims']
rows = []
for uuid in uuids:
    rows.append([uuid, 39, predict_claims(uuid, 39)])

ValueError: With n_samples=4, test_size=0.8 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [21]:
filename = 'submission_prediction_output.csv'

In [22]:
def predict_claims(uuid, week):
    data = unemployment_data[unemployment_data.uu_id == uuid]

    # plt.plot(data.week_number, data.total_claims)
    # plt.show()
    
    X = data.drop(['uu_id','total_claims'], axis = 1)
    y = data[['total_claims']]
    
    
    
    # Splitting data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8, random_state = 101)
    # Train the Model

    regr = RandomForestRegressor(n_estimators = 10, max_depth = 10, random_state = 101)
    regr.fit(X_train, y_train.values.ravel())
    
    predictions = regr.predict(X_test)
    
    result = X_test
    result['total_claims'] = y_test
    result['prediction'] = predictions.tolist()
    
    x_axis = X_test.week_number

    plt.scatter(x_axis, y_test, c = 'b', alpha = 0.5, marker = '.', label = 'Real')
    plt.scatter(x_axis, predictions, c = 'r', alpha = 0.5, marker = '.', label = 'Predicted')

    plt.xlabel('Week Number')
    plt.ylabel('Total Claims')
    plt.title('Tract: '+uuid)

    plt.grid(color = '#D3D3D3', linestyle = 'solid')

    plt.legend(loc = 'lower right')

    plt.show()

In [23]:
    result = result.sort_values(by = 'week_number')
    
    return result.prediction.iloc[-1].round()

NameError: name 'result' is not defined

In [24]:
predict_claims('0392ee82d61e6b95e117d22d8f732b12',39)

ValueError: With n_samples=0, test_size=0.8 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [25]:
fields = ['uu_id', 'week_number', 'total_claims']
rows = []
for uuid in uuids:
    rows.append([uuid, 39, predict_claims(uuid, 39)])

ValueError: With n_samples=4, test_size=0.8 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [26]:
filename = 'submission_prediction_output.csv'

In [27]:
get_ipython().run_line_magic('pip', 'install db-dtypes')

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m



[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
You should consider upgrading via the '/opt/homebrew/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0m

Note: you may need to restart the kernel to use updated packages.


In [28]:
import db_dtypes
import matplotlib.pyplot as plt
import os
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import numpy as np
import seaborn as sns

In [29]:
import db_dtypes
import matplotlib.pyplot as plt
import os
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import numpy as np
import seaborn as sns

In [30]:
BIGQUERY_PROJECT = 'ironhacks-data'
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [31]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.unemployment_data`
"""

In [32]:
query_job = bigquery_client.query(query)
unemployment_data = query_job.to_dataframe()

In [33]:
unemployment_data = unemployment_data.drop_duplicates()

In [34]:
unemployment_data.fillna(0, inplace=True)

In [35]:
unemployment_data = unemployment_data.filter(['uu_id', 'week_number','total_claims','edu_8th_or_less', 'edu_grades_9_11',
       'edu_hs_grad_equiv', 'edu_post_hs','race_amerindian', 'race_asian', 'race_black','race_white'])

In [36]:
unemployment_data.isnull().sum()

uu_id                0
week_number          0
total_claims         0
edu_8th_or_less      0
edu_grades_9_11      0
edu_hs_grad_equiv    0
edu_post_hs          0
race_amerindian      0
race_asian           0
race_black           0
race_white           0
dtype: int64

In [37]:
unemployment_data = unemployment_data.sort_values(by=['uu_id','week_number']).reset_index()

In [38]:
unemployment_data = unemployment_data.drop('index', axis=1)

In [39]:
unemployment_data.head()

Unnamed: 0,uu_id,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,race_amerindian,race_asian,race_black,race_white
0,001cd9ae23064d7f0fd3cd327c873d8d,31,34,0,0,30,0,0,0,0,34
1,001cd9ae23064d7f0fd3cd327c873d8d,33,10,0,0,0,0,0,0,0,10
2,001cd9ae23064d7f0fd3cd327c873d8d,34,25,0,0,0,0,0,0,0,0
3,001cd9ae23064d7f0fd3cd327c873d8d,35,10,0,0,0,0,0,0,0,0
4,001cd9ae23064d7f0fd3cd327c873d8d,36,10,0,0,0,0,0,0,0,0


In [40]:
uuids = unemployment_data.uu_id.unique()

In [41]:
uuids[:10]

array(['001cd9ae23064d7f0fd3cd327c873d8d',
       '005be9532fd717dc36d4be318fd9ad25',
       '007c1caccff1fbb3a0b8a10790f77141',
       '009683350b175edfc6414d664e4ba873',
       '00f962ce727b8dbbf20925abd5a253dd',
       '02a7b845ab6b3fc2c09a50cdc486db2a',
       '03ba3a9a63be1a5423457246f2846292',
       '050a624d618a68e43fe31189909c644f',
       '05298117cbe0fd69f04f89c83aaac091',
       '05f47cc80d60a67d61e891e2b90c2045'], dtype=object)

In [42]:
def predict_claims(uuid, week):
    data = unemployment_data[unemployment_data.uu_id == uuid]

    # plt.plot(data.week_number, data.total_claims)
    # plt.show()
    
    X = data.drop(['uu_id','total_claims'], axis = 1)
    y = data[['total_claims']]
    
    
    
    # Splitting data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8, random_state = 101)
    # Train the Model

    regr = RandomForestRegressor(n_estimators = 10, max_depth = 10, random_state = 101)
    regr.fit(X_train, y_train.values.ravel())
    
    predictions = regr.predict(X_test)
    
    result = X_test
    result['total_claims'] = y_test
    result['prediction'] = predictions.tolist()
    
    x_axis = X_test.week_number

    plt.scatter(x_axis, y_test, c = 'b', alpha = 0.5, marker = '.', label = 'Real')
    plt.scatter(x_axis, predictions, c = 'r', alpha = 0.5, marker = '.', label = 'Predicted')

    plt.xlabel('Week Number')
    plt.ylabel('Total Claims')
    plt.title('Tract: '+uuid)

    plt.grid(color = '#D3D3D3', linestyle = 'solid')

    plt.legend(loc = 'lower right')

    plt.show()

In [43]:
    result = result.sort_values(by = 'week_number')
    
    return result.prediction.iloc[-1].round()

NameError: name 'result' is not defined

In [44]:
predict_claims('0392ee82d61e6b95e117d22d8f732b12',39)

ValueError: With n_samples=0, test_size=0.8 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [45]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""

In [46]:
query_job = bigquery_client.query(query)
prediction_list = query_job.to_dataframe()

In [47]:
prediction_list

Unnamed: 0,uu_id,week_number
0,5bf51fc2e162d6faf9e3cf79e4198378,44
1,420b44cc7e3f55d738df565421e59941,44
2,e39c66ecceec76ee8f9f811fa4a2d246,44
3,a90462cd11ae4e43144239bf7c4828a4,44
4,8b20a6749088c7ff1237983076ebfeaa,44
...,...,...
520,46c4f6c75e663b1ca82ea7994e6d83d3,44
521,1deebda501712e7595b531b8337bc31a,44
522,5a9758f65f001b6432ff31ff64a459d7,44
523,e8b3b95e93a6dc7dbb90f4e72e7ac065,44


In [48]:
uuids = prediction_list.uu_id.tolist()

In [49]:
import csv 

In [50]:
fields = ['uu_id', 'week_number', 'total_claims']
rows = []
for uuid in uuids:
    rows.append([uuid, 39, predict_claims(uuid, 39)])

ValueError: With n_samples=4, test_size=0.8 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [51]:
filename = 'submission_prediction_output.csv'

In [52]:
predict_claims('2a63ff5339efc0d6ac8023f6d06746e2', 39)

ValueError: With n_samples=0, test_size=0.8 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [53]:
fields = ['uu_id', 'week_number', 'total_claims']
rows = []
for uuid in uuids:
    claims = predict_claims(uuid, 39)
    rows.append([uuid, 39, claims])

ValueError: With n_samples=4, test_size=0.8 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [54]:
filename = 'submission_prediction_output.csv'

In [55]:
a = predict_claims('2a63ff5339efc0d6ac8023f6d06746e2', 39)

ValueError: With n_samples=0, test_size=0.8 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [56]:
a

NameError: name 'a' is not defined

In [57]:
fields = ['uu_id', 'week_number', 'total_claims']
rows = []
for uuid in uuids:
    print(uuid)
    claims = predict_claims(uuid, 39)
    rows.append([uuid, 39, claims])

5bf51fc2e162d6faf9e3cf79e4198378
420b44cc7e3f55d738df565421e59941
e39c66ecceec76ee8f9f811fa4a2d246
a90462cd11ae4e43144239bf7c4828a4
8b20a6749088c7ff1237983076ebfeaa
489a93264d03adecc4589fd9cd34ad36
f61f291c865fe4fe2bfd6dad53e5d058
f4520092ddd64f380cc29f27329112a6
447d5984bb8a8179bff002ccd1bba162
56d73815bdda97aca8e26051183ed3ba
050a624d618a68e43fe31189909c644f
5c43cbf8f0d860e91a9160706ea9b7d8
1400fd6122c6fe7582a30ee1280beb4b
02a7b845ab6b3fc2c09a50cdc486db2a
a993d47c7143bed2a187b0cac7d17983
09fbf26479f644ad33186ec51a90d65d
ce8723e47ce346b09bc20deb5d27749e
b67c2c4abede3730932f8d53aba0341a


ValueError: With n_samples=4, test_size=0.8 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [58]:
filename = 'submission_prediction_output.csv'

In [59]:
a = predict_claims('2a62116efd5f9a6da7b4ce2803eba96d', 39)

In [60]:
a

In [61]:
fields = ['uu_id', 'week_number', 'total_claims']
claims = []
for uuid in uuids:
    claim = predict_claims(uuid, 39)
    claims.append(claim)

ValueError: With n_samples=4, test_size=0.8 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [62]:
filename = 'submission_prediction_output.csv'

In [63]:
fields = ['uu_id', 'week_number', 'total_claims']
filename = 'submission_prediction_output.csv'

In [64]:
uuid = '2a62116efd5f9a6da7b4ce2803eba96d'

In [65]:
a = predict_claims(uuid, 39)

In [66]:
a

In [67]:
for i in range(len(uuids)):
    c = predict_claims(uuids[i], 39)
    print(c)

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


ValueError: With n_samples=4, test_size=0.8 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [68]:
unemployment_data.uu_ids.count_values()

AttributeError: 'DataFrame' object has no attribute 'uu_ids'

In [69]:
unemployment_data.uu_id.count_values()

AttributeError: 'Series' object has no attribute 'count_values'

In [70]:
unemployment_data.uu_id.value_counts()

7b208b710114a393f20ec464c7849157    35
7953cef088684d6a82637fe6a9e7e7a8    35
5f9844eb68630b75decbafdfa4974943    35
618bd975f55a022bd54daf828c1f7ce5    35
627f0516c61e02f14a399ff99aaf141c    35
                                    ..
b63efd0868b06138b7c8f9da9027e457     5
b67c2c4abede3730932f8d53aba0341a     4
747f8bc2b0c8c0a04d29caa4cfe327d2     4
f7f087af0599e6b2eaa4045ba1a0be50     3
6fbb60a508283bc1fb30c13ac419941a     3
Name: uu_id, Length: 525, dtype: int64

In [71]:
unemployment_data.uu_id.value_counts().sort()

AttributeError: 'Series' object has no attribute 'sort'

In [72]:
unemployment_data.uu_id.value_counts().tolist()

[35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,


In [73]:
unemployment_data.uu_id.value_counts()

7b208b710114a393f20ec464c7849157    35
7953cef088684d6a82637fe6a9e7e7a8    35
5f9844eb68630b75decbafdfa4974943    35
618bd975f55a022bd54daf828c1f7ce5    35
627f0516c61e02f14a399ff99aaf141c    35
                                    ..
b63efd0868b06138b7c8f9da9027e457     5
b67c2c4abede3730932f8d53aba0341a     4
747f8bc2b0c8c0a04d29caa4cfe327d2     4
f7f087af0599e6b2eaa4045ba1a0be50     3
6fbb60a508283bc1fb30c13ac419941a     3
Name: uu_id, Length: 525, dtype: int64

In [74]:
less_than_ten = unemployment_data.uu_id[unemployment_data.uu_id.value_counts()<10]

IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [75]:
data_count = unemployment_data.uu_id.value_counts().to_dict()

In [76]:
lessthanten = []
for uuid, cnt in data_count.items():
    if cnt <10:
        lessthanten.append(uuid)
lessthanten

['47f0290645712229fd4f8f1fe7dc6b05',
 '6657c9871fa2f67cd168e6f94354060e',
 '38e264ca88c300dc19938de0abc88ec4',
 '420b44cc7e3f55d738df565421e59941',
 'a20575f25400fc50aa8377e345b97d41',
 '5bf51fc2e162d6faf9e3cf79e4198378',
 '65c933853ba6f9ad80097936c2924aa6',
 'd6a4947ec7c2a78bd32ba4e30f3bba53',
 '248b6a72aba270fd9794b41f53813d5e',
 'f25446d4d951867ca7b984ad17c6c1e0',
 'b63901e116eff9653b0bb68135364147',
 '0aed17ab54d56c6651d331418e8db6be',
 '3934b01a1e31d0a6af96867e1afbf03c',
 '676fffaaecefffe4ce93ca264c5c3c36',
 '001cd9ae23064d7f0fd3cd327c873d8d',
 '99c1cb11273b064dc48649886138eb8e',
 'b63efd0868b06138b7c8f9da9027e457',
 'b67c2c4abede3730932f8d53aba0341a',
 '747f8bc2b0c8c0a04d29caa4cfe327d2',
 'f7f087af0599e6b2eaa4045ba1a0be50',
 '6fbb60a508283bc1fb30c13ac419941a']

In [77]:
lessthanten = []
for uuid, cnt in data_count.items():
    if cnt <10:
        lessthanten.append(uuid)

In [78]:
len(uuids - lessthanten)

TypeError: unsupported operand type(s) for -: 'list' and 'list'

In [79]:
len(uuids)

525

In [80]:
for i in lessthanten:
    uuids.remove(i)

In [81]:
len(uuids)

504

In [82]:
fields = ['uu_id', 'week_number', 'total_claims']
rows = []
for uuid in uuids:
    rows.append([uuid, 39, predict_claims(uuid, 39)])
filename = 'submission_prediction_output.csv'

In [83]:
rows[:5]

[['e39c66ecceec76ee8f9f811fa4a2d246', 39, None],
 ['a90462cd11ae4e43144239bf7c4828a4', 39, None],
 ['8b20a6749088c7ff1237983076ebfeaa', 39, None],
 ['489a93264d03adecc4589fd9cd34ad36', 39, None],
 ['f61f291c865fe4fe2bfd6dad53e5d058', 39, None]]

In [84]:
fields = ['uu_id', 'week_number', 'total_claims']
rows = []
for uuid in uuids:
    rows.append([uuid, 39, int(predict_claims(uuid, 39))])
filename = 'submission_prediction_output.csv'

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [85]:
rows[:5]

[]

In [86]:
unemployment_data.groupby('uu_id').median()

Unnamed: 0_level_0,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,race_amerindian,race_asian,race_black,race_white
uu_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
001cd9ae23064d7f0fd3cd327c873d8d,34.5,11.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
005be9532fd717dc36d4be318fd9ad25,24.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0
007c1caccff1fbb3a0b8a10790f77141,15.5,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
009683350b175edfc6414d664e4ba873,14.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00f962ce727b8dbbf20925abd5a253dd,19.0,35.0,0.0,0.0,18.0,0.0,0.0,0.0,25.0,0.0
...,...,...,...,...,...,...,...,...,...,...
fcf9f3effa992cfd552696e4ae7219b6,19.0,15.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
fcfc60d8240e5a51f878fd92e2a352a3,17.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fd613eba867c6ad7350a937f743b88f2,17.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fe355ff27aa8b2242a558d5e3ec99c18,16.5,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
# 005be9532fd717dc36d4be318fd9ad25
unemployment_data.groupby('uu_id').median().get_group('005be9532fd717dc36d4be318fd9ad25')

AttributeError: 'DataFrame' object has no attribute 'get_group'

In [88]:
# 005be9532fd717dc36d4be318fd9ad25
groupby_id = unemployment_data.groupby('uu_id').median()
groupby_id

Unnamed: 0_level_0,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,race_amerindian,race_asian,race_black,race_white
uu_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
001cd9ae23064d7f0fd3cd327c873d8d,34.5,11.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
005be9532fd717dc36d4be318fd9ad25,24.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0
007c1caccff1fbb3a0b8a10790f77141,15.5,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
009683350b175edfc6414d664e4ba873,14.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00f962ce727b8dbbf20925abd5a253dd,19.0,35.0,0.0,0.0,18.0,0.0,0.0,0.0,25.0,0.0
...,...,...,...,...,...,...,...,...,...,...
fcf9f3effa992cfd552696e4ae7219b6,19.0,15.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
fcfc60d8240e5a51f878fd92e2a352a3,17.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fd613eba867c6ad7350a937f743b88f2,17.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fe355ff27aa8b2242a558d5e3ec99c18,16.5,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
# 005be9532fd717dc36d4be318fd9ad25
groupby_id = unemployment_data.groupby('uu_id').median()
groupby_id.get_group('005be9532fd717dc36d4be318fd9ad25')

AttributeError: 'DataFrame' object has no attribute 'get_group'

In [90]:
# 005be9532fd717dc36d4be318fd9ad25
groupby_id = unemployment_data.groupby('uu_id').median()
groupby_id.first()

TypeError: first() missing 1 required positional argument: 'offset'

In [91]:
# 005be9532fd717dc36d4be318fd9ad25
groupby_id = unemployment_data.groupby('uu_id').median()
groupby_id

Unnamed: 0_level_0,week_number,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,race_amerindian,race_asian,race_black,race_white
uu_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
001cd9ae23064d7f0fd3cd327c873d8d,34.5,11.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
005be9532fd717dc36d4be318fd9ad25,24.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0
007c1caccff1fbb3a0b8a10790f77141,15.5,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
009683350b175edfc6414d664e4ba873,14.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00f962ce727b8dbbf20925abd5a253dd,19.0,35.0,0.0,0.0,18.0,0.0,0.0,0.0,25.0,0.0
...,...,...,...,...,...,...,...,...,...,...
fcf9f3effa992cfd552696e4ae7219b6,19.0,15.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
fcfc60d8240e5a51f878fd92e2a352a3,17.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fd613eba867c6ad7350a937f743b88f2,17.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fe355ff27aa8b2242a558d5e3ec99c18,16.5,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
# 005be9532fd717dc36d4be318fd9ad25
groupby_id = unemployment_data.groupby('uu_id').median()
groupby_id.total_claims

uu_id
001cd9ae23064d7f0fd3cd327c873d8d    11.5
005be9532fd717dc36d4be318fd9ad25    19.0
007c1caccff1fbb3a0b8a10790f77141    12.0
009683350b175edfc6414d664e4ba873    17.0
00f962ce727b8dbbf20925abd5a253dd    35.0
                                    ... 
fcf9f3effa992cfd552696e4ae7219b6    15.0
fcfc60d8240e5a51f878fd92e2a352a3    15.0
fd613eba867c6ad7350a937f743b88f2    18.0
fe355ff27aa8b2242a558d5e3ec99c18    13.0
fec479d0202d6e1e3f051a9ee902ff5d    28.0
Name: total_claims, Length: 525, dtype: Float64

In [93]:
# 005be9532fd717dc36d4be318fd9ad25
groupby_id = unemployment_data.groupby('uu_id').median().total_claims
groupby_id

uu_id
001cd9ae23064d7f0fd3cd327c873d8d    11.5
005be9532fd717dc36d4be318fd9ad25    19.0
007c1caccff1fbb3a0b8a10790f77141    12.0
009683350b175edfc6414d664e4ba873    17.0
00f962ce727b8dbbf20925abd5a253dd    35.0
                                    ... 
fcf9f3effa992cfd552696e4ae7219b6    15.0
fcfc60d8240e5a51f878fd92e2a352a3    15.0
fd613eba867c6ad7350a937f743b88f2    18.0
fe355ff27aa8b2242a558d5e3ec99c18    13.0
fec479d0202d6e1e3f051a9ee902ff5d    28.0
Name: total_claims, Length: 525, dtype: Float64

In [94]:
# 005be9532fd717dc36d4be318fd9ad25
groupby_id = unemployment_data.groupby('uu_id').median().total_claims
groupby_id.to_dict()

{'001cd9ae23064d7f0fd3cd327c873d8d': 11.5,
 '005be9532fd717dc36d4be318fd9ad25': 19.0,
 '007c1caccff1fbb3a0b8a10790f77141': 12.0,
 '009683350b175edfc6414d664e4ba873': 17.0,
 '00f962ce727b8dbbf20925abd5a253dd': 35.0,
 '02a7b845ab6b3fc2c09a50cdc486db2a': 23.0,
 '03ba3a9a63be1a5423457246f2846292': 18.0,
 '050a624d618a68e43fe31189909c644f': 19.0,
 '05298117cbe0fd69f04f89c83aaac091': 15.0,
 '05f47cc80d60a67d61e891e2b90c2045': 14.0,
 '06c78e49b4daedfeb808c42e58fb25e4': 15.5,
 '06e492b4f29d153af26c659d1f7da2a1': 15.5,
 '07306ce64b3496b7955efaaf7509d73c': 13.5,
 '074f501122885ab9aef5e9d07004209d': 16.5,
 '09fbf26479f644ad33186ec51a90d65d': 16.0,
 '0abaea6bbc30d7cf8dbacc64727e7091': 14.0,
 '0ad94f09274e2c9cb0ef5cb77eb334b4': 51.0,
 '0aed17ab54d56c6651d331418e8db6be': 13.0,
 '0bdc0df962a3c10541c2aa5220ded58b': 13.0,
 '0d165590afad07ce71727d690f4aad80': 17.0,
 '0dc217a2798a141c59b99f5bcff29fa9': 12.5,
 '0e6523fb3fc17f6a2ac7050972bd4bfd': 13.5,
 '0eadc4f22396980ecc2e7878ffbaf14a': 15.0,
 '0f3d45341

In [95]:
# 005be9532fd717dc36d4be318fd9ad25
groupby_id = unemployment_data.groupby('uu_id').median().total_claims
groupby_id[groupby_id.uu_id == '005be9532fd717dc36d4be318fd9ad25']

AttributeError: 'Series' object has no attribute 'uu_id'

In [96]:
# 005be9532fd717dc36d4be318fd9ad25
groupby_id = unemployment_data.groupby('uu_id').median().total_claims
groupby_id[groupby_id == '005be9532fd717dc36d4be318fd9ad25']

Series([], Name: total_claims, dtype: Float64)

In [97]:
# 005be9532fd717dc36d4be318fd9ad25
groupby_id = unemployment_data.groupby('uu_id').median().total_claims
groupby_id['005be9532fd717dc36d4be318fd9ad25']

19.0

In [98]:
for uuid in lessthanten:
    rows.append([uuid, 39, int(unemployment_data.groupby('uu_id').median().total_claims[uuid])])

In [99]:
len(rows)

21

In [100]:
rows[-10:-1]

[['0aed17ab54d56c6651d331418e8db6be', 39, 13],
 ['3934b01a1e31d0a6af96867e1afbf03c', 39, 15],
 ['676fffaaecefffe4ce93ca264c5c3c36', 39, 11],
 ['001cd9ae23064d7f0fd3cd327c873d8d', 39, 11],
 ['99c1cb11273b064dc48649886138eb8e', 39, 11],
 ['b63efd0868b06138b7c8f9da9027e457', 39, 13],
 ['b67c2c4abede3730932f8d53aba0341a', 39, 14],
 ['747f8bc2b0c8c0a04d29caa4cfe327d2', 39, 11],
 ['f7f087af0599e6b2eaa4045ba1a0be50', 39, 11]]

In [101]:
with open(filename, 'w') as csvfile: 
     
    csvwriter = csv.writer(csvfile) 

    csvwriter.writerow(fields) 

    csvwriter.writerows(rows)

In [102]:
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

In [103]:
less_than_ten = unemployment_data.uu_id[unemployment_data.uu_id.value_counts()<10]

IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [104]:
Since the test is not workable for ids that the number of recorded weeks is less than 3, I separated the ids whose data count is less than 10 to predict the claim different.

SyntaxError: invalid syntax (111755279.py, line 1)