In [201]:
from datetime import datetime
now = datetime.now()
run_start_time= now.strftime("%H:%M:%S")
print("Run Started at: ", run_start_time)

Run Started at:  20:27:09


In [202]:
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Core functions for Probabilistic (BTYD) models."""

from __future__ import print_function
from __future__ import absolute_import

from datetime import datetime
from lifetimes import BetaGeoFitter, ParetoNBDFitter, GammaGammaFitter
import math
import numpy as np
import os
import pandas as pd
import tensorflow as tf

In [203]:
#os.chdir('/home/jupyter/CLV Update - April 2022/GFS CLV/tensorflow_lifetime_value/clv_mle/trainer/12M Prediction')
os.getcwd()

'/home/jupyter/CLV - Covid Adjusted/Revenue'

In [204]:
import sys

#BIGQUERY CONNECTION -DC

from google.cloud import bigquery
import pandas
#from .model import PARETO, BGNBD

In [205]:
#sys.path.insert(0, '..')

client = bigquery.Client(location="US")

query = """
    SELECT * 
    FROM `gcp-gfs-datalab-e4sdt.clv_cov_adj.features_training`
"""

query_job = client.query(
    query,
    # Location must match that of the dataset(s) referenced in the query.
    location="US",
)  # API request - starts the query


df_in = query_job.to_dataframe()
df_in
#BIGQUERY CONNECTION END -DC


Unnamed: 0,customer_id,monetary_dnn,monetary_btyd,frequency_dnn,frequency_btyd,recency,T,time_between,avg_basket_value,avg_basket_size,cnt_returns,has_returned,frequency_btyd_clipped,monetary_btyd_clipped,target_monetary_clipped,target_monetary
0,100130857_15,23215.79,90.982863,256,255,1519,1519,5.93,90.69,2.90,0,0,255,90.98,23980.75,23980.75
1,100109491_7,54927.80,213.288314,256,255,1518,1520,5.93,214.56,7.11,0,0,255,213.29,4603.97,4603.97
2,294680012_3,268839.13,1048.311333,256,255,1510,1515,5.90,1050.15,26.24,0,0,255,1048.31,31884.09,31884.09
3,292550014_13,120011.22,470.414627,256,255,1510,1514,5.90,468.79,5.11,0,0,255,470.41,38337.30,38337.30
4,100172020_13,16126.20,62.608902,256,255,1423,1423,5.56,62.99,2.91,0,0,255,62.61,4625.64,4625.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32614,100063939_3,355049.13,461.551018,767,766,1521,1521,1.98,462.91,15.71,5,1,600,461.55,100000.00,135129.63
32615,100155216_1,1356461.30,1769.223381,767,766,1514,1514,1.97,1768.53,41.50,19,1,600,1769.22,100000.00,424213.30
32616,100152109_39,531192.93,693.392272,767,766,1521,1521,1.98,692.56,20.63,7,1,600,693.39,15453.02,15453.02
32617,258240014_13,2556563.35,2500.002730,1023,1022,1520,1521,1.49,2499.08,55.69,2,1,600,2500.00,100000.00,902003.64


In [206]:
#DC: saving the dataframe locally to perform calculations further down
df_in.to_csv('actual_df.csv')


In [207]:
PENALIZER_COEF = 0.01
DISCOUNT_RATE = .01
TRAINING_DATA_FILE = df_in
OUTPUT_FILE = 'predictions.csv'


def load_data(datapath):
  """Loads data from CSV data file.

  Args:
    datapath: Location of the training file
  Returns:
    summary dataframe containing RFM data for btyd models
    actuals_df containing additional data columns for calculating error
  """
  # Does not used the summary_data_from_transaction_data from the Lifetimes
  # library as it wouldn't scale as well. The pre-processing done in BQ instead.
  tf.logging.info('Loading data...')

  ft_file = '{0}/{1}'.format(datapath, TRAINING_DATA_FILE)
#[START prob_selec]
  df_ft = df_in

  # Extracts relevant dataframes for RFM:
  # - summary has aggregated values before the threshold date
  # - actual_df has values of the overall period.
  summary = df_ft[['customer_id', 'frequency_btyd', 'recency', 'T',
                   'monetary_btyd']]
#[END prob_selec]
  summary.columns = ['customer_id', 'frequency', 'recency', 'T',
                     'monetary_value']
  summary = summary.set_index('customer_id')

  # additional columns needed for calculating error
  actual_df = df_ft[['customer_id', 'frequency_btyd', 'monetary_dnn',
                     'target_monetary']]
  actual_df.columns = ['customer_id', 'train_frequency', 'train_monetary',
                       'act_target_monetary']

  tf.logging.info('Data loaded.')

  return summary, actual_df

In [208]:
def bgnbd_model(summary):
  """Instantiate and fit a BG/NBD model.

  Args:
    summary: RFM transaction data
  Returns:
    bgnbd model fit to the data
  """
  bgf = BetaGeoFitter(penalizer_coef=PENALIZER_COEF)
  bgf.fit(summary['frequency'], summary['recency'], summary['T'])
  return bgf

In [209]:
def paretonbd_model(summary):
  """Instantiate and fit a Pareto/NBD model.

  Args:
    summary: RFM transaction data
  Returns:
    bgnbd model fit to the data
  """
  #[START run_btyd]
  paretof = ParetoNBDFitter(penalizer_coef=PENALIZER_COEF)
  paretof.fit(summary['frequency'], summary['recency'], summary['T'])
  return paretof
  #[END run_btyd]

In [210]:
def run_btyd(data_src, threshold_date, predict_end):
  """Run selected BTYD model on data files located in args.data_src.

  Args:
    model_type:                 model type (PARETO, BGNBD) #DC 10-18-21: ADD THIS BACK IN IF YOU WANT TO CHOOSE BETWEEN PARETO & BGNBD
    data_src:                   path to data
    threshold_date:             end date for training data 'YYYY-mm-dd'
    predict_end:                end date for predictions 'YYYY-mm-dd'
  """
  train_end_date = datetime.strptime(threshold_date, '%Y-%m-%d')
  predict_end_date = datetime.strptime(predict_end, '%Y-%m-%d')

  # load training transaction data
  summary, actual_df = load_data(data_src)

  # train fitter for selected model
  tf.logging.info('Fitting model...')

#  if model_type == PARETO:
#    fitter = paretonbd_model(summary)
#  elif model_type == BGNBD:       #DC 10-18-21: ADD THIS BACK IN IF YOU WANT TO CHOOSE BETWEEN PARETO & BGNBD
  fitter = bgnbd_model(summary)

  tf.logging.info('Done.')

  #
  # use trained fitter to compute actual vs predicted ltv for each user
  #

  # compute the number of days in the prediction period
  time_days = (predict_end_date - train_end_date).days
  time_months = int(math.ceil(time_days / 30.0))

  # fit gamma-gamma model
  tf.logging.info('Fitting GammaGamma model...')

  ggf = GammaGammaFitter(penalizer_coef=0)
  ggf.fit(summary['frequency'], summary['monetary_value'])

  tf.logging.info('Done.')

  ltv, rmse = predict_value(summary,
                            actual_df,
                            fitter,
                            ggf,
                            time_days,
                            time_months)

  # output results to csv
  output_file = os.path.join(data_src, OUTPUT_FILE)
  ltv.to_csv(output_file, index=False)

  # log results
  tf.logging.info('BTYD RMSE error for %s model: %.2f', 'BGNBD', rmse) #DC 10-18-21 - CHANGE 'BGNBD' TO THE model_type VARIABLE IF YOU CHOSE TO ALLOW FOR CHOOSING IN THIS FUNCTION
  print('RMSE prediction error: %.2f' % rmse)

  return ggf

In [211]:
def predict_value(summary, actual_df, fitter, ggf, time_days, time_months):
  """Predict lifetime values for customers.

  Args:
    summary:      RFM transaction data
    actual_df:    dataframe containing data fields for customer id,
                  actual customer values
    fitter:       lifetimes fitter, previously fit to data
    ggf:          lifetimes gamma/gamma fitter, already fit to data
    time_days:    time to predict purchases in days
    time_months:  time to predict value in months
  Returns:
    ltv:  dataframe with predicted values for each customer, along with actual
      values and error
    rmse: root mean squared error summed over all customers
  """
  # setup dataframe to hold results
  ltv = pd.DataFrame(data=np.zeros([actual_df.shape[0], 6]),
                     columns=['customer_id',
                              'actual_total',
                              'predicted_num_purchases',
                              'predicted_value',
                              'predicted_total',
                              'error'], dtype=np.float32)

  predicted_num_purchases = fitter.predict(time_days,
                                           summary['frequency'],
                                           summary['recency'],
                                           summary['T'])

  predicted_value = ggf.customer_lifetime_value(fitter,
                                                summary['frequency'],
                                                summary['recency'],
                                                summary['T'],
                                                summary['monetary_value'],
                                                time=time_months,
                                                discount_rate=DISCOUNT_RATE)

  ltv['customer_id'] = actual_df['customer_id']
  ltv['actual_total'] = actual_df['act_target_monetary']
  ltv['predicted_num_purchases'] = predicted_num_purchases.values
  ltv['predicted_value'] = predicted_value.values
  ltv['predicted_total'] = actual_df['train_monetary'] + ltv['predicted_value']
  ltv['error'] = ltv['actual_total'] - ltv['predicted_total']

  mse = pd.Series.sum(ltv['error'] * ltv['error']) / ltv.shape[0]
  rmse = math.sqrt(mse)

  return ltv, rmse

In [212]:
cleaned_columns=load_data(df_in)

INFO:tensorflow:Loading data...
INFO:tensorflow:Data loaded.


In [213]:
fit_model=bgnbd_model(cleaned_columns[0])

In [214]:
model_run=run_btyd(os.getcwd(),'2021-05-22','2022-05-21')
#MAKE SURE TO ADJUST THIS EACH RUN


INFO:tensorflow:Loading data...
INFO:tensorflow:Data loaded.
INFO:tensorflow:Fitting model...
INFO:tensorflow:Done.
INFO:tensorflow:Fitting GammaGamma model...
INFO:tensorflow:Done.
INFO:tensorflow:BTYD RMSE error for BGNBD model: 530874.78
RMSE prediction error: 530874.78


In [215]:
#Prediction run for training data
predict_cltv=predict_value(cleaned_columns[0],cleaned_columns[1],fit_model,model_run,365,12)

#Save Output training predictions
predict_cltv[0].to_csv('training_out.csv')
#Display RMSE of training predictions 
predict_cltv[1]

521226.6069762476

In [216]:
import statistics
outtable=predict_cltv[0]
statistics.mean(outtable.error)

-261273.6027282859

In [217]:
### true predictions ###

In [218]:
#bringing to_predict data in

query = """
    SELECT * 
    FROM `gcp-gfs-datalab-e4sdt.clv_cov_adj.features_20`
"""

query_job = client.query(
    query,
    # Location must match that of the dataset(s) referenced in the query.
    location="US",
)  # API request - starts the query


prediction_in = query_job.to_dataframe()
prediction_in

Unnamed: 0,customer_id,monetary_dnn,monetary_btyd,frequency_dnn,frequency_btyd,recency,T,time_between,avg_basket_value,avg_basket_size,cnt_returns,has_returned,frequency_btyd_clipped,monetary_btyd_clipped,target_monetary_clipped,target_monetary
0,100258317_39,298807.10,1162.593672,257,256,543,582,2.11,1162.67,38.15,21,1,256,1162.59,1.0,1
1,100169277_26,444665.38,1729.267266,257,256,728,728,2.83,1730.22,37.84,27,1,256,1729.27,1.0,1
2,100001253_7,719896.80,2811.544180,257,256,730,730,2.84,2801.15,48.68,28,1,256,2811.54,1.0,1
3,930023211_6,843630.95,3276.870000,257,256,725,728,2.82,3282.61,89.27,20,1,256,3276.87,1.0,1
4,438720014_1,360646.85,1400.579219,257,256,728,728,2.83,1403.30,27.51,27,1,256,1400.58,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40052,100030029_7,638688.66,2594.414204,246,245,729,729,2.96,2596.30,69.32,17,1,245,2594.41,1.0,1
40053,100240854_1,864724.17,3499.387045,248,247,665,669,2.68,3486.79,78.37,17,1,247,3499.39,1.0,1
40054,100134767_40,524538.55,2106.390241,250,249,642,729,2.57,2098.15,46.31,17,1,249,2106.39,1.0,1
40055,100081282_26,340799.69,1355.742908,252,251,730,730,2.90,1352.38,31.98,17,1,251,1355.74,1.0,1


In [219]:
PREDICTION_DATA_FILE=prediction_in

def load_prediction_data(datapath):
  """Loads data from CSV data file.

  Args:
    datapath: Location of the training file
  Returns:
    summary dataframe containing RFM data for btyd models
    actuals_df containing additional data columns for calculating error
  """
  # Does not used the summary_data_from_transaction_data from the Lifetimes
  # library as it wouldn't scale as well. The pre-processing done in BQ instead.
  tf.logging.info('Loading data...')

  ft_file = '{0}/{1}'.format(datapath, PREDICTION_DATA_FILE)
#[START prob_selec]
  df_ft = PREDICTION_DATA_FILE

  # Extracts relevant dataframes for RFM:
  # - summary has aggregated values before the threshold date
  # - actual_df has values of the overall period.
  summary = df_ft[['customer_id', 'frequency_btyd', 'recency', 'T',
                   'monetary_btyd']]
#[END prob_selec]
  summary.columns = ['customer_id', 'frequency', 'recency', 'T',
                     'monetary_value']
  summary = summary.set_index('customer_id')

  # additional columns needed for calculating error
  actual_df = df_ft[['customer_id', 'frequency_btyd', 'monetary_dnn',
                     'target_monetary']]
  actual_df.columns = ['customer_id', 'train_frequency', 'train_monetary',
                       'act_target_monetary']

  tf.logging.info('Data loaded.')

  return summary, actual_df

In [220]:

#Prediction run for prediction data
prediction_in=load_prediction_data(prediction_in)
#Predict
predict_cltv_prediction=predict_value(prediction_in[0],prediction_in[1],fit_model,model_run,365,12)

#Save Output training predictions
predict_cltv_prediction[0].to_csv('prediction_out_2020.csv')


INFO:tensorflow:Loading data...
INFO:tensorflow:Data loaded.


In [221]:
print('done')


done


In [222]:
#bringing to_predict data in

query = """
    SELECT * 
    FROM `gcp-gfs-datalab-e4sdt.clv_cov_adj.features_21`
"""

query_job = client.query(
    query,
    # Location must match that of the dataset(s) referenced in the query.
    location="US",
)  # API request - starts the query


prediction_in = query_job.to_dataframe()
prediction_in

Unnamed: 0,customer_id,monetary_dnn,monetary_btyd,frequency_dnn,frequency_btyd,recency,T,time_between,avg_basket_value,avg_basket_size,cnt_returns,has_returned,frequency_btyd_clipped,monetary_btyd_clipped,target_monetary_clipped,target_monetary
0,100099290_7,541027.68,2111.286680,257,256,727,729,2.83,2105.17,44.80,19,1,256,2111.29,1.0,1
1,100217569_40,970491.77,3787.740664,257,256,727,730,2.83,3776.23,73.78,17,1,256,3787.74,1.0,1
2,100028845_2,409333.78,1588.134766,257,256,729,730,2.84,1592.74,37.27,19,1,256,1588.13,1.0,1
3,100215220_40,822822.00,3198.695078,257,256,728,729,2.83,3201.64,67.99,19,1,256,3198.70,1.0,1
4,901247999_7,418843.10,1631.777461,257,256,728,729,2.83,1629.74,38.81,17,1,256,1631.78,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38559,100245860_3,259355.24,1042.643790,249,248,729,729,2.93,1041.59,26.31,16,1,248,1042.64,1.0,1
38560,368050002_39,606031.69,2411.112880,251,250,729,729,2.90,2414.47,39.35,16,1,250,2411.11,1.0,1
38561,100170908_7,857374.92,3403.740199,252,251,728,730,2.89,3402.28,68.32,16,1,251,3403.74,1.0,1
38562,604740029_13,344056.34,1354.162756,255,254,729,730,2.86,1349.24,20.39,16,1,254,1354.16,1.0,1


In [223]:
PREDICTION_DATA_FILE=prediction_in

def load_prediction_data(datapath):
  """Loads data from CSV data file.

  Args:
    datapath: Location of the training file
  Returns:
    summary dataframe containing RFM data for btyd models
    actuals_df containing additional data columns for calculating error
  """
  # Does not used the summary_data_from_transaction_data from the Lifetimes
  # library as it wouldn't scale as well. The pre-processing done in BQ instead.
  tf.logging.info('Loading data...')

  ft_file = '{0}/{1}'.format(datapath, PREDICTION_DATA_FILE)
#[START prob_selec]
  df_ft = PREDICTION_DATA_FILE

  # Extracts relevant dataframes for RFM:
  # - summary has aggregated values before the threshold date
  # - actual_df has values of the overall period.
  summary = df_ft[['customer_id', 'frequency_btyd', 'recency', 'T',
                   'monetary_btyd']]
#[END prob_selec]
  summary.columns = ['customer_id', 'frequency', 'recency', 'T',
                     'monetary_value']
  summary = summary.set_index('customer_id')

  # additional columns needed for calculating error
  actual_df = df_ft[['customer_id', 'frequency_btyd', 'monetary_dnn',
                     'target_monetary']]
  actual_df.columns = ['customer_id', 'train_frequency', 'train_monetary',
                       'act_target_monetary']

  tf.logging.info('Data loaded.')

  return summary, actual_df

In [224]:

#Prediction run for prediction data
prediction_in=load_prediction_data(prediction_in)
#Predict
predict_cltv_prediction=predict_value(prediction_in[0],prediction_in[1],fit_model,model_run,365,12)

#Save Output training predictions
predict_cltv_prediction[0].to_csv('prediction_out_2021.csv')


INFO:tensorflow:Loading data...
INFO:tensorflow:Data loaded.


In [225]:
#bringing to_predict data in

query = """
    SELECT * 
    FROM `gcp-gfs-datalab-e4sdt.clv_cov_adj.features_22`
"""

query_job = client.query(
    query,
    # Location must match that of the dataset(s) referenced in the query.
    location="US",
)  # API request - starts the query


prediction_in = query_job.to_dataframe()
prediction_in

Unnamed: 0,customer_id,monetary_dnn,monetary_btyd,frequency_dnn,frequency_btyd,recency,T,time_between,avg_basket_value,avg_basket_size,cnt_returns,has_returned,frequency_btyd_clipped,monetary_btyd_clipped,target_monetary_clipped,target_monetary
0,100001262_38,216967.64,838.330234,257,256,721,727,2.81,844.23,24.94,18,1,256,838.33,1.0,1
1,722320530_38,414805.53,1607.249570,257,256,721,724,2.81,1614.03,39.00,23,1,256,1607.25,1.0,1
2,722409764_38,417914.92,1628.028164,257,256,722,723,2.81,1626.13,36.76,32,1,256,1628.03,1.0,1
3,100121356_2,663931.54,2583.772383,257,256,721,723,2.81,2583.39,50.36,17,1,256,2583.77,1.0,1
4,100081169_26,625620.84,2436.413828,257,256,728,730,2.83,2434.32,48.25,29,1,256,2436.41,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36734,100087563_26,563209.26,2283.003943,247,246,725,727,2.94,2280.20,59.31,16,1,246,2283.00,1.0,1
36735,722362680_38,541752.61,2182.766518,248,247,526,527,2.12,2184.49,43.09,16,1,247,2182.77,1.0,1
36736,481860001_7,89021.73,360.310405,248,247,728,730,2.94,358.96,8.48,16,1,247,360.31,1.0,1
36737,722415408_38,311069.91,1250.317177,249,248,729,730,2.93,1249.28,26.62,16,1,248,1250.32,1.0,1


In [226]:
PREDICTION_DATA_FILE=prediction_in

def load_prediction_data(datapath):
  """Loads data from CSV data file.

  Args:
    datapath: Location of the training file
  Returns:
    summary dataframe containing RFM data for btyd models
    actuals_df containing additional data columns for calculating error
  """
  # Does not used the summary_data_from_transaction_data from the Lifetimes
  # library as it wouldn't scale as well. The pre-processing done in BQ instead.
  tf.logging.info('Loading data...')

  ft_file = '{0}/{1}'.format(datapath, PREDICTION_DATA_FILE)
#[START prob_selec]
  df_ft = PREDICTION_DATA_FILE

  # Extracts relevant dataframes for RFM:
  # - summary has aggregated values before the threshold date
  # - actual_df has values of the overall period.
  summary = df_ft[['customer_id', 'frequency_btyd', 'recency', 'T',
                   'monetary_btyd']]
#[END prob_selec]
  summary.columns = ['customer_id', 'frequency', 'recency', 'T',
                     'monetary_value']
  summary = summary.set_index('customer_id')

  # additional columns needed for calculating error
  actual_df = df_ft[['customer_id', 'frequency_btyd', 'monetary_dnn',
                     'target_monetary']]
  actual_df.columns = ['customer_id', 'train_frequency', 'train_monetary',
                       'act_target_monetary']

  tf.logging.info('Data loaded.')

  return summary, actual_df

In [227]:

#Prediction run for prediction data
prediction_in=load_prediction_data(prediction_in)
#Predict
predict_cltv_prediction=predict_value(prediction_in[0],prediction_in[1],fit_model,model_run,365,12)

#Save Output training predictions
predict_cltv_prediction[0].to_csv('prediction_out_2022.csv')


INFO:tensorflow:Loading data...
INFO:tensorflow:Data loaded.


In [232]:
#sys.path.insert(0, '..')

client = bigquery.Client(location="US")

query = """
    SELECT * 
    FROM `gcp-gfs-datalab-e4sdt.clv_cov_adj.rev_pred_summary`
"""

query_job = client.query(
    query,
    # Location must match that of the dataset(s) referenced in the query.
    location="US",
)  # API request - starts the query


df_in = query_job.to_dataframe()
df_in['Offby_20']=df_in['Predicted_2020']-df_in['Actual_2020']
df_in['Error_20']=df_in['Offby_20']/df_in['Actual_2020']
df_in['Offby_21']=df_in['Predicted_2021']-df_in['Actual_2021']
df_in['Error_21']=df_in['Offby_21']/df_in['Actual_2021']
df_in
#BIGQUERY CONNECTION END -DC

Unnamed: 0,customer_id,Actual_2020,Predicted_2020,Actual_2021,Predicted_2021,Predicted_2022,Offby_20,Error_20,Offby_21,Error_21
0,100267739_6,2237.8314,1.491276e+04,0.0000,1.262241e+04,2476.785596,12674.930099,5.663934,12622.411574,inf
1,100118360_2,285.6506,4.226079e+03,186.5000,6.935828e+03,4936.729670,3940.427950,13.794573,6749.328337,36.189428
2,100110181_6,30405.3303,2.070471e+05,41002.0625,1.826399e+05,110829.220308,176641.801386,5.809567,141637.848944,3.454408
3,100170186_2,27242.4177,1.175721e+05,67567.8750,1.154698e+05,145540.160082,90329.676978,3.315773,47901.892020,0.708945
4,100123962_36,368640.8094,1.141020e+06,204678.6875,1.180704e+06,574130.920727,772378.802465,2.095207,976024.815583,4.768571
...,...,...,...,...,...,...,...,...,...,...
24567,100260533_6,32464.3417,7.500049e+04,53958.1250,9.394974e+04,127922.879896,42536.148951,1.310242,39991.617181,0.741160
24568,100099993_15,153655.5997,3.434764e+05,180184.6250,4.061601e+05,487984.520870,189820.789422,1.235365,225975.460417,1.254133
24569,100148255_38,53169.3274,1.635293e+05,37438.4375,1.493471e+05,136093.790040,110359.977532,2.075632,111908.651426,2.989138
24570,100009173_7,3175.0954,1.265224e+04,12885.8125,1.632738e+04,34041.380677,9477.144913,2.984838,3441.565698,0.267082


In [233]:
e_20=[]
e_21=[]
for i in range(2500):    
    samp=df_in.sample(n=250)
    avg_er_20=samp['Error_20'].mean()
    avg_er_21=samp['Error_21'].mean()
    e_20.append(avg_er_20)
    e_21.append(avg_er_21)
    
#e_20       
#e_21 # the list with the new items.

In [234]:
errors_2020=pd.DataFrame(e_20)
errors_2021=pd.DataFrame(e_21)

errors_2020.to_csv('errors_2020.csv')
errors_2021.to_csv('errors_2021.csv')

In [235]:
now = datetime.now()
run_end_time = now.strftime("%H:%M:%S")
print("done at", run_end_time)


done at 20:55:17
