# Raw Data
The data is split into Estimation (aka train) and Competition (aka test) sets.
## Description Data
From https://web.archive.org/web/20201018045339/https://sites.google.com/site/techpredcomp/rawdecrisk
## Experience Data
https://web.archive.org/web/20201018045340/https://sites.google.com/site/techpredcomp/raw-data-experience


In [1]:
import pandas as pd
import os
from glob import glob

In [2]:
# Directory the raw input files are read from.
input_path = "../data/2010/raw"
# Directory the processed CSV files are written to.
output_path = "../data/2010/processed"
# Maps each raw description-data folder (under input_path) to the name of the
# CSV file (under output_path) that receives its concatenated contents.
# NOTE(review): "RawDesComp" maps to the *_est file while the folder whose name
# contains "Est" maps to the *_comp file, which is the opposite of the naming
# pattern used in experience_files below. Confirm this mapping is not swapped.
description_folders = {
    "RawDecRiskEstComp08": "description_comp.csv",
    "RawDesComp": "description_est.csv",
}
# Maps each raw experience-data file (under input_path) to the name of the CSV
# file (under output_path) it is converted to.
experience_files = {
    "RawExpEst_Mar08.txt": "experience_est.csv",
    "RawExpComp_May08.txt": "experience_comp.csv",
}
# The variables are organized as follows: Problem, Order, High, P(high), Low, Medium, Choice.
# These names are applied as column headers when the description files are read.
description_header = ["Problem", "Order", "High", "Phigh", "Low", "Medium", "Choice"]

In [3]:
# Create the output directory (including missing parents) before any file is
# written; exist_ok=True keeps re-running this cell from raising.
os.makedirs(output_path, exist_ok=True)
# experience files
def parse_experience(input, output):
    """Convert one raw experience file into a processed CSV.

    Reads the whitespace-delimited file ``input`` from ``input_path``,
    normalises its column names, and writes the table as ``output`` inside
    ``output_path`` without the index column. Progress is printed along the
    way.

    Args:
        input: File name of the raw experience file, relative to
            ``input_path``.
        output: File name of the CSV to create, relative to ``output_path``.
    """
    print(f"reading {input}")
    table = pd.read_csv(f"{input_path}/{input}", sep=r"\s+")

    # Normalise the headers so that only the first letter is upper-case
    # (e.g. "RISK" / "risk" -> "Risk").
    capitalised = table.columns.str.lower().str.capitalize()
    table.columns = capitalised

    # After normalisation a "Risk" column, if present, is exposed as "Choice".
    renamed = table.rename(columns={"Risk": "Choice"})

    print(f"writing {output}")
    renamed.to_csv(f"{output_path}/{output}", index=False)
    print(f"Processed {input} to {output}")


# Convert every raw experience file to its processed CSV counterpart.
# The loop variables are deliberately not named `input`: at module level that
# would rebind the `input` builtin for the rest of the session.
for raw_name, csv_name in experience_files.items():
    parse_experience(raw_name, csv_name)

reading RawExpEst_Mar08.txt
writing experience_est.csv
Processed RawExpEst_Mar08.txt to experience_est.csv
reading RawExpComp_May08.txt
writing experience_comp.csv
Processed RawExpComp_May08.txt to experience_comp.csv


In [4]:
# description files
def parse_description(input_folder, output_filename):
    """Merge every ``*.txt`` file of one description folder into a single CSV.

    Each file is read as a whitespace-delimited table without a header row,
    using ``description_header`` for the column names. The tables are
    concatenated and written to ``output_filename`` inside ``output_path``
    without the index column. Progress is printed along the way.

    Args:
        input_folder: Folder name, relative to ``input_path``, that holds the
            raw ``*.txt`` files.
        output_filename: File name of the CSV to create, relative to
            ``output_path``.

    Raises:
        ValueError: If the folder contains no ``*.txt`` files.
    """
    # glob() returns paths in arbitrary, OS-dependent order; sorting makes the
    # row order of the combined CSV reproducible across runs and machines.
    files = sorted(glob(f"{input_path}/{input_folder}/*.txt"))
    print(f"Found {len(files)} files")
    if not files:
        # pd.concat([]) would raise a ValueError that does not say which
        # folder was empty; raise the same type with a useful message.
        raise ValueError(f"No .txt files found in {input_path}/{input_folder}")

    frames = [
        pd.read_csv(path, sep=r"\s+", header=None, names=description_header)
        for path in files
    ]
    df = pd.concat(frames)
    print(f"writing {output_filename}")
    df.to_csv(f"{output_path}/{output_filename}", index=False)
    print(f"Processed {input_folder} to {output_filename}")


# Build one combined CSV per description folder listed in description_folders.
for folder_name, output_filename in description_folders.items():
    parse_description(folder_name, output_filename)

Found 20 files
writing description_comp.csv
Processed RawDecRiskEstComp08 to description_comp.csv
Found 20 files
writing description_est.csv
Processed RawDesComp to description_est.csv
