# EV3 Character Recognition Experiment - Clean

1. Process raw data - clean and categorise into rotation buckets
2. Add target value column to clean file
3. Save clean files to folder
4. Concatenate all the clean data files into a single training file
5. Save training data file

## Process raw data results

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Show the current working directory; the ./raw_data and ./clean_data paths used in this notebook are relative to it
!pwd

/Users/garry/Documents/GitHub/datapiquing/lego/character recognition/ev3/numbers/number_predict


In [4]:
def create_rotation_dict():
    '''Return a new dictionary keyed by 0, 10, ..., 360 (37 bins) with every value set to zero'''
    # dict.fromkeys builds a fresh dict on every call, so callers can mutate the result freely
    return dict.fromkeys(range(0, 361, 10), 0)

In [5]:
# A function to place rotation values into buckets of 10 degrees (+ or - 5 degrees either side)
# These will be the feature columns of each sample rotation

def rotation_buckets(degrees):
    '''Return the 10 degree wide bucket (0, 10, ..., 360) that a rotation angle belongs to.

    The angle is wrapped into a single rotation and rounded to the nearest multiple of 10,
    with a remainder of exactly 5 rounding up. An angle at or just past a completed
    rotation (360 <= degrees with degrees % 360 < 5) is placed in the 360 bucket rather
    than the 0 bucket, so only readings below 360 can land in bucket 0.

    degrees: rotation angle in degrees; may exceed 360 when several rotations are recorded.
    '''
    if degrees >= 360 and degrees % 360 < 5:
        # Shift back by 5 degrees so the wrapped value lands in 355..359 and therefore
        # rounds up into the 360 bucket of the rotation that has just completed.
        # `>=` (not `>`) so that exactly 360 is treated the same as 361..364.
        degrees = (degrees - 5) % 360
    else:
        degrees = degrees % 360

    remainder = degrees % 10
    bucket = degrees - remainder
    if remainder >= 5:
        # Upper half of the 10 degree window belongs to the next bucket up
        bucket += 10
    return bucket

In [6]:
def create_clean_dataframe(filename):
    '''Read in a raw data file and return a clean DataFrame with a target column.

    The file ./raw_data/<filename> is read as two columns (rotation angle, reflectivity).
    Each angle is mapped to a 10 degree bucket with rotation_buckets, the samples are split
    into runs (one run per completed rotation), zero cells are imputed, and a 'target'
    column is added. The result is also written to ./clean_data/<filename> without the index.

    filename: name of a file inside ./raw_data. The character at index 5 is parsed as the
        target digit (for example 'train7_...' gives 7), so it must be a decimal digit.

    Returns a DataFrame with one row per completed rotation, integer columns
    0, 10, ..., 360 and an integer 'target' column.
    '''

    target_value = int(filename[5:6])

    df_raw_data = pd.read_csv(f'./raw_data/{filename}')

    # Remove the space that prefixes the 'reflectivity' column name
    # (this requires the raw file to have exactly two columns)
    df_raw_data.columns = ['angle', 'reflectivity']

    # The experiment was 1 run of X x 360 degree rotations
    # Put actual rotation angle into buckets of 10 degrees between 0 and 360 degrees
    # Collect every sample as a tuple of (bucket_degrees, reflectivity_percentage)

    samples = [
        (rotation_buckets(actual_degrees), reflectivity_percentage)
        for actual_degrees, reflectivity_percentage
        in zip(df_raw_data['angle'], df_raw_data['reflectivity'])
    ]

    # Convert the list of sample tuples (rotation bucket, reflectivity) to a dictionary of dictionaries where
    # the key is the run number and the value is a dictionary of rotation angles (from 0 to 360 degrees at
    # 10 degree intervals) and their respective reflectivity value

    run = 0
    results = {}
    current_run = create_rotation_dict()
    at_360 = False
    for bucket_degrees, reflectivity_percentage in samples:
        # Only the first sample that lands in the 360 bucket is recorded and closes the run;
        # any 360-bucket samples that immediately follow it are skipped
        if bucket_degrees < 360 or not at_360:
            current_run[bucket_degrees] = reflectivity_percentage
            if bucket_degrees >= 360:
                results[run] = current_run
                run += 1
                current_run = create_rotation_dict()

        at_360 = bucket_degrees >= 360

    # Create a DataFrame from the angle vs reflectivity results, then transpose it so each
    # rotation sample is a row and each 10 degree angle of rotation between 0 and 360 degrees
    # is a feature column

    df_results = pd.DataFrame.from_dict(results).transpose()

    # Impute any zero values
    # Where the 0 degree bucket is zero use the value of the 360 degree bucket

    df_results[0] = np.where(df_results[0] == 0, df_results[360], df_results[0])

    # Where a cell value is still zero convert it to NaN

    df_results = df_results.mask(df_results.eq(0))

    # Replace NaN values with the column mean (truncated to an integer)

    column_means = df_results.mean().astype(int)
    df_results = df_results.fillna(column_means).astype(int)

    # Create a target column with the value of the number scanned
    df_results['target'] = target_value

    # Export the number DataFrame to a .csv file
    df_results.to_csv(f'./clean_data/{filename}', index=False)

    return df_results

In [7]:
# Get a list of all raw data files collected
# `filename` is the pattern handed to grep; the IPython shell capture (`!`) returns the entries of
# ./raw_data whose names match it
filename = 'train'
all_filenames = !ls ./raw_data | grep -e $filename
print(all_filenames)

['train0_2020_04_12_07_36_54_845209.csv', 'train1_2020_04_12_07_31_05_399593.csv', 'train2_2020_04_12_07_31_38_675491.csv', 'train3_2020_04_12_07_32_34_012405.csv', 'train4_2020_04_12_07_33_11_693753.csv', 'train5_2020_04_12_07_33_59_050550.csv', 'train6_2020_04_12_07_34_34_754518.csv', 'train7_2020_04_12_07_35_10_862536.csv', 'train8_2020_04_12_07_35_48_986449.csv', 'train9_2020_04_12_07_36_22_538481.csv']


In [8]:
def combine_cleaned_dataframes(filename_list):
    '''Clean every raw data file named in filename_list and combine the results into one DataFrame.

    Each name is passed to create_clean_dataframe. The per-file DataFrames are stacked
    vertically with a fresh 0..n-1 index, written to ./clean_data/training_dataset.csv
    (without the index) and returned.

    filename_list: iterable of raw data file names accepted by create_clean_dataframe.

    Returns the combined DataFrame; an empty filename_list gives an empty DataFrame.
    '''

    # Iterate the argument rather than the notebook-global all_filenames so the function
    # processes exactly the files the caller asked for
    cleaned_frames = [create_clean_dataframe(filename) for filename in filename_list]

    # Concatenate once instead of growing the DataFrame inside the loop;
    # pd.concat rejects an empty list, so fall back to an empty DataFrame
    if cleaned_frames:
        df_clean_results = pd.concat(cleaned_frames, ignore_index=True)
    else:
        df_clean_results = pd.DataFrame()

    # Export the combined DataFrame to a .csv file
    df_clean_results.to_csv('./clean_data/training_dataset.csv', index=False)

    return df_clean_results

In [9]:
# Clean every listed raw file, build the combined training dataset, and display it as the cell output
dataset = combine_cleaned_dataframes(all_filenames)
dataset

Unnamed: 0,0,10,20,30,40,50,60,70,80,90,...,280,290,300,310,320,330,340,350,360,target
0,11,11,11,11,11,10,11,11,11,11,...,8,8,9,9,11,12,11,10,11,0
1,11,12,11,11,11,10,10,11,11,11,...,7,8,9,10,12,11,11,10,11,0
2,11,11,11,11,10,10,10,11,11,11,...,8,8,9,9,11,11,11,10,11,0
3,11,12,11,11,10,11,11,11,11,11,...,8,8,9,10,11,11,11,11,11,0
4,11,11,11,11,11,11,10,11,10,11,...,8,8,9,10,12,12,11,11,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4,5,5,7,8,8,8,9,10,9,...,5,6,6,7,7,6,5,5,4,9
96,5,5,5,7,8,8,9,10,9,9,...,6,6,5,6,7,6,6,5,5,9
97,5,5,6,7,8,8,9,9,9,9,...,5,5,6,6,6,6,5,4,5,9
98,4,5,6,7,8,9,9,9,10,9,...,6,6,6,6,7,6,5,4,4,9
