In [20]:
import os
import pickle
import pandas as pd

In [27]:
# Load the OULAD action/demographic CSV into a DataFrame.
data_path = '../data/oulad/action_demograph_OULAD.csv'
oulad = pd.read_csv(filepath_or_buffer=data_path)

# Integer encodings for the categorical columns, keyed by column name.
# Each inner dict maps a raw category string to its integer code. Most
# columns are coded from 0; 'highest_education' and 'imd_band' start at 1.
# A raw value with no entry here has no code, so it turns into NaN when the
# mapping is applied with ``Series.map``.
mappings = {
    'code_module': {'AAA': 0, 'BBB': 1, 'CCC': 2, 'DDD': 3, 'EEE': 4, 'FFF': 5, 'GGG': 6},
    'gender': {'M': 0, 'F': 1},
    'code_presentation': {'2013J': 0, '2013B': 1, '2014J': 2, '2014B': 3},
    # NOTE(review): twelve regions are listed. If the CSV contains any other
    # region (the public OULAD release also has 'Wales'), those rows get no
    # code -- confirm whether that omission is intentional.
    'region': {
        'Scotland': 0, 'East Midlands Region': 1, 'South Region': 2, 'East Anglian Region': 3,
        'West Midlands Region': 4, 'South West Region': 5, 'South East Region': 6,
        'North Western Region': 7, 'Yorkshire Region': 8, 'London Region': 9,
        'Ireland': 10, 'North Region': 11
    },
    'highest_education': {
        'HE Qualification': 1, 'A Level or Equivalent': 2, 'Lower Than A Level': 3,
        'Post Graduate Qualification': 4, 'No Formal quals': 5
    },
    # The ninth band is '80-90%'; it was previously misspelled '90-90%',
    # which left the real '80-90%' band without a code.
    # NOTE(review): '10-20' has no percent sign -- presumably this mirrors the
    # spelling used in the raw data; verify against the CSV.
    'imd_band': {
        '70-80%': 1, '60-70%': 2, '50-60%': 3, '40-50%': 4, '30-40%': 5,
        '20-30%': 6, '10-20': 7, '0-10%': 8, '80-90%': 9, '90-100%': 10
    },
    'age_band': {'0-35': 0, '35-55': 1, '55<=': 2},
    'disability': {'Y': 1, 'N': 0}
}

# Swap every categorical column for its integer code. Raw values that are
# absent from a column's mapping come back from ``Series.map`` as NaN.
for column in mappings:
    mapping = mappings[column]
    oulad[column] = oulad[column].map(mapping)

# Discard each row that holds a NaN in any column once the codes are in place.
oulad = oulad.dropna(axis=0, how='any')

# Derive a binary target from the grade.
def binary_map(x):
    """Return 0 when *x* is below 0.5, and 1 in every other case."""
    if x < 0.5:
        return 0
    return 1


oulad['labels'] = oulad['grade'].apply(binary_map)

# Remove the columns that are no longer needed now that the label exists.
oulad.drop(columns=['grade', 'final_result'], inplace=True)

# Columns whose values form each learner's feature vector.
feature_columns = ['code_module', 'code_presentation', 'num_of_prev_attempts', 'studied_credits']

# Demographic attributes stored on every record and advertised in the output.
available_demographics = ['gender', 'region', 'age_band', 'disability', 'highest_education']

# Build one record per row, keyed by the row's index label.
records = {}
for i_row, row in oulad.iterrows():
    record = {
        'learner_id': int(i_row),
        'features': row[feature_columns].values,
    }
    # Demographics are stored as plain ints under their column names.
    for attribute in available_demographics:
        record[attribute] = int(row[attribute])
    record['binary_label'] = int(row['labels'])
    records[i_row] = record

data_file = {'data': records, 'available_demographics': available_demographics}

# Write the assembled dictionary to disk as a pickle.
output_path = '../data/oulad/data_dictionary.pkl'
with open(output_path, mode='wb') as fp:
    pickle.dump(data_file, fp)