In [1]:
import os
import pickle
import pandas as pd

In [2]:
# Read data
data_path = '../data/student-performance-por/student-por.csv'
studentPerf = pd.read_csv(data_path, delimiter = ';')
print(studentPerf.head())

binary_map = {**dict.fromkeys(range(0, 11), 0), **dict.fromkeys(range(11, 21), 1)}
studentPerf['labels'] = studentPerf['G3'].map(binary_map)
studentPerf.drop('G3', axis=1, inplace=True)

# Define mappings for categorical columns
mappings = {
    'school': {'GP': 0, 'MS': 1},
    'sex': {'M': 0, 'F': 1},
    'age': {15: 0, 16: 1, 17: 2, 18: 3, 19: 4, 20: 5, 21: 6, 22: 7},
    'address': {'U': 0, 'R': 1},
    'famsize': {'LE3': 0, 'GT3': 1},  # Fixed the value for 'GT3' to 1
    'Pstatus': {'T': 0, 'A': 1},
    'Mjob': {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4},
    'Fjob': {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4},
    'reason': {'home': 0, 'reputation': 1, 'course': 2, 'other': 3},
    'guardian': {'mother': 0, 'father': 1, 'other': 2},
    'schoolsup': {'yes': 0, 'no': 1},
    'famsup': {'yes': 0, 'no': 1},
    'paid': {'yes': 0, 'no': 1},
    'activities': {'yes': 0, 'no': 1},
    'nursery': {'yes': 0, 'no': 1},
    'higher': {'yes': 0, 'no': 1},
    'internet': {'yes': 0, 'no': 1},
    'romantic': {'yes': 0, 'no': 1},
}

# Apply mappings to relevant columns
for column, mapping in mappings.items():
    studentPerf[column] = studentPerf[column].map(mapping)


##
# Create data file
data_file = {'data':{}, 'available_demographics': ['age', 'sex', 'famsize', 'famsup',  'higher']}
features_indices = [i+1 for i in range(20)]
for i_row, row in studentPerf.iterrows():
    data_file['data'][i_row] = {
        'learner_id': i_row,
        'features': row[features_indices].values,
        'sex': row[1],
        'age': row[2],
        'famsize': row[4],
        'famsup': row[16],
        'higher': row[21],
        'binary_label': int(row['labels'])
    }

# Save data file
with open('../data/student-performance-por/data_dictionary.pkl', 'wb') as fp:
    pickle.dump(data_file, fp)

  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
0      4        3      4     1     1      3        4   0  11  11  
1      5        3      3     1     1      3        2   9  11  11  
2      4        3      2     2     3      3        6  12  13  12  
3      3        2      2     1     1      5        0  14  14  14  
4      4        3      2     1     2      5        0  11  13  13  

[5 rows x 33 columns]


  'features': row[features_indices].values,
  'sex': row[1],
  'age': row[2],
  'famsize': row[4],
  'famsup': row[16],
  'higher': row[21],
