# Split data into train, validation & test sets

In [3]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Course code; it is upper-cased when building the feature file names.
_course = 'ca116'

In [5]:
# Academic years to process, as 2-tuples. The second element of each tuple
# is the year that appears in the corresponding feature file name.
_years = [
    (2016, 2017),
    (2017, 2018),
    (2018, 2019),
]

In [6]:
# Exam identifiers. Each value is matched against both the 'exam' and the
# 'week' column of the feature records when rows are selected.
exams = [4, 8, 12]

In [7]:
# Grade cut-off: a grade strictly below this value yields the label True.
THRESHOLD = 40

In [8]:
# Passed as test_size to train_test_split: the share of each training year
# that is held out as the validation set.
SPLIT = 0.2

In [9]:
# Sanity check: list the feature file expected for every academic year.
for _year in _years:
    # File name is built from the upper-cased course code and the second
    # element of the academic-year tuple.
    filename = '../../data/features/features_%s_%s.json' % (_course.upper(), _year[1])
    # IPython shell escape; "$filename" is expanded from the Python variable above.
    !ls -l "$filename"

-rw-r--r--  1 dazconap  staff  1711140 Apr  9 19:53 ../../data/features/features_CA116_2017.json
-rw-r--r--  1 dazconap  staff  2227002 Apr  9 20:10 ../../data/features/features_CA116_2018.json
-rw-r--r--  1 dazconap  staff  1844783 Apr  9 21:53 ../../data/features/features_CA116_2019.json


In [10]:
# Both dicts are keyed by the academic-year tuple:
#   features -> the parsed JSON content of that year's feature file
#   students -> the set of distinct 'student' values found in those records
students = {}
features = {}

for _year in _years:
    # One JSON file per year, named after the upper-cased course code and the
    # second element of the academic-year tuple.
    filename = '../../data/features/features_%s_%s.json' % (_course.upper(), _year[1])
    with open(filename) as f:
        features[_year] = data = json.load(f)
    students[_year] = {feature['student'] for feature in data}

In [11]:
# One summary line per academic year. "Features" is the number of feature
# records loaded for the year, not the number of feature columns.
for _year in _years:
    n_students = len(students[_year])
    n_records = len(features[_year])
    print('Year: {}, Students: {}, Features: {}'.format(_year, n_students, n_records))

Year: (2016, 2017), Students: 126, Features: 1512
Year: (2017, 2018), Students: 156, Features: 1872
Year: (2018, 2019), Students: 130, Features: 1560


In [12]:
# Years whose students are divided into the training and validation sets.
training_and_validation_years = [
    (2016, 2017),
    (2017, 2018)
]

# Year kept aside in full as the test set; it is never split.
test_year = (2018, 2019)

In [11]:
# Build the train / validation sets for every exam.
#
# X holds the student identifiers and y the boolean label "grade below
# THRESHOLD". Every training year is split on its own (fixed seed, SPLIT held
# out for validation); the per-year pieces are then concatenated and written
# to ../../data/features/ as {X,y}_{train,val}_<exam>.npy.

# The DataFrame of a year does not depend on the exam, so build it once per
# year instead of once per (exam, year) pair.
frames_by_year = {
    _year: pd.DataFrame(features[_year])
    for _year in training_and_validation_years
}

for _exam in exams:

    print('Exam: {}'.format(_exam))

    X_train, X_val = [], []
    y_train, y_val = [], []

    for _year in training_and_validation_years:

        df = frames_by_year[_year]

        # Keep the rows whose 'exam' and 'week' both equal the exam value
        # (presumably the snapshot taken in the week of that exam -- confirm).
        dataframe = df[(df['exam'] == _exam) & (df['week'] == _exam)][[
            'student', 'grade', 'academic_year_0', 'academic_year_1']]

        X = list(dataframe['student'])
        # NOTE(review): a missing grade compares False here and is therefore
        # labelled as "not below threshold".
        y = list(dataframe['grade'] < THRESHOLD)

        # TRAIN / VALIDATION split; X and y are shuffled together so that
        # labels stay aligned with their student.
        _X_train, _X_val, _y_train, _y_val = train_test_split(
            X, y, test_size=SPLIT, random_state=42)

        print('Year: {}, Exam Week: {}, Train: {}-{} Validation: {}-{}'.format(
            _year, _exam, len(_X_train), len(_y_train), len(_X_val), len(_y_val)))

        X_train.extend(_X_train)
        X_val.extend(_X_val)
        y_train.extend(_y_train)
        y_val.extend(_y_val)

    np.save('../../data/features/X_train_{}.npy'.format(_exam), X_train)
    np.save('../../data/features/X_val_{}.npy'.format(_exam), X_val)
    # The labels used to be computed and split but then thrown away; persist
    # them next to the identifiers, as is done for the test set.
    np.save('../../data/features/y_train_{}.npy'.format(_exam), y_train)
    np.save('../../data/features/y_val_{}.npy'.format(_exam), y_val)

Exam: 4
Year: (2016, 2017), Exam Week: 4, Train: 100-100 Validation: 26-26
Year: (2017, 2018), Exam Week: 4, Train: 124-124 Validation: 32-32
Exam: 8
Year: (2016, 2017), Exam Week: 8, Train: 100-100 Validation: 26-26
Year: (2017, 2018), Exam Week: 8, Train: 124-124 Validation: 32-32
Exam: 12
Year: (2016, 2017), Exam Week: 12, Train: 100-100 Validation: 26-26
Year: (2017, 2018), Exam Week: 12, Train: 124-124 Validation: 32-32


In [14]:
# Export the test set (all students of test_year) for every exam.
#
# X_test holds the student identifiers and y_test the boolean label "grade
# below THRESHOLD"; both are written to ../../data/features/ per exam.

# The DataFrame is identical for every exam, so build it once up front rather
# than on each loop iteration.
df = pd.DataFrame(features[test_year])

for _exam in exams:

    print('Exam: {}'.format(_exam))

    # Rows whose 'exam' and 'week' both equal the exam value, selected with a
    # single .loc call instead of chained indexing.
    exam_rows = (df['exam'] == _exam) & (df['week'] == _exam)
    dataframe = df.loc[exam_rows, [
        'student', 'grade', 'academic_year_0', 'academic_year_1']]

    X_test = list(dataframe['student'])
    # NOTE(review): a missing grade compares False here and is therefore
    # labelled as "not below threshold".
    y_test = list(dataframe['grade'] < THRESHOLD)

    np.save('../../data/features/X_test_{}.npy'.format(_exam), X_test)
    np.save('../../data/features/y_test_{}.npy'.format(_exam), y_test)

Exam: 4
Exam: 8
Exam: 12
