In [2]:
from functions import *
from ipynb.fs.full.Student_Info_and_Registration import student_info_reg
from ipynb.fs.full.Assessments import assessments
from ipynb.fs.full.Student_Assessment import student_assessment

---

# Assessments and Student Assessments

---

## Assessments and Student Assessments Merged Dataframe:

Here we will merge the assessments and student assessments dataframes in order to combine our student scores and submission dates with assessment type, date of the assessment, and weight of the assessment.

In [3]:
# merges dataframes student_assessment with assessments with a full outer join on their common ID id_assessment
# creates a column _merge which tells you if the id_assessment was found in one or both dataframes
merged_assessments = student_assessment.merge(assessments, how='outer', on=['id_assessment'] ,indicator=True)
merged_assessments.head()

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score,code_module,code_presentation,assessment_type,date,weight,_merge
0,1752,11391,18,0,78,AAA,2013J,TMA,19.0,10.0,both
1,1752,28400,22,0,70,AAA,2013J,TMA,19.0,10.0,both
2,1752,31604,17,0,72,AAA,2013J,TMA,19.0,10.0,both
3,1752,32885,26,0,69,AAA,2013J,TMA,19.0,10.0,both
4,1752,38053,19,0,79,AAA,2013J,TMA,19.0,10.0,both


Our new `_merge` column tells us if the data maps perfectly, or if it is only found on the right or left side, the right side being the assessments dataframe and the left side being the student_assessments dataframe. 

**Rows that do not map:**

In [4]:
missing_exams = merged_assessments.loc[merged_assessments['_merge'] != 'both'].reset_index(drop=True)
missing_exams

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score,code_module,code_presentation,assessment_type,date,weight,_merge
0,15020,653465,12,0,1,,,,,,left_only
1,15020,653541,17,0,100,,,,,,left_only
2,15020,653577,18,0,1,,,,,,left_only
3,15020,653597,17,0,100,,,,,,left_only
4,15020,653633,16,0,100,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...
51944,37443,546724,230,0,100,,,,,,left_only
51945,37443,558486,224,0,80,,,,,,left_only
51946,37424,,,,,GGG,2013J,Exam,229.0,100.0,right_only
51947,37434,,,,,GGG,2014B,Exam,222.0,100.0,right_only


In [7]:
md(f'''
These {len(missing_exams)} rows all have entries in the assessments dataframe but have no match in the student_assessment dataframe. 
This indicates that no students in our data took these exams. These will be the missing final exams.
''')


These 51949 rows all have entries in the assessments dataframe but have no match in the student_assessment dataframe. 
This indicates that no students in our data took these exams. These will be the missing final exams.


In [8]:
# remove tests that students did not take
merged_assessments = merged_assessments.dropna(subset=['id_student'])
# drop the merge column since it is no longer of use
# reset the index to be consecutive again
merged_assessments = merged_assessments.drop(columns=['_merge']).reset_index(drop=True)
# order the columns
merged_assessments = merged_assessments[['code_module', 'code_presentation', 'id_student', 'id_assessment', 'assessment_type', 'date_submitted', 'date', 'weight', 'score']]
# make a list of missing exams
missing_exams_list = list(missing_exams['id_assessment'])

**Removing Eliminated Students**

**Merged Assessment/Student_info dataframes**

We will be predicting student assessment scores based on their demographics so we need a dataframe that combines student info with our merged assessments

In [9]:
# merged 'student info/assessments' with a full outer join on their common columns
merged_si_assm = merged_assessments.merge(student_info_reg, how='outer', on=['id_student', 'code_module', 'code_presentation'], indicator=True)
merged_si_assm.head()

Unnamed: 0,code_module,code_presentation,id_student,id_assessment,assessment_type,date_submitted,date,weight,score,region,imd_band,age_band,gender,highest_education,disability,final_result,date_registration,date_unregistration,_merge
0,AAA,2013J,11391,1752,TMA,18,19.0,10.0,78,East Anglian Region,90-100%,55<=,M,HE Qualification,False,Pass,-159,,both
1,AAA,2013J,11391,1753,TMA,53,54.0,20.0,85,East Anglian Region,90-100%,55<=,M,HE Qualification,False,Pass,-159,,both
2,AAA,2013J,11391,1754,TMA,115,117.0,20.0,80,East Anglian Region,90-100%,55<=,M,HE Qualification,False,Pass,-159,,both
3,AAA,2013J,11391,1755,TMA,164,166.0,20.0,85,East Anglian Region,90-100%,55<=,M,HE Qualification,False,Pass,-159,,both
4,AAA,2013J,11391,1756,TMA,212,215.0,30.0,82,East Anglian Region,90-100%,55<=,M,HE Qualification,False,Pass,-159,,both


In [10]:
stud_assm_uniques = student_assessment['id_student'].nunique()
stud_info_uniques = student_info_reg['id_student'].nunique()
unique_diff = stud_info_uniques - stud_assm_uniques
md(f'''
We found earlier in student assessments that there were only {"{:,}".format(stud_assm_uniques)} unique students with test scores
out of a total of {"{:,}".format(stud_info_uniques)} students in our combined student_info and student_registration dataframe.
This means that there are {unique_diff} students without assessment scores which should be found by locating all of the data with a value of `right_only`
in the `_merge` column.
''')


We found earlier in student assessments that there were only 23,369 unique students with test scores
out of a total of 28,785 students in our combined student_info and student_registration dataframe.
This means that there are 5416 students without assessment scores which should be found by locating all of the data with a value of `right_only`
in the `_merge` column.


**Students without any test scores**:

In [11]:
# variable for where merge is right_only, and only found in student_info
only_student_info = merged_si_assm.loc[merged_si_assm['_merge']=='right_only']
only_student_info

Unnamed: 0,code_module,code_presentation,id_student,id_assessment,assessment_type,date_submitted,date,weight,score,region,imd_band,age_band,gender,highest_education,disability,final_result,date_registration,date_unregistration,_merge
173912,AAA,2013J,30268,,,,,,,North Western Region,30-40%,35-55,F,A Level or Equivalent,True,Withdrawn,-92,12,right_only
173913,AAA,2013J,135335,,,,,,,East Anglian Region,20-30%,0-35,F,Lower Than A Level,False,Withdrawn,-29,30,right_only
173914,AAA,2013J,281589,,,,,,,North Western Region,30-40%,0-35,M,HE Qualification,False,Fail,-50,,right_only
173915,AAA,2013J,292923,,,,,,,South East Region,90-100%,35-55,F,A Level or Equivalent,False,Withdrawn,-162,-121,right_only
173916,AAA,2013J,305539,,,,,,,Wales,80-90%,0-35,F,Lower Than A Level,False,Withdrawn,-54,-3,right_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182998,GGG,2014J,2640965,,,,,,,Wales,10-20%,0-35,F,Lower Than A Level,False,Fail,-4,,right_only
182999,GGG,2014J,2645731,,,,,,,East Anglian Region,40-50%,35-55,F,Lower Than A Level,False,Distinction,-23,,right_only
183000,GGG,2014J,2648187,,,,,,,South Region,20-30%,0-35,F,A Level or Equivalent,True,Pass,-129,,right_only
183001,GGG,2014J,2679821,,,,,,,South East Region,90-100%,35-55,F,Lower Than A Level,False,Withdrawn,-49,101,right_only


In [12]:
md(f'''
    We have {"{:,}".format(len(only_student_info))} values in only student_info, which means we have students for whom we have no test scores.
    We can drop those which are missing values for the purpose of this dataframe since we are just analyzing test scores
    ''')


    We have 9,091 values in only student_info, which means we have students for whom we have no test scores.
    We can drop those which are missing values for the purpose of this dataframe since we are just analyzing test scores
    

In [13]:
# removing any student with NaN values in id_assessment or final_result
merged_si_assm = merged_si_assm.dropna(subset=['id_assessment', 'final_result'])

In [14]:
# reordering dataframe columns to group like data
# dropping the _merge column in the process
merged_si_assm = merged_si_assm[['code_module', 'code_presentation', 'id_student', 'region', 'imd_band', 'age_band', 'gender', 'highest_education', 'disability', 'final_result', 'id_assessment', 'assessment_type', 'date_submitted', 'date', 'weight', 'score']]
# converting the data types back
merged_si_assm = merged_si_assm.convert_dtypes(convert_integer=False)
# reset the index
merged_si_assm = merged_si_assm.reset_index(drop=True)

In [15]:
merged_si_assm.head()

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,id_assessment,assessment_type,date_submitted,date,weight,score
0,AAA,2013J,11391,East Anglian Region,90-100%,55<=,M,HE Qualification,False,Pass,1752,TMA,18,19.0,10.0,78
1,AAA,2013J,11391,East Anglian Region,90-100%,55<=,M,HE Qualification,False,Pass,1753,TMA,53,54.0,20.0,85
2,AAA,2013J,11391,East Anglian Region,90-100%,55<=,M,HE Qualification,False,Pass,1754,TMA,115,117.0,20.0,80
3,AAA,2013J,11391,East Anglian Region,90-100%,55<=,M,HE Qualification,False,Pass,1755,TMA,164,166.0,20.0,85
4,AAA,2013J,11391,East Anglian Region,90-100%,55<=,M,HE Qualification,False,Pass,1756,TMA,212,215.0,30.0,82


---

## Testing Area

In [16]:
# current
missing_exams_list = list(missing_exams['id_assessment'])
count = 0
stud_missing_assessments = {}
value_list = []
student_list = []
merged_assessments['module_presentation'] = merged_assessments['code_module'] + merged_assessments['code_presentation']
assessments['module_presentation'] = assessments['code_module'] + assessments['code_presentation']


for index, row in merged_assessments['module_presentation'].iteritems():
    module_assessments = assessments.loc[assessments['module_presentation'] == row]
    tests_student_has = list(merged_assessments.loc[merged_assessments['id_student'] == merged_assessments['id_student'][index], 'id_assessment'])
    exams_with_scores = tests_student_has + missing_exams_list
    missing_assessments = [x for x in list(module_assessments['id_assessment'].values) if x not in exams_with_scores]
    if len(missing_assessments) != 0:
        stud_missing_assessments[merged_assessments['id_student'][index]] = list(missing_assessments)
        print(f"{len(stud_missing_assessments)} student records checked", end="\r")
    

1869 student records checked

KeyboardInterrupt: 

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight,module_presentation
0,AAA,2013J,1752,TMA,19.0,10.0,AAA2013J
1,AAA,2013J,1753,TMA,54.0,20.0,AAA2013J
2,AAA,2013J,1754,TMA,117.0,20.0,AAA2013J
3,AAA,2013J,1755,TMA,166.0,20.0,AAA2013J
4,AAA,2013J,1756,TMA,215.0,30.0,AAA2013J
5,AAA,2013J,1757,Exam,268.0,100.0,AAA2013J


In [None]:
assessment_date = module_assessments.loc[module_assessments['id_assessment'] == ]
    value_list = [merged_assessments['code_module'][index], merged_assessments['code_presentation'][index], merged_assessments['id_student'][index], module_assessments['id_assessment'][i], merged_assessments['score'][index], merged_assessments['date_submitted'][index], module_assessments['assessment_type'][i], module_assessments['date'][i], module_assessments['weight'][i], 0]
    my_list.append(value_list)

In [26]:
my_list.loc[my_list[2] == '186780']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1262,BBB,2013B,186780,14991,0,77,CMA,54.0,1.0,0
1263,BBB,2013B,186780,14992,0,77,CMA,89.0,1.0,0
1264,BBB,2013B,186780,14993,0,77,CMA,124.0,1.0,0
1265,BBB,2013B,186780,14994,0,77,CMA,159.0,1.0,0
1266,BBB,2013B,186780,14995,0,77,CMA,187.0,1.0,0
1267,BBB,2013B,186780,14986,0,77,TMA,89.0,18.0,0
1268,BBB,2013B,186780,14987,0,77,TMA,124.0,18.0,0
1269,BBB,2013B,186780,14988,0,77,TMA,159.0,18.0,0
1270,BBB,2013B,186780,14989,0,77,TMA,187.0,18.0,0
4850,BBB,2013B,186780,14991,0,77,CMA,54.0,1.0,0


In [25]:
get_dupes(my_list)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
4850,BBB,2013B,186780,14991,0,77,CMA,54.0,1.0,0
4851,BBB,2013B,186780,14992,0,77,CMA,89.0,1.0,0
4852,BBB,2013B,186780,14993,0,77,CMA,124.0,1.0,0
4853,BBB,2013B,186780,14994,0,77,CMA,159.0,1.0,0
4854,BBB,2013B,186780,14995,0,77,CMA,187.0,1.0,0
4855,BBB,2013B,186780,14986,0,77,TMA,89.0,18.0,0
4856,BBB,2013B,186780,14987,0,77,TMA,124.0,18.0,0
4857,BBB,2013B,186780,14988,0,77,TMA,159.0,18.0,0
4858,BBB,2013B,186780,14989,0,77,TMA,187.0,18.0,0
5170,BBB,2013B,386348,14987,78,-1,TMA,124.0,18.0,0


In [None]:
my_list[4930:]

In [None]:
pd.DataFrame(my_list)

In [None]:
missing_exams_list = list(missing_exams['id_assessment'])
couant = 0
my_list = []

merged_assessments['module_presentation'] = merged_assessments['code_module'] + merged_assessments['code_presentation']
assessments['module_presentation'] = assessments['code_module'] + assessments['code_presentation']

print(start)

for index, row in merged_assessments['module_presentation'].iteritems():
    for i, r in assessments['module_presentation'].iteritems():
        if row == r:
            tests_student_has = list(merged_assessments.loc[merged_assessments['id_student'] == merged_assessments['id_student'][index], 'id_assessment'])
            bad_lists = tests_student_has+missing_exams_list
            if assessments['id_assessment'][i] not in bad_lists:
                value_dict = {'code_module':merged_assessments['code_module'][index], 'code_presentation':merged_assessments['code_presentation'][index], 'id_student':merged_assessments['id_student'][index], 'id_assessment':assessments['id_assessment'][i], 'score':merged_assessments['score'][index], 'date_submitted':merged_assessments['date_submitted'][index],'assessment_type':assessments['assessment_type'][i], 'date':assessments['date'][i], 'weight':assessments['weight'][i]}
    my_list.append(value_dict)
    print(f"{len(my_list)}/infinite rows appended", end="\r")
print(time.process_time() - start)

In [None]:
#good

merged_assessments['module_presentation'] = merged_assessments['code_module'] + merged_assessments['code_presentation']
assessments['module_presentation'] = assessments['code_module'] + assessments['code_presentation']
start = time.process_time()
missing_exams_list = list(missing_exams['id_assessment'])
count = 0

for index, row in merged_assessments.iterrows():
    for i, r in assessments.iterrows():
        if assessments['module_presentation'][i] == merged_assessments['module_presentation'][index]:
            tests_student_has = list(merged_assessments.loc[merged_assessments['id_student'] == merged_assessments['id_student'][index], 'id_assessment'])
            bad_lists = tests_student_has+missing_exams_list
            if assessments['id_assessment'][i] not in bad_lists:
                value_series = pd.Series([merged_assessments['code_module'][index], merged_assessments['code_presentation'][index], merged_assessments['id_student'][index], assessments['id_assessment'][i], merged_assessments['score'][index], merged_assessments['date_submitted'][index], assessments['assessment_type'][i], assessments['date'][i], assessments['weight'][i]])
                new_df = new_df.append(value_series, ignore_index=True)
                print(f"{len(new_df)}/ rows appended", end="\r")
                continue

In [None]:
pd.DataFrame(my_list)[0:50]

In [None]:
start = time.process_time()
missing_exams_list = list(missing_exams['id_assessment'])
count = 0
my_list = []
d = {}
print(start)
for index, row in merged_assessments.iterrows():
    for i, r in assessments.iterrows():
        if assessments['code_module'][i] == merged_assessments['code_module'][index] and assessments['code_presentation'][i] == merged_assessments['code_presentation'][index]:
            value_dict = {'code_module':merged_assessments['code_module'][index], 'code_presentation':merged_assessments['code_presentation'][index], 'id_student':merged_assessments['id_student'][index], 'id_assessment':assessments['id_assessment'][i], 'score':merged_assessments['score'][index], 'date_submitted':merged_assessments['date_submitted'][index],'assessment_type':assessments['assessment_type'][i], 'date':assessments['date'][i], 'weight':assessments['weight'][i]}
            tests_student_has = list(merged_assessments.loc[merged_assessments['id_student'] == merged_assessments['id_student'][index], 'id_assessment'])
            if assessments['id_assessment'][i] not in (tests_student_has):
                    if assessments['id_assessment'][i] not in (missing_exams_list):
                            d[count] = value_dict
                            count+=1
                            print(f"{len(d)}/ rows appended", end="\r")
                            continue
print(time.process_time() - start)

In [None]:
start = time.process_time()
missing_exams_list = list(missing_exams['id_assessment'])
count = 0
my_list = []
d = {}
print(start)
for index, row in merged_assessments.iterrows():
    for i, r in assessments.iterrows():
        if assessments['code_module'][i] == merged_assessments['code_module'][index] and assessments['code_presentation'][i] == merged_assessments['code_presentation'][index]:
            value_dict = {'code_module':merged_assessments['code_module'][index], 'code_presentation':merged_assessments['code_presentation'][index], 'id_student':merged_assessments['id_student'][index], 'id_assessment':assessments['id_assessment'][i], 'score':merged_assessments['score'][index], 'date_submitted':merged_assessments['date_submitted'][index],'assessment_type':assessments['assessment_type'][i], 'date':assessments['date'][i], 'weight':assessments['weight'][i]}
            tests_student_has = list(merged_assessments.loc[merged_assessments['id_student'] == merged_assessments['id_student'][index], 'id_assessment'])
            if assessments['id_assessment'][i] not in (tests_student_has):
                    if assessments['id_assessment'][i] not in (missing_exams_list):
                            d[count] = value_dict
                            count+=1
                            print(f"{len(d)}/ rows appended", end="\r")
                            continue
print(time.process_time() - start)

In [None]:
d = {}
count = 0

start = time.process_time()
missing_exams_list = list(missing_exams['id_assessment'])
my_list = []
d = {}
print(start)
for index, row in merged_assessments.iterrows():
    for i, r in assessments.iterrows():
        if assessments['code_module'][i] == merged_assessments['code_module'][index] and assessments['code_presentation'][i] == merged_assessments['code_presentation'][index]:
            value_dict = {'code_module':merged_assessments['code_module'][index], 'code_presentation':merged_assessments['code_presentation'][index], 'id_student':merged_assessments['id_student'][index], 'id_assessment':assessments['id_assessment'][i], 'score':merged_assessments['score'][index], 'date_submitted':merged_assessments['date_submitted'][index],'assessment_type':assessments['assessment_type'][i], 'date':assessments['date'][i], 'weight':assessments['weight'][i]}
            tests_student_has = list(merged_assessments.loc[merged_assessments['id_student'] == merged_assessments['id_student'][index], 'id_assessment'])
            if assessments['id_assessment'][i] not in (tests_student_has):
                    if assessments['id_assessment'][i] not in (missing_exams_list):
                            d[count] = value_dict
                            count+=1
                            print(f"{len(d)}/ rows appended", end="\r")
                            continue
print(time.process_time() - start)




In [None]:
# the dictionary to pass to pandas dataframe
d = {}

# a counter to use to add entries to "dict"
i = 0 

# Example data to loop and append to a dataframe
data = [{"foo": "foo_val_1", "bar": "bar_val_1"}, 
       {"foo": "foo_val_2", "bar": "bar_val_2"}]

# the loop
for entry in data:

    # add a dictionary entry to the final dictionary
    d[i] = {"col_1_title": entry['foo'], "col_2_title": entry['bar']}
    
    # increment the counter
    i = i + 1

# create the dataframe using 'from_dict'
# important to set the 'orient' parameter to "index" to make the keys as rows
df = DataFrame.from_dict(d, "index")

In [None]:
missing_exams_list = list(missing_exams['id_assessment'])
count = 0
while count < 1:
    for index, row in merged_assessments.iterrows():
        for i, r in assessments.iterrows():
            if assessments['code_module'][i] == merged_assessments['code_module'][index]:
                 if assessments['code_presentation'][i] == merged_assessments['code_presentation'][index]:
                        value_dict = {'code_module':merged_assessments['code_module'][index], 'code_presentation':merged_assessments['code_presentation'][index], 'id_student':merged_assessments['id_student'][index], 'id_assessment':assessments['id_assessment'][i], 'score':merged_assessments['score'][index], 'date_submitted':merged_assessments['date_submitted'][index],'assessment_type':assessments['assessment_type'][i], 'date':assessments['date'][i], 'weight':assessments['weight'][i]}
                        tests_student_has = list(merged_assessments.loc[merged_assessments['id_student'] == merged_assessments['id_student'][index], 'id_assessment'])
                        if assessments['id_assessment'][i] not in (tests_student_has):
                            if assessments['id_assessment'][i] not in (missing_exams_list):
                                print(assessments['id_assessment'][i], merged_assessments['id_student'][index])
                                continue

In [None]:
count = 0
while count < 1:
    for index, row in merged_assessments.iterrows():
        for i, r in assessments.iterrows():
            if assessments['code_module'][i] == merged_assessments['code_module'][index]:
                 if assessments['code_presentation'][i] == merged_assessments['code_presentation'][index]:
                        value_dict = {'code_module':merged_assessments['code_module'][index], 'code_presentation':merged_assessments['code_presentation'][index], 'id_student':merged_assessments['id_student'][index], 'id_assessment':assessments['id_assessment'][i], 'score':merged_assessments['score'][index], 'date_submitted':merged_assessments['date_submitted'][index],'assessment_type':assessments['assessment_type'][i], 'date':assessments['date'][i], 'weight':assessments['weight'][i]}
                        if assessments['id_assessment'][i] not in list(merged_assessments.loc[merged_assessments['id_student'] == merged_assessments['id_student'][index], 'id_assessment']):
                            new_df = new_df.append(value_dict, ignore_index=True)
                            print(f"{len(new_df)}/ rows appended", end="\r")
                            count += 1
                            continue

In [None]:
for index, row in student_info.iterrows():
     for i, r in assessments.iterrows():
            if assessments['code_module'][i] == student_info['code_module'][index]:
                 if assessments['code_presentation'][i] == student_info['code_presentation'][index]:
                        if int(assessments['id_assessment'][i]) not in list(merged_assessments.loc[merged_assessments['id_student'] == merged_assessments['id_student'][index], 'id_assessment']):
                            values = [student_info['code_module'][index], student_info['code_presentation'][index], student_info['id_student'][index], assessments['id_assessment'][i], assessments['assessment_type'][i], assessments['weight'][i]]
                            for m in list(merged_assessments.columns):
                                for n in values:
                                    new_df[]

In [None]:
'''count = 0
while count < 1:
    for index, row in student_info.iterrows():
        for i, r in assessments.iterrows():
            if assessments['code_module'][i] == student_info['code_module'][index]:
                 if assessments['code_presentation'][i] == student_info['code_presentation'][index]:
                        if assessments['id_assessment'][i] not in list(merged_assessments.loc[merged_assessments['id_student'] == merged_assessments['id_student'][index], 'id_assessment']):
                            new_df = new_df.append([[student_info['code_module'][index], student_info['code_presentation'][index], student_info['id_student'][index], assessments['id_assessment'][i], assessments['assessment_type'][i], assessments['weight'][i]]], ignore_index=True)
                            print(f"{len(new_df)}/ rows appended", end="\r")
                            count += 1
                            continue'''

In [None]:
pd.DataFrame(merged_assessments.loc[merged_assessments['id_student'] == 11391, 'id_assessment'])

**Updated Dataframe**

**Size**

In [None]:
md(f'''* Number of Rows: {len(merged_assessments)}
* Number of Columns: {len(merged_assessments.columns)}''')

**Data Types**

In [None]:
merged_assessments.dtypes

* id_student and id_assessments are both categorical values and so should be converted to objects

In [None]:
# converting the data types
merged_assessments = merged_assessments.astype({'id_assessment': int, 'id_student': int})
merged_assessments = merged_assessments.astype({'id_assessment': object, 'id_student': object})

**Null Values**

In [None]:
# prints the sum of a columns null value
merged_assessments.isnull().sum()

* We have 2,873 null data points for assessment date. The documentation of this dataset states that if the exam date is missing then it is as the end of the last presentation week. We can find this information in the courses dataframe.

In [None]:
# adding the dates for the null test dates
for index, row in merged_assessments[merged_assessments['date'].isna()].iterrows():
    merged_assessments.at[index, 'date'] = courses.loc[(courses['code_module'] == row['code_module']) & (courses['code_presentation'] == row['code_presentation']), 'module_presentation_length']

# reprinting to ensure it worked
merged_assessments.isnull().sum()

* There are 173 null values for score. These records are, unfortunately not of much interest to us, since score is what we are trying to find the relationship for, and so we will discard them. This leaves us with no null data in assessments.

In [None]:
# removes any entry where the score is NaN
merged_assessments = merged_assessments.dropna(subset=['score'])

# reprinting to ensure it worked
merged_assessments.isnull().sum()

**Unique Counts**

In [None]:
merged_assessments.nunique()

**Unique Categorical Values**

In [None]:
unique_vals(merged_assessments)

**Duplicate Values:**

In [None]:
duplicate_vals(merged_assessments)

**Statistics**

In [None]:
merged_assessments.describe()

In [None]:
merged_assessments

In [None]:
assessments

In [None]:
merged_assessments.loc[merged_assessments['id_student'] == 11391, 'id_assessment']

if a test id is in assessments in the same code module and presentation as a student is in:
    if the test is already in the dataframe under that student id:
        do nothing
    else:
        add the test with all the same student information, the assessment id, type, and weight to the dataframe

In [None]:
count = 0

# iterate through merged_assessments dataframe
for i, r in assessments.iterrows():
    for index, row in merged_assessments.iterrows():
    # iterate through assessments dataframe
    
        # if the code module in merged_assessments is the same as the code_module in assessments
        # convert to strings to compare
        if str(merged_assessments['code_module'][index]) == str(assessments['code_module'][i]):
            # if the code presentations are also the same
            if str(merged_assessments['code_presentation'][index]) == str(assessments['code_presentation'][i]):
                # if the assessment id is not found under that student append another row with that students information and the test they are missing
                if assessments['id_assessment'][i] not in merged_assessments.loc[merged_assessments['id_student'] == merged_assessments['id_student'][index], 'id_assessment']:
                    merged_assessments = merged_assessments.append([merged_assessments['code_module'][index], merged_assessments['code_presentation'][index], merged_assessments['id_student'][index], merged_assessments['region'][index], merged_assessments['imd_band'][index], merged_assessments['age_band'][index], merged_assessments['gender'][index], merged_assessments['highest_education'][index], merged_assessments['disability'][index], assessments['id_assessment'][i], assessments['assessment_type'][i], assessments['weight'][i]])
                    count += 1
                    print(f"{count} rows appended", end="\r")

In [None]:
aggregates = { 'assessment_type':'first','weight':'sum'}
assessments.groupby(['code_module','code_presentation']).aggregate(aggregates).reset_index()

In [None]:

for index, row in merged_assessments.iterrows():
    for i, r in assessments.iterrows():
        if merged_assessments['code_module'][index] == assessments['code_module'][i]:
            if merged_assessments['code_presentation'][index] == assessments['code_presentation'][i]:
                merged_assessments.append([merged_assessments['code_module'][index], merged_assessments['code_presentation'][index], merged_assessments['id_student'][index], merged_assessments['region'][index], merged_assessments['imd_band'][index], merged_assessments['age_band'][index], merged_assessments['gender'][index], merged_assessments['highest_education'][index], merged_assessments['disability'][index], assessments['id_assessment'][i], assessments['assessment_type'][i], assessments['weight'][i]])
                
                

In [None]:
for index, row in merged_assessments.loc[merged_assessments['final_result'] == 'Withdrawn'].iterrows():
    for i, r in assessments.iterrows():
        if merged_assessments['code_module'][index] == assessments['code_module'][i]:
            if merged_assessments['code_presentation'][index] == assessments['code_presentation'][i]:
                merged_assessments.append([merged_assessments['code_module'][index], merged_assessments['code_presentation'][index], merged_assessments['id_student'][index], merged_assessments['region'][index], merged_assessments['imd_band'][index], merged_assessments['age_band'][index], merged_assessments['gender'][index], merged_assessments['highest_education'][index], merged_assessments['disability'][index], assessments['id_assessment'][i], assessments['assessment_type'][i], assessments['weight'][i]])

In [None]:
pd.concat(x for _, x in merged_assessments.groupby("id_assessment") if len(x) > 1).head()

In [None]:
for i, r in assessments[student_info.loc['final_result'] == 'Withdrawn'].iterrows():
        merged_assessments.append(courses.loc[(courses['code_module'] == row['code_module']) & (courses['code_presentation'] == row['code_presentation']), 'module_presentation_length']

In [None]:
for index, row in student_info.iterrows():
    for i, r in assessments.iterrows():
        if student_info['code_module'][index] == assessments['code_module'][i]:
            if student_info['code_presentation'][index] == assessments['code_presentation'][i]:
                new_df = new_df.append([student_info['code_module'][index], student_info['code_presentation'][index], student_info['id_student'][index], assessments['id_assessment'][i], assessments['assessment_type'][i], assessments['weight'][i]])
                print(f"{len(new_df)}/ rows appended", end="\r")