In [1]:
# Set up
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # plotting
from sklearn import metrics

from sklearn.model_selection import train_test_split # splitting data
from sklearn.metrics import mean_squared_error, r2_score # for evaluating

# Our regressors
import statsmodels.api as sm 
import statsmodels.formula.api as smf # linear modeling (OLS and NBR)
from sklearn.cross_decomposition import PLSRegression #PLS
from sklearn.neighbors import KNeighborsRegressor  #KNN
from sklearn.linear_model import LinearRegression  #OLS Maybe
from sklearn.linear_model import Ridge #RR

# For Feature Selection
from sklearn.model_selection import GridSearchCV     # for grid search
from sklearn.pipeline import make_pipeline           # for making pipelines
from sklearn.preprocessing import MinMaxScaler       # scaling data
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PolynomialFeatures # Add a polynomial transformation to the pipeline

import warnings
# NOTE(review): no category or message is given, so this ignores every warning
# raised after this point; pandas / scikit-learn warnings will not be shown
# anywhere in the notebook.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Loading Data
# All six tables live in the same folder and share the same file-name suffix,
# so both are defined once here instead of being repeated in every path.
DATA_DIR = "data_CCC_2014J"
DATA_SUFFIX = "CCC_2014J"


def _load_table(table_name):
    """Read `<DATA_DIR>/<table_name>_<DATA_SUFFIX>.csv` and return it as a DataFrame."""
    return pd.read_csv("{}/{}_{}.csv".format(DATA_DIR, table_name, DATA_SUFFIX))


assessment = _load_table("assessments")
studentAssessment = _load_table("studentAssessment")
studentInfo = _load_table("studentInfo")
studentVle = _load_table("studentVle")
vle = _load_table("vle")
# Holds the unfiltered registration table at this point; a later cell filters it.
studentRegistration_filtered = _load_table("studentRegistration")

In [3]:
# Keep only the student records whose code_module is 'CCC'.
studentInfo = studentInfo.loc[lambda frame: frame['code_module'] == 'CCC']

In [4]:
# Reshape the long result table into a wide one: one row per student, one column
# per assessment id. pivot_table's default aggregation (mean) is applied whenever
# a student/assessment pair occurs more than once.
assessments_score = studentAssessment.pivot_table(
    values='score',
    index=['id_student'],
    columns='id_assessment',
)
assessments_score.head()

id_assessment,24291,24292,24293,24294,24295,24296,24297,24298,24299
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
23698,94.0,60.0,69.0,67.0,78.0,93.0,73.0,56.0,80.0
25261,,,,,78.0,,,,
27116,90.0,95.0,92.0,84.0,89.0,100.0,98.0,96.0,96.0
28787,,,,,89.0,60.0,54.0,21.0,44.0
28952,52.0,,,,67.0,,,,


In [5]:
# Attach assessment-level attributes to every student result row.
# These four columns are dropped before the join so they are not carried into the
# merged frame; the remaining assessment columns (e.g. `weight`, which the
# weighted-score step reads) are kept.
new_assessment = assessment.drop(
    columns=['code_module', 'code_presentation', 'assessment_type', 'date']
)

# Left join keeps every student result. validate='many_to_one' makes pandas raise
# a MergeError if an id_assessment appears more than once in `new_assessment`,
# which would otherwise silently duplicate result rows.
new_student_assessment = studentAssessment.merge(
    new_assessment,
    on='id_assessment',
    how='left',
    validate='many_to_one',
)

In [6]:
# Add score * weight as `weighted_score`. `assign` builds a new frame rather than
# inserting a column into the frame created by the previous cell, so re-running
# this cell never depends on partially mutated state.
# NOTE(review): the product is not divided by 100 (e.g. 94 * 9 = 846 in the output
# below); if `weight` is a percentage this is 100x a true weighted score - confirm
# that the scale is intended.
new_student_assessment = new_student_assessment.assign(
    weighted_score=lambda frame: frame['score'] * frame['weight']
)

# Wide table: one row per student, with a (measure, id_assessment) column for both
# the raw and the weighted score. Replaces the score-only pivot built earlier.
assessments_score = new_student_assessment.pivot_table(
    values=['weighted_score', 'score'],
    index=['id_student'],
    columns='id_assessment',
)
assessments_score.head()

Unnamed: 0_level_0,score,score,score,score,score,score,score,score,score,weighted_score,weighted_score,weighted_score,weighted_score,weighted_score,weighted_score,weighted_score,weighted_score,weighted_score
id_assessment,24291,24292,24293,24294,24295,24296,24297,24298,24299,24291,24292,24293,24294,24295,24296,24297,24298,24299
id_student,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
23698,94.0,60.0,69.0,67.0,78.0,93.0,73.0,56.0,80.0,846.0,1320.0,1518.0,1474.0,156.0,651.0,584.0,448.0,8000.0
25261,,,,,78.0,,,,,,,,,156.0,,,,
27116,90.0,95.0,92.0,84.0,89.0,100.0,98.0,96.0,96.0,810.0,2090.0,2024.0,1848.0,178.0,700.0,784.0,768.0,9600.0
28787,,,,,89.0,60.0,54.0,21.0,44.0,,,,,178.0,420.0,432.0,168.0,4400.0
28952,52.0,,,,67.0,,,,,468.0,,,,134.0,,,,


In [7]:
# `assessments_score` has two-level columns (measure, id_assessment) while
# `studentInfo` has ordinary single-level columns. Merging frames with different
# column depths only emitted a warning in older pandas (hidden here by the global
# warning filter) and is rejected outright by newer releases. Flattening the
# columns to plain tuples first gives the same labels the merge used to produce,
# e.g. ('score', 24291), and works regardless of pandas version.
assessments_score_flat = assessments_score.copy()
assessments_score_flat.columns = assessments_score_flat.columns.to_flat_index()

# Inner join (the default): only students present in both tables are kept.
# 'id_student' is a column on the left and the index name on the right.
mergedScores = studentInfo.merge(assessments_score_flat, on='id_student')
mergedScores.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,...,"(score, 24299)","(weighted_score, 24291)","(weighted_score, 24292)","(weighted_score, 24293)","(weighted_score, 24294)","(weighted_score, 24295)","(weighted_score, 24296)","(weighted_score, 24297)","(weighted_score, 24298)","(weighted_score, 24299)"
0,CCC,2014J,23698,F,East Anglian Region,A Level or Equivalent,50-60%,0-35,0,120,...,80.0,846.0,1320.0,1518.0,1474.0,156.0,651.0,584.0,448.0,8000.0
1,CCC,2014J,25261,F,Scotland,HE Qualification,40-50%,0-35,0,60,...,,,,,,156.0,,,,
2,CCC,2014J,27116,M,Yorkshire Region,HE Qualification,70-80%,0-35,0,120,...,96.0,810.0,2090.0,2024.0,1848.0,178.0,700.0,784.0,768.0,9600.0
3,CCC,2014J,28787,M,South East Region,Lower Than A Level,80-90%,35-55,0,60,...,44.0,,,,,178.0,420.0,432.0,168.0,4400.0
4,CCC,2014J,28952,F,Yorkshire Region,Lower Than A Level,60-70%,0-35,0,60,...,,468.0,,,,134.0,,,,


In [8]:
# Preview the first five rows of the per-student VLE click records.
studentVle.head(n=5)

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click
0,CCC,2014J,582530,909232,-18,5
1,CCC,2014J,582530,909230,-18,1
2,CCC,2014J,582530,909071,-18,7
3,CCC,2014J,582530,909231,-18,1
4,CCC,2014J,582530,909013,-18,10


In [9]:
# Label every click record with the attributes (notably activity_type) of the
# site it refers to; the left join keeps click rows even if the site is unknown.
new_student_vle = studentVle.merge(vle, on='id_site', how='left')

# Total the numeric columns per student and activity type. numeric_only=True is
# explicit because the merged frame also carries string columns (code_module,
# code_presentation): older pandas dropped them silently, while pandas >= 2.0
# would try to "sum" them as well.
new_student_vle = (
    new_student_vle
    .groupby(['id_student', 'activity_type'])
    .sum(numeric_only=True)
)

# Spread activity types into columns. Each (student, activity) pair is a single
# row after the groupby, so pivot_table's default mean leaves the totals unchanged.
activity_clicks = new_student_vle.pivot_table(
    values='sum_click',
    index=['id_student'],
    columns='activity_type',
)
activity_clicks.head()

activity_type,forumng,homepage,oucollaborate,oucontent,page,quiz,resource,subpage,url
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
23698,63.0,121.0,,4.0,1.0,576.0,42.0,98.0,5.0
25261,506.0,174.0,,43.0,,117.0,36.0,118.0,3.0
27116,407.0,401.0,52.0,269.0,2.0,448.0,104.0,158.0,6.0
28787,123.0,144.0,,10.0,4.0,254.0,25.0,82.0,3.0
28952,6.0,21.0,,2.0,,59.0,2.0,12.0,3.0


In [14]:
# Add the per-activity click totals to the student/score table. A left join keeps
# every student from `mergedScores`, leaving NaN where no click data exists.
mergedActivity = pd.merge(
    mergedScores,
    activity_clicks,
    how='left',
    on='id_student',
)
mergedActivity.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,...,"(weighted_score, 24299)",forumng,homepage,oucollaborate,oucontent,page,quiz,resource,subpage,url
0,CCC,2014J,23698,F,East Anglian Region,A Level or Equivalent,50-60%,0-35,0,120,...,8000.0,63.0,121.0,,4.0,1.0,576.0,42.0,98.0,5.0
1,CCC,2014J,25261,F,Scotland,HE Qualification,40-50%,0-35,0,60,...,,506.0,174.0,,43.0,,117.0,36.0,118.0,3.0
2,CCC,2014J,27116,M,Yorkshire Region,HE Qualification,70-80%,0-35,0,120,...,9600.0,407.0,401.0,52.0,269.0,2.0,448.0,104.0,158.0,6.0
3,CCC,2014J,28787,M,South East Region,Lower Than A Level,80-90%,35-55,0,60,...,4400.0,123.0,144.0,,10.0,4.0,254.0,25.0,82.0,3.0
4,CCC,2014J,28952,F,Yorkshire Region,Lower Than A Level,60-70%,0-35,0,60,...,,6.0,21.0,,2.0,,59.0,2.0,12.0,3.0


In [11]:
# Value substituted for a missing date_unregistration.
# NOTE(review): presumably the length of the presentation in days, i.e. "never
# unregistered" - confirm against the course metadata.
UNREGISTRATION_FILL_DAY = 269

# Built as a single chain so that no column is assigned on a filtered slice (the
# original pattern triggers pandas' SettingWithCopyWarning, which the global
# warning filter was hiding):
#   1. keep only rows of module 'CCC'
#   2. fill missing unregistration dates with the constant above
#   3. drop the module / presentation identifier columns
studentRegistration_filtered = (
    studentRegistration_filtered
    .loc[lambda frame: frame['code_module'] == 'CCC']
    .assign(
        date_unregistration=lambda frame: frame['date_unregistration'].fillna(
            UNREGISTRATION_FILL_DAY
        )
    )
    .drop(columns=['code_module', 'code_presentation'])
)

studentRegistration_filtered.head()

Unnamed: 0,id_student,date_registration,date_unregistration
0,23698,-110.0,269.0
1,25261,-114.0,51.0
2,27116,-156.0,269.0
3,28787,-29.0,269.0
4,28952,-50.0,12.0


In [15]:
# Join the registration dates onto the combined student/score/activity table.
# Inner join (the default): students without a registration row are dropped.
mergedResgistration = pd.merge(
    mergedActivity,
    studentRegistration_filtered,
    on='id_student',
)
mergedResgistration.columns

Index([            'code_module',       'code_presentation',
                    'id_student',                  'gender',
                        'region',       'highest_education',
                      'imd_band',                'age_band',
          'num_of_prev_attempts',         'studied_credits',
                    'disability',            'final_result',
                ('score', 24291),          ('score', 24292),
                ('score', 24293),          ('score', 24294),
                ('score', 24295),          ('score', 24296),
                ('score', 24297),          ('score', 24298),
                ('score', 24299), ('weighted_score', 24291),
       ('weighted_score', 24292), ('weighted_score', 24293),
       ('weighted_score', 24294), ('weighted_score', 24295),
       ('weighted_score', 24296), ('weighted_score', 24297),
       ('weighted_score', 24298), ('weighted_score', 24299),
                       'forumng',                'homepage',
                 'oucoll