# [CSEDM Data Challenge](https://sites.google.com/ncsu.edu/csedm-dc-2021/home)

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import os
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# `knc` is used by later cells with n_neighbors=/weights= arguments
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

In [None]:
#@title ProgSnap Code
progsnap = "" #@param {type:"string"}
import pandas as pd
import os
from os import path


class PS2:
    """ A class holding constants used to get columns of a PS2 dataset

    Every attribute is a string equal to its own name; the first group names
    main-table columns, the second names properties of the metadata table.
    """

    # --- Main table columns ---
    Order = 'Order'
    SubjectID = 'SubjectID'
    ToolInstances = 'ToolInstances'
    ServerTimestamp = 'ServerTimestamp'
    ServerTimezone = 'ServerTimezone'
    CourseID = 'CourseID'
    CourseSectionID = 'CourseSectionID'
    AssignmentID = 'AssignmentID'
    ProblemID = 'ProblemID'
    Attempt = 'Attempt'
    CodeStateID = 'CodeStateID'
    EventType = 'EventType'
    Score = 'Score'
    # NOTE(review): other cells in this notebook select the column as
    # "Compile.Result" (with a dot) -- confirm which spelling the data uses.
    CompileResult = 'CompileResult'
    CompileMessageType = 'CompileMessageType'
    CompileMessageData = 'CompileMessageData'
    EventID = 'EventID'
    ParentEventID = 'ParentEventID'
    SourceLocation = 'SourceLocation'
    Code = 'Code'

    # --- Metadata table properties ---
    Version = 'Version'
    # Previously declared twice with the same value; one definition is enough.
    IsEventOrderingConsistent = 'IsEventOrderingConsistent'
    EventOrderScope = 'EventOrderScope'
    EventOrderScopeColumns = 'EventOrderScopeColumns'
    CodeStateRepresentation = 'CodeStateRepresentation'


class ProgSnap2Dataset:
    """ Reader for a ProgSnap2 dataset stored as CSV files under one directory.

    Tables are read from disk on first use and cached on the instance. The
    public getters return copies, so callers cannot modify the cached tables.
    """

    # File layout, relative to the dataset directory
    MAIN_TABLE_FILE = 'MainTable.csv'
    METADATA_TABLE_FILE = 'DatasetMetadata.csv'
    LINK_TABLE_DIR = 'LinkTables'
    CODE_STATES_DIR = 'CodeStates'
    CODE_STATES_TABLE_FILE = os.path.join(CODE_STATES_DIR, 'CodeStates.csv')

    def __init__(self, directory):
        """
        :param directory: The root directory of the dataset
        """
        self.directory = directory
        self.main_table = None
        self.metadata_table = None
        self.code_states_table = None

    def path(self, local_path):
        """ Returns the given dataset-relative path joined onto the dataset directory
        """
        return os.path.join(self.directory, local_path)

    @staticmethod
    def __is_true(value):
        """ Interprets a metadata value as a boolean.

        Values read from the metadata CSV may arrive as text, and the text
        'False' would otherwise count as truthy.
        """
        if isinstance(value, str):
            return value.strip().lower() in ('true', '1', 'yes')
        if value is None or pd.isna(value):
            return False
        return bool(value)

    def __load_metadata_table(self):
        """ Returns the cached metadata table (not a copy), reading it on first use
        """
        if self.metadata_table is None:
            self.metadata_table = pd.read_csv(self.path(ProgSnap2Dataset.METADATA_TABLE_FILE))
        return self.metadata_table

    def __load_code_states_table(self):
        """ Returns the cached code states table (not a copy), reading it on first use
        """
        if self.code_states_table is None:
            self.code_states_table = pd.read_csv(self.path(ProgSnap2Dataset.CODE_STATES_TABLE_FILE))
        return self.code_states_table

    def __load_main_table(self):
        """ Returns the cached main table (not a copy), reading and ordering it on first use
        """
        if self.main_table is not None:
            return self.main_table

        main_table = pd.read_csv(self.path(ProgSnap2Dataset.MAIN_TABLE_FILE))
        if ProgSnap2Dataset.__is_true(self.get_metadata_property(PS2.IsEventOrderingConsistent)):
            order_scope = self.get_metadata_property(PS2.EventOrderScope)
            if order_scope == 'Global':
                # If the table is globally ordered, sort it
                main_table = main_table.sort_values(by=[PS2.Order])
            elif order_scope == 'Restricted':
                # If restricted ordered, sort first by grouping columns, then by order
                order_columns = self.get_metadata_property(PS2.EventOrderScopeColumns)
                if not isinstance(order_columns, str) or len(order_columns.strip()) == 0:
                    raise Exception('EventOrderScope is restricted but no EventOrderScopeColumns given')
                # The result is that _within_ these groups, events are ordered
                main_table = main_table.sort_values(by=order_columns.split(';') + [PS2.Order])
        self.main_table = main_table
        return self.main_table

    def get_main_table(self):
        """ Returns a Pandas DataFrame with the main event table for this dataset
        """
        return self.__load_main_table().copy()

    def set_main_table(self, main_table):
        """ Overwrites the main table loaded from the file with the provided table.
        This table will be used for future operations, including copying the dataset.
        """
        self.main_table = main_table.copy()

    def get_code_states_table(self):
        """ Returns a Pandas DataFrame with the code states table from this dataset
        """
        return self.__load_code_states_table().copy()

    def get_metadata_property(self, property):
        """ Returns the value of a given metadata property in the metadata table
        :param property: The property name, e.g. one of the PS2 metadata constants
        :raises Exception: if the property is listed more than once
        """
        metadata = self.__load_metadata_table()
        values = metadata[metadata['Property'] == property]['Value']
        if len(values) == 1:
            return values.iloc[0]
        if len(values) > 1:
            raise Exception('Multiple values for property: ' + property)

        # Default return values as of V6
        if property == PS2.IsEventOrderingConsistent:
            return False
        if property == PS2.EventOrderScope:
            return 'None'
        if property == PS2.EventOrderScopeColumns:
            return ''

        return None

    def __link_table_path(self):
        return self.path(ProgSnap2Dataset.LINK_TABLE_DIR)

    def list_link_tables(self):
        """ Returns a list of the link tables in this dataset, which can be loaded with load_link_table.
        The list is empty when the dataset has no link table directory.
        """
        link_dir = self.__link_table_path()
        if not os.path.isdir(link_dir):
            return []
        return [f for f in os.listdir(link_dir)
                if os.path.isfile(os.path.join(link_dir, f)) and f.endswith('.csv')]

    def load_link_table(self, link_table):
        """ Returns a Pandas DataFrame with the link table with the given name
        :param link_table: The link table name or file
        """
        if not link_table.endswith('.csv'):
            link_table += '.csv'
        return pd.read_csv(os.path.join(self.__link_table_path(), link_table))

    def drop_main_table_column(self, column):
        """ Removes the given column from the cached main table
        """
        self.main_table = self.__load_main_table().drop(column, axis=1)

    def save_subset(self, path, main_table_filterer, copy_link_tables=True):
        """ Writes a filtered copy of this dataset to another directory
        :param path: The directory to write to (created if needed)
        :param main_table_filterer: A function that takes the main table and returns the rows to keep
        :param copy_link_tables: If True, link tables are written too, keeping only
            rows whose ID columns still occur in the filtered main table
        """
        os.makedirs(os.path.join(path, ProgSnap2Dataset.CODE_STATES_DIR), exist_ok=True)
        main_table = main_table_filterer(self.get_main_table())
        main_table.to_csv(os.path.join(path, ProgSnap2Dataset.MAIN_TABLE_FILE), index=False)

        # Keep only the code states still referenced by the filtered events
        code_state_ids = main_table[PS2.CodeStateID].unique()
        code_states = self.__load_code_states_table()
        code_states = code_states[code_states[PS2.CodeStateID].isin(code_state_ids)]
        code_states.to_csv(os.path.join(path, ProgSnap2Dataset.CODE_STATES_TABLE_FILE), index=False)

        # Load explicitly: the metadata may not have been read yet when the
        # main table was supplied through set_main_table
        self.__load_metadata_table().to_csv(
            os.path.join(path, ProgSnap2Dataset.METADATA_TABLE_FILE), index=False)

        if not copy_link_tables:
            return

        os.makedirs(os.path.join(path, ProgSnap2Dataset.LINK_TABLE_DIR), exist_ok=True)

        for link_table_name in self.list_link_tables():
            link_table = self.load_link_table(link_table_name)
            columns = [col for col in link_table.columns if col.endswith('ID') and col in main_table.columns]
            if columns:
                # Tuples of ID values work the same for one or several ID columns
                kept_ids = set(main_table[columns].itertuples(index=False, name=None))
                keep_mask = pd.Series(
                    [ids in kept_ids for ids in link_table[columns].itertuples(index=False, name=None)],
                    index=link_table.index, dtype=bool)
                link_table = link_table[keep_mask]
            # With no ID column in common there is nothing to filter on, so the
            # link table is written unchanged
            link_table.to_csv(os.path.join(path, ProgSnap2Dataset.LINK_TABLE_DIR, link_table_name), index=False)

    @staticmethod
    def __to_one(lst, error):
        """ Returns the only element of a Series, or None if it is empty
        :raises Exception: with the given message if there is more than one element
        """
        if len(lst) == 0:
            return None
        if len(lst) > 1:
            raise Exception(error or 'Should have only one result!')
        return lst.iloc[0]

    def get_code_for_id(self, code_state_id):
        """ Returns the code stored for the given CodeStateID, or None if there is none
        """
        if code_state_id is None:
            return None
        # Read-only lookup, so the cached table is used without copying it
        code_states = self.__load_code_states_table()
        code = code_states[code_states[PS2.CodeStateID] == code_state_id][PS2.Code]
        return ProgSnap2Dataset.__to_one(code, 'Multiple code states match that ID.')

    def get_code_for_event_id(self, row_id):
        """ Returns the code attached to the event with the given EventID, or None if there is none
        """
        events = self.__load_main_table()
        # Compare the EventID column (not the column name) against row_id
        code_state_ids = events[events[PS2.EventID] == row_id][PS2.CodeStateID]
        code_state_id = ProgSnap2Dataset.__to_one(code_state_ids, 'Multiple rows match that ID.')
        return self.get_code_for_id(code_state_id)

    def get_subject_ids(self):
        """ Returns the distinct SubjectIDs in the main table
        """
        return self.__load_main_table()[PS2.SubjectID].unique()

    def get_problem_ids(self):
        """ Returns the distinct ProblemIDs in the main table
        """
        return self.__load_main_table()[PS2.ProblemID].unique()

    def get_trace(self, subject_id, problem_id):
        """ Returns the code of each distinct code state a subject produced on a problem,
        in main-table order
        """
        events = self.__load_main_table()
        rows = events[(events[PS2.SubjectID] == subject_id) & (events[PS2.ProblemID] == problem_id)]
        ids = rows[PS2.CodeStateID].unique()
        return [self.get_code_for_id(code_state_id) for code_state_id in ids]


# Smoke run of ProgSnap2Dataset. In a notebook kernel __name__ is '__main__',
# so this executes every time the cell runs and writes to data/test/CopyA.
if __name__ == '__main__':
  data = ProgSnap2Dataset('/content/drive/Shareddrives/Learning Analytics/data/Release/S19/Train/Data/')
  # Example of printing one subject's trace on one problem:
  #   for code in data.get_trace('4d230b683bf9840553ae57f4acc96e81', 32):
  #     print(code)
  #     print('-------')

  # Keep only the events of subjects whose ID starts with 'a'
  data.save_subset('data/test/CopyA', lambda df: df[df[PS2.SubjectID].str.startswith('a')])


In [None]:
# Root folder of the S19 training release on the mounted drive
TRAIN_PATH = '/content/drive/Shareddrives/Learning Analytics/data/Release/S19/Train'
# ProgSnap2 event data for the training release (the 'Data' subfolder)
train_ps2 = ProgSnap2Dataset(os.path.join(TRAIN_PATH, 'Data')) 

## Error Quotient code AND Score Regression Feature

In [None]:
from enum import Enum
from sklearn.linear_model import LinearRegression


# (substrings, code) pairs, checked top to bottom; the first entry with any
# substring occurring in the message wins, so specific patterns must come
# before the generic ones that contain them.
_ERROR_TYPE_PATTERNS = (
  (("cannot find symbol: variable",), 1),
  (("';' expected",), 2),
  # These two must precede the generic "expected" entry, which would
  # otherwise swallow them and leave codes 8 and 9 unreachable.
  (("<identifier> expected",), 8),
  (("class, interface, or enum expected",), 9),
  # Any other "... expected" message: '(' ')' '[' ']' '{' '}' and so on
  (("expected",), 3),
  (("missing return statement",), 4),
  # "cannot find symbol: method" and any other non-variable symbol
  (("cannot find symbol",), 5),
  (("illegal start of",), 6),
  (("incompatible types",), 7),
  (("'else' without 'if'",), 10),
  (("bad operand",), 11),
  (("cannot be dereferenced",), 12),
  (("incomparable types",), 13),
  # "illegal character" and any other "illegal ..." not matched above
  (("illegal",), 14),
  (("not a statement",), 15),
  (("might not have been initialized",), 16),
  (("unreachable statement",), 17),
  (("no suitable method found",), 18),
  (("reached end of file while parsing",), 19),
  (("unclosed", "literal"), 20),
  (("is already defined",), 21),
  (("empty statement after if",), 22),
  (("variable declaration not allowed here",), 23),
  (("array required",), 24),
  (("invalid method declaration",), 25),
  (("not applicable",), 26),
  (("cannot be applied", "cannot assign"), 27),
  (("no suitable constructor",), 28),
  (("cannot be referenced",), 29),
  (("bad initializer",), 30),
  (("does not exist",), 31),
)


def get_error_type(error):
  """Map a compiler error message to an integer error category.

  Parameters:
    error: the compiler message text. Anything that is not a string
      (for example a missing value read as NaN) is treated as unknown.

  Returns:
    The category code (1-31) of the first matching pattern in
    _ERROR_TYPE_PATTERNS, or -1 when nothing matches.
  """
  if not isinstance(error, str):
    return -1
  for substrings, code in _ERROR_TYPE_PATTERNS:
    if any(fragment in error for fragment in substrings):
      return code
  return -1


# Event tables used by the Error Quotient (Jadud's algorithm) and
# score-regression features below.

main_table = train_ps2.get_main_table()

# One row per "Run.Program" event, with the score it received
regression_score =  main_table[["SubjectID", "ServerTimestamp","ProblemID","EventType","Score","Compile.Result", "CompileMessageType", "CompileMessageData"]]
regression_score = regression_score[regression_score.EventType == "Run.Program"]

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of main_table (the SettingWithCopyWarning case)
eq_algo = main_table[["SubjectID", "ServerTimestamp","ProblemID","EventType", "Compile.Result", "CompileMessageType", "CompileMessageData"]].copy()
# Pull the message columns up by one row, so each row carries the message
# recorded on the row that follows it
eq_algo[["CompileMessageData", "CompileMessageType"]] = eq_algo[["CompileMessageData", "CompileMessageType"]].shift(-1)
# One row per "Compile" event
eq_algo = eq_algo[eq_algo.EventType == "Compile"]

def make_pairs(error_arr):
  """Return every pair of neighbouring items as a two-element list.

  A sequence of n items gives n - 1 pairs: [[e0, e1], [e1, e2], ...].
  Sequences with fewer than two items give an empty list.
  """
  return [[earlier, later] for earlier, later in zip(error_arr, error_arr[1:])]
  

import math
from math import nan
def get_eq_score(c_r, c_m, eq_1, eq_2):
  """Error Quotient of one sequence of compile events.

  Each pair of consecutive compiles is scored: eq_1 if both ended in
  "Error", plus eq_2 if both messages also fall in the same
  get_error_type category. The pair score is divided by (eq_1 + eq_2)
  and the result is averaged over all pairs.

  Parameters:
    c_r: compile results in order; the value "Error" marks a failure.
    c_m: compile messages, aligned with c_r.
    eq_1: weight for two failures in a row.
    eq_2: extra weight when both failures share an error category.

  Returns:
    The mean normalised pair score, or 0.0 when there are fewer than two
    compiles (no pairs to score).
  """
  c_r_pair = make_pairs(c_r)
  c_m_pair = make_pairs(c_m)
  # No consecutive pair exists, so there is nothing to average over
  if len(c_r_pair) == 0:
    return 0.0

  weight_total = eq_1 + eq_2
  tot = 0.0
  for i, (first_result, second_result) in enumerate(c_r_pair):
    if first_result == "Error" and second_result == "Error":
      curr_eq = eq_1
      if get_error_type(c_m_pair[i][0]) == get_error_type(c_m_pair[i][1]):
        curr_eq = curr_eq + eq_2
      tot = tot + (curr_eq / weight_total)

  return tot / len(c_r_pair)
  
def get_eq_scores(df, eq_1, eq_2, events=None):
  """Compute an Error Quotient for every (SubjectID, ProblemID) row of df.

  The scores are stored in df as column "EQ" (in place). An existing "EQ"
  column is replaced, so calling this again with other weights does not
  pile up duplicate columns.

  Parameters:
    df: frame with "SubjectID" and "ProblemID" columns; modified in place.
    eq_1, eq_2: weights passed on to get_eq_score.
    events: compile-event frame with "SubjectID", "ProblemID",
      "Compile.Result" and "CompileMessageData" columns. Defaults to the
      notebook-level eq_algo table.

  Returns:
    The list of scores, one per row of df. Rows with fewer than two
    compile events get 0.0.
  """
  if events is None:
    events = eq_algo

  # Group the events once instead of rescanning the whole table per row
  compiles_by_key = {
      key: group
      for key, group in events.groupby(["SubjectID", "ProblemID"], sort=False)
  }

  EQ_scores = []
  for subject_id, problem_id in zip(df["SubjectID"], df["ProblemID"]):
    compiles = compiles_by_key.get((subject_id, problem_id))
    if compiles is None or len(compiles) < 2:
      EQ_scores.append(0.0)
    else:
      EQ_scores.append(get_eq_score(compiles["Compile.Result"].tolist(),
                                    compiles["CompileMessageData"].tolist(),
                                    eq_1, eq_2))

  # Drop any earlier "EQ" column(s) first so exactly one remains afterwards
  df.drop(columns=["EQ"], inplace=True, errors="ignore")
  df.insert(min(3, df.shape[1]), "EQ", EQ_scores)
  return EQ_scores


def get_score_regression(df, events=None):
  """Compute a score-trend feature for every (SubjectID, ProblemID) row of df.

  For each row the scores of the matching run events are taken in table
  order. The feature is 0.0 with no runs, the score itself with one run,
  and otherwise the R^2 of a straight line fitted to score against run
  index. The values are stored in df as column "rscores" (in place); an
  existing "rscores" column is replaced rather than duplicated.

  Parameters:
    df: frame with "SubjectID" and "ProblemID" columns; modified in place.
    events: run-event frame with "SubjectID", "ProblemID" and "Score"
      columns. Defaults to the notebook-level regression_score table.

  Returns:
    The list of feature values, one per row of df.
  """
  if events is None:
    events = regression_score

  # Group the events once instead of rescanning the whole table per row
  scores_by_key = {
      key: group["Score"].to_numpy()
      for key, group in events.groupby(["SubjectID", "ProblemID"], sort=False)
  }

  regression_scores = []
  for subject_id, problem_id in zip(df["SubjectID"], df["ProblemID"]):
    scores = scores_by_key.get((subject_id, problem_id), ())
    if len(scores) == 0:
      regression_scores.append(0.0)
    elif len(scores) == 1:
      regression_scores.append(scores[0])
    else:
      # Regress score on the 0-based run index and keep the fit quality
      x = np.arange(0, len(scores), 1).reshape(-1, 1)
      lmodel = LinearRegression().fit(x, scores)
      regression_scores.append(lmodel.score(x, scores))

  # Drop any earlier "rscores" column(s) first so exactly one remains afterwards
  df.drop(columns=["rscores"], inplace=True, errors="ignore")
  df.insert(min(3, df.shape[1]), "rscores", regression_scores)
  return regression_scores

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [None]:
# Summary table for the early problems of the training release
# (presumably one row per student and problem -- confirm against early.csv)
early_train = pd.read_csv(os.path.join(TRAIN_PATH, 'early.csv'))
# Inserts an "EQ" column into early_train in place (weights eq_1=1, eq_2=3)
get_eq_scores(early_train, 1, 3)
# get_score_regression(early_train)
# Rows to predict for the later problems; the target is the 'Label' column
late_train = pd.read_csv(os.path.join(TRAIN_PATH, 'late.csv'))

# Features are everything except the label
X_train_base = late_train.copy().drop('Label', axis=1)
y_train = late_train['Label'].values
# One-hot encoder over ProblemID, fitted on the late training rows; extract_features reads it as a global
problem_encoder = OneHotEncoder().fit(X_train_base[PS2.ProblemID].values.reshape(-1, 1))
# Shown as the cell output only; the encoded matrix is not stored
problem_encoder.transform(X_train_base[PS2.ProblemID].values.reshape(-1, 1)).toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

# Task 1
In this task, we do per-problem prediction, extracting features from performance on the 30 early problems for a given student to predict performance on each of 20 later problems. Our model should, in effect, learn the relationship between the knowledge practiced in these problems (though our naive example here won't get that far).

In [None]:
def extract_instance_features(instance, early_df):
    """Add student-level summary features to one prediction row.

    The features describe the student's early problems only and do not
    depend on the problem being predicted.

    :param instance: one row (Series) holding at least the SubjectID
    :param early_df: early-problem table with 'CorrectEventually',
        'Attempts' and 'EQ' columns
    :return: a copy of the row with the features added and SubjectID removed
    """
    student_id = instance[PS2.SubjectID]
    history = early_df[early_df[PS2.SubjectID] == student_id]

    enriched = instance.copy()
    # How many early problems the student has a row for
    enriched['ProblemsAttempted'] = history.shape[0]
    # Share of early problems that were eventually solved
    enriched['PercCorrectEventually'] = np.mean(history['CorrectEventually'])

    attempts = history['Attempts']
    # Typical and worst-case number of attempts
    enriched['MedAttempts'] = np.median(attempts)
    enriched['MaxAttempts'] = np.max(attempts)
    # Share of early problems with exactly one attempt
    enriched['PercCorrectFirstTry'] = np.mean(attempts == 1)

    # Average Error Quotient over the early problems
    enriched['EQ'] = np.mean(history['EQ'])

    return enriched.drop('SubjectID')
def extract_features(X, early_df, scaler, is_train):
    """Build the model input matrix for a table of prediction rows.

    :param X: rows to predict, with SubjectID, AssignmentID and ProblemID
    :param early_df: early-problem table handed to extract_instance_features
    :param scaler: scaler for the continuous features; fitted here when
        is_train is true, otherwise only applied
    :param is_train: whether X is the training set
    :return: array with the scaled continuous features followed by the
        one-hot encoded ProblemID (from the global problem_encoder)
    """
    # Student-level performance features, one row per input row
    per_row = X.apply(lambda row: extract_instance_features(row, early_df), axis=1)

    # One-hot encode the problem being predicted
    problem_column = per_row[PS2.ProblemID].values.reshape(-1, 1)
    one_hot = problem_encoder.transform(problem_column).toarray()

    # The nominal ID columns are not continuous features
    continuous = per_row.drop([PS2.AssignmentID, PS2.ProblemID], axis=1)

    if is_train:
        scaler.fit(continuous)
    scaled = scaler.transform(continuous)

    return np.concatenate([scaled, one_hot], axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler
# The scaler is fitted here (is_train=True) and reused unchanged for the test features
scaler = StandardScaler()
X_train = extract_features(X_train_base, early_train, scaler, True)

## Evaluate the Training Performance of the Model

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Baseline model; the predictions below are on the same data it was fitted on
model = LogisticRegressionCV()
model.fit(X_train, y_train)
train_predictions = model.predict(X_train)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# ROC AUC ranks examples by score, so it needs the predicted probability of
# the positive class; hard True/False predictions give only one threshold.
train_probabilities = model.predict_proba(X_train)[:, 1]

print(classification_report(y_train, train_predictions))
print('AUC: ' + str(roc_auc_score(y_train, train_probabilities)))
print('Macro F1: ' + str(f1_score(y_train, train_predictions, average='macro')))

              precision    recall  f1-score   support

       False       0.58      0.27      0.37      1084
        True       0.79      0.93      0.85      3117

    accuracy                           0.76      4201
   macro avg       0.68      0.60      0.61      4201
weighted avg       0.73      0.76      0.73      4201

AUC: 0.6023439192524745
Macro F1: 0.6122778887857789


In [None]:
from sklearn.model_selection import cross_validate

# model = LogisticRegressionCV()
# NOTE(review): softCL is first assigned in the "Voting Classifiers" section
# further down; run top-to-bottom on a fresh kernel this line raises NameError.
model = softCL
# 10-fold cross-validation on the training features
cv_results = cross_validate(model, X_train, y_train, cv=10, scoring=['accuracy', 'f1_macro', 'roc_auc'])
print(f'Accuracy: {np.mean(cv_results["test_accuracy"])}')
print(f'AUC: {np.mean(cv_results["test_roc_auc"])}')
print(f'Macro F1: {np.mean(cv_results["test_f1_macro"])}')

### Testing Sensitivity for EQ algorithm

In [None]:
from sklearn.model_selection import cross_validate

# Sweep the two Error Quotient weights and record the cross-validated AUC of
# the baseline model for each (x, y) pair.
#
# The sweep works on its own scaler, feature matrix and copy of early_train,
# so the notebook-level scaler / X_train / early_train keep the values the
# later prediction cells rely on.
sens_scaler = StandardScaler()
sens_X_base = late_train.copy().drop('Label', axis=1)

eq_scores = []
for x in range(1, 10):
  row = []
  for y in range(1, 10):
    # Fresh copy without any previous EQ column, then recompute EQ with (x, y)
    early_sens = early_train.drop(columns=["EQ"], errors="ignore")
    get_eq_scores(early_sens, x, y)

    X_sens = extract_features(sens_X_base, early_sens, sens_scaler, True)
    sens_model = LogisticRegressionCV()
    cv_results = cross_validate(sens_model, X_sens, y_train, cv=5, scoring=['accuracy', 'f1_macro', 'roc_auc'])

    result = np.mean(cv_results["test_roc_auc"])
    print(f"This is x: {x} This is y: {y}, this is score {result}")

    row.append(result)

  # One row of results per value of x (must be inside the outer loop)
  eq_scores.append(row)

print(eq_scores)

In [None]:
np.array(eq_scores)

array([[0.60258424, 0.60336632, 0.60304549, 0.6032059 , 0.60366716,
        0.603968  , 0.60412841, 0.60412841, 0.60426885, 0.60378762]])

In [None]:
import pickle as pkl
# Save the sensitivity grid; the with-block closes the file once it is written
with open('3darray.pkl', 'wb') as pkl_file:
    pkl.dump(eq_scores, pkl_file)

### Predict on the test data for the next semester (F19)

---



In [None]:
F19_TEST_PATH = '/content/drive/Shareddrives/Learning Analytics/data/Release/F19/Test'


# From here on the event tables must describe the F19 test release.
# (The name train_ps2 is kept because other cells refer to it.)
train_ps2 = ProgSnap2Dataset(os.path.join(F19_TEST_PATH, 'Data'))

# Rebuild the event tables that get_eq_scores / get_score_regression read, so
# the notebook runs top-to-bottom without re-running the EQ cell by hand.
main_table = train_ps2.get_main_table()

regression_score =  main_table[["SubjectID", "ServerTimestamp","ProblemID","EventType","Score","Compile.Result", "CompileMessageType", "CompileMessageData"]]
regression_score = regression_score[regression_score.EventType == "Run.Program"]

eq_algo = main_table[["SubjectID", "ServerTimestamp","ProblemID","EventType", "Compile.Result", "CompileMessageType", "CompileMessageData"]].copy()
# Each row takes the message recorded on the row that follows it
eq_algo[["CompileMessageData", "CompileMessageType"]] = eq_algo[["CompileMessageData", "CompileMessageType"]].shift(-1)
eq_algo = eq_algo[eq_algo.EventType == "Compile"]

In [None]:
# Early-problem summary table for the F19 test release
early_test = pd.read_csv(os.path.join(F19_TEST_PATH, 'early.csv'))
# Inserts the "EQ" column in place. get_eq_scores reads the global eq_algo
# table, which must hold the F19 events when this runs.
get_eq_scores(early_test, 1, 3)
#get_score_regression(early_test)
# Rows to predict for the F19 test release
late_test = pd.read_csv(os.path.join(F19_TEST_PATH, 'late.csv'))

# is_train=False: the scaler fitted on the training features is only applied
X_test = extract_features(late_test, early_test, scaler, False)

In [None]:
# Refit the baseline model on the training features
model = LogisticRegressionCV()
model.fit(X_train, y_train)
# Second column of predict_proba, used as the predicted probability of Label == True
predictions = model.predict_proba(X_test)[:,1]

# Attach the probabilities to a copy of the test rows
predictions_df = late_test.copy()
predictions_df['Label'] = predictions
predictions_df

Unnamed: 0,SubjectID,AssignmentID,ProblemID,Label
0,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,41,0.819182
1,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,43,0.757209
2,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,44,0.879472
3,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,46,0.742706
4,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,49,0.777682
...,...,...,...,...
2360,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,64,0.316928
2361,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,70,0.174019
2362,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,71,0.298714
2363,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,112,0.273103


In [None]:
# Same steps as the previous cell, followed by writing the result to disk
model.fit(X_train, y_train)
predictions = model.predict_proba(X_test)[:,1]

predictions_df = late_test.copy()
predictions_df['Label'] = predictions
predictions_df
# Overwrites predictions.csv in the working directory (the index is written too)
predictions_df.to_csv('predictions.csv')

In [None]:

# NOTE(review): softCL is first assigned in the "Voting Classifiers" section
# further down; run top-to-bottom on a fresh kernel this line raises NameError.
# predict() returns class labels, so 'Label' holds True/False here rather
# than probabilities.
predictions = softCL.predict(X_test)
predictions_df = late_test.copy()
predictions_df['Label'] = predictions
predictions_df
# Overwrites the predictions.csv written by the previous cell
predictions_df.to_csv('predictions.csv')

In [None]:
predictions_df

Unnamed: 0,SubjectID,AssignmentID,ProblemID,Label
0,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,41,True
1,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,43,True
2,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,44,True
3,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,46,True
4,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,49,True
...,...,...,...,...
2360,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,64,False
2361,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,70,False
2362,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,71,False
2363,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,112,False


# Task 1 - Voting Classifiers

In [None]:
y_train

array([False,  True,  True, ...,  True,  True,  True])

### HyperParameter tuning MLP

In [None]:
# Exhaustive search over MLP settings (2 values each for 5 parameters),
# scored by 5-fold cross-validation.
# NOTE(review): no random_state is set, so results can differ between runs.
mlp_gs = MLPClassifier(max_iter=100)
parameter_space = {
    'hidden_layer_sizes': [(10,30,10),(20,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
from sklearn.model_selection import GridSearchCV
# n_jobs=-1 runs the fits on all available cores
clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5)
clf.fit(X_train, y_train) # X is train samples and y is the corresponding labels



GridSearchCV(cv=5, estimator=MLPClassifier(max_iter=100), n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'alpha': [0.0001, 0.05],
                         'hidden_layer_sizes': [(10, 30, 10), (20,)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']})

In [None]:
clf.best_params_

{'activation': 'relu',
 'alpha': 0.0001,
 'hidden_layer_sizes': (20,),
 'learning_rate': 'constant',
 'solver': 'adam'}

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Create the parameter grid based on the results of random search
# (1 * 4 * 2 * 3 * 3 * 4 = 288 candidate settings)
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]

}
# Create the base model
rf = RandomForestClassifier()
# Instantiate the grid search model (3-fold CV, all cores, progress output);
# it is fitted in the next cell
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)




GridSearchCV(cv=5, estimator=MLPClassifier(max_iter=100), n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'alpha': [0.0001, 0.05],
                         'hidden_layer_sizes': [(10, 30, 10), (20,)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']})

In [None]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110],
                         'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             verbose=2)

In [None]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 110,
 'max_features': 2,
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 100}

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

# Base estimators, using the settings the two grid searches above selected
rf = RandomForestClassifier(bootstrap= True, max_depth=110, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=100)
lr = LogisticRegressionCV()
mlp = MLPClassifier(solver='adam', alpha=0.0001,hidden_layer_sizes=(20, ), random_state=1, max_iter=1000)


# Best MLP parameters reported by the grid search, kept for reference:
# {'activation': 'relu',
#  'alpha': 0.0001,
#  'hidden_layer_sizes': (20,),
#  'learning_rate': 'constant',
#  'solver': 'adam'}

# Soft-voting ensemble of the three models
softCL = VotingClassifier(
    estimators=[
                ('mlp', mlp), ('lr', lr), ('rf', rf)
    ], voting="soft")
softCL = softCL.fit(X_train, y_train)

In [None]:
# Second column of predict_proba from the voting ensemble, used as the
# predicted probability of Label == True
predictions = softCL.predict_proba(X_test)[:,1]

predictions_df = late_test.copy()
predictions_df['Label'] = predictions
predictions_df
# Overwrites predictions.csv in the working directory (the index is written too)
predictions_df.to_csv('predictions.csv')

In [None]:
predictions_df

Unnamed: 0,SubjectID,AssignmentID,ProblemID,Label
0,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,41,0.783177
1,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,43,0.694655
2,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,44,0.817081
3,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,46,0.709258
4,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,49,0.759184
...,...,...,...,...
2360,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,64,0.485621
2361,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,70,0.346617
2362,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,71,0.501970
2363,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,112,0.458825


In [None]:
get_error_type("line 38: error: illegal start of expression")

6

# Task 2 Regression

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
def randf(n_estimators, max_depth, min_weight_fraction_leaf, max_features, bootstrap):
  """Build an unfitted RandomForestClassifier from the five given settings.

  Each argument is forwarded to the constructor keyword of the same name.
  """
  forest_settings = dict(
      n_estimators=n_estimators,
      max_depth=max_depth,
      min_weight_fraction_leaf=min_weight_fraction_leaf,
      max_features=max_features,
      bootstrap=bootstrap,
  )
  return RandomForestClassifier(**forest_settings)

# Random-forest settings to search, scored by accuracy with 5-fold CV
# NOTE(review): "auto" for max_features may be rejected by newer
# scikit-learn releases -- confirm against the installed version.
p_grid = {
    "n_estimators" : [50, 100, 150, 200],
    "max_depth" : [None, 10, 50, 100, 200],
    "min_weight_fraction_leaf" : [0.0, 0.2, 0.5, 0.75, 0.95],
    "max_features" : ["auto", "sqrt", "log2"],
    "bootstrap" : [True, False],
}

# NOTE(review): ran_for_base is not used by the search below, which starts
# from a default RandomForestClassifier()
ran_for_base = randf(50, None, 0.0, "auto", True)
gs = GridSearchCV(RandomForestClassifier(), p_grid, scoring="accuracy", cv=5)
gs.fit(X_train, y_train)

In [None]:
results = pd.DataFrame(gs.cv_results_["params"])
results["mean score"] = gs.cv_results_["mean_test_score"]
results.sort_values(by="mean score", ascending=False).head(20)

Unnamed: 0,bootstrap,max_depth,max_features,min_weight_fraction_leaf,n_estimators,mean score
591,False,200.0,log2,0.5,200,0.741966
564,False,200.0,sqrt,0.2,50,0.741966
327,False,,sqrt,0.2,200,0.741966
326,False,,sqrt,0.2,150,0.741966
325,False,,sqrt,0.2,100,0.741966
324,False,,sqrt,0.2,50,0.741966
550,False,200.0,auto,0.5,150,0.741966
551,False,200.0,auto,0.5,200,0.741966
565,False,200.0,sqrt,0.2,100,0.741966
329,False,,sqrt,0.5,100,0.741966


In [None]:
def svc_build(kernel="rbf", degree=3, coef0=0.0):
  """Build an unfitted SVC with probability estimates enabled.

  The three settings used to be read from undefined globals, so calling
  the function raised NameError. They are now parameters, with defaults
  chosen to match scikit-learn's own SVC defaults.
  """
  return SVC(kernel=kernel, degree=degree, coef0=coef0, probability=True)

# SVC settings to search, scored by accuracy with 5-fold CV.
# In the results table the 'linear' rows share one score whatever the
# degree and coef0 values are.
p_grid = {
    "kernel" : ["linear", "poly", "rbf", "sigmoid"],
    "degree" : [3, 4, 6],
    "coef0" : [0.0, 0.2, 0.5, 0.75, 0.95],
}

# Reuses the names p_grid and gs from the random-forest search above
gs = GridSearchCV(SVC(), p_grid, scoring="accuracy", cv=5)
gs.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'coef0': [0.0, 0.2, 0.5, 0.75, 0.95],
                         'degree': [3, 4, 6],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             scoring='accuracy')

In [None]:
results = pd.DataFrame(gs.cv_results_["params"])
results["mean score"] = gs.cv_results_["mean_test_score"]
results.sort_values(by="mean score", ascending=False)

Unnamed: 0,coef0,degree,kernel,mean score
37,0.75,3,poly,0.751723
49,0.95,3,poly,0.751485
25,0.5,3,poly,0.750058
1,0.0,3,poly,0.74887
13,0.2,3,poly,0.748154
9,0.0,6,poly,0.746486
0,0.0,3,linear,0.746484
48,0.95,3,linear,0.746484
32,0.5,6,linear,0.746484
28,0.5,4,linear,0.746484


In [None]:
# Estimators configured with the top rows of the two grid-search tables above
cherry_forest = RandomForestClassifier(bootstrap=False, max_depth=200,
                                       max_features="log2", min_weight_fraction_leaf=0.5,
                                       n_estimators=200)
# NOTE(review): neigh_cherry is created but not added to the ensemble below
neigh_cherry = knc(n_neighbors=9, weights="uniform")
svc_cherry = SVC(coef0=0.75, degree=3, kernel="poly", probability=True)


# Hard (majority) voting with two weighted voters: whenever they disagree the
# heavier vote (svc, 0.55 > 0.45) wins, so the ensemble always follows svc
optiVote = VotingClassifier(
    estimators=[
                ('svc', svc_cherry), ('rf', cherry_forest)
    ], voting='hard', weights=[0.55, 0.45])
optiVote = optiVote.fit(X_train, y_train)

In [None]:
cross_val_score(optiVote, X_train, y_train, cv=5)

array([0.76337693, 0.75952381, 0.76666667, 0.73214286, 0.73690476])

In [None]:
optiVote.score(X_train, y_train)

0.8040942632706498

In [None]:
# Same estimators as the optiVote cell, rebuilt from scratch
cherry_forest = RandomForestClassifier(bootstrap=False, max_depth=200,
                                       max_features="log2", min_weight_fraction_leaf=0.5,
                                       n_estimators=200)
# NOTE(review): neigh_cherry is created but not added to the ensemble below
neigh_cherry = knc(n_neighbors=9, weights="uniform")
svc_cherry = SVC(coef0=0.75, degree=3, kernel="poly", probability=True)


# Only the svc weight changes (0.55 -> 0.90). Under hard voting svc already
# outweighed rf, which is why the scores below match the optiVote scores.
tweakoptiVote = VotingClassifier(
    estimators=[
                ('svc', svc_cherry), ('rf', cherry_forest)
    ], voting='hard', weights=[0.90, 0.45])
tweakoptiVote = tweakoptiVote.fit(X_train, y_train)

In [None]:
cross_val_score(tweakoptiVote, X_train, y_train, cv=5)

array([0.76337693, 0.75952381, 0.76666667, 0.73214286, 0.73690476])

In [None]:
tweakoptiVote.score(X_train, y_train)

0.8040942632706498

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost with default settings, fitted on the full training set
a_clf = AdaBoostClassifier()
a_clf.fit(X_train, y_train)
# 5-fold cross-validated scores, shown as the cell output
cross_val_score(a_clf, X_train, y_train, cv=5)

array([0.73840666, 0.73809524, 0.73333333, 0.72619048, 0.77380952])

In [None]:
a_clf.score(X_train, y_train)

0.7852892168531302

In [None]:
# Same forest and SVC as before, plus an AdaBoost model
cherry_forest = RandomForestClassifier(bootstrap=False, max_depth=200,
                                       max_features="log2", min_weight_fraction_leaf=0.5,
                                       n_estimators=200)
ada_cherry = AdaBoostClassifier(learning_rate=0.25, n_estimators=20)
svc_cherry = SVC(coef0=0.75, degree=3, kernel="poly", probability=True)


# Hard voting with three weighted voters. The svc weight (0.95) is larger
# than the other two combined (0.2 + 0.55 = 0.75), so svc decides every vote.
impVote = VotingClassifier(
    estimators=[
                ('svc', svc_cherry), ('ada', ada_cherry), ('rf', cherry_forest)
    ], voting='hard', weights=[0.95, 0.2, 0.55])
impVote = impVote.fit(X_train, y_train)

In [None]:
cross_val_score(impVote, X_train, y_train, cv=5)

array([0.76337693, 0.75952381, 0.76666667, 0.73214286, 0.73690476])

In [None]:
impVote.score(X_train, y_train)

0.8040942632706498

In [None]:
def aclf(n_estimators, learning_rate, base_estimator):
  """Build an unfitted AdaBoostClassifier from the three given settings.

  Each argument is forwarded to the constructor keyword of the same name.
  """
  boost_settings = dict(
      n_estimators=n_estimators,
      learning_rate=learning_rate,
      base_estimator=base_estimator,
  )
  return AdaBoostClassifier(**boost_settings)
# NOTE(review): nval is not used by the search below
nval = knc(n_neighbors=9, weights="uniform")

# AdaBoost settings to search, scored by accuracy with 5-fold CV
p_grid = {
    "n_estimators" : [10, 20, 50, 100, 150, 200],
    "learning_rate" : [1.0, 0.5, 2, 0.001, 0.25, 0.1, 0.3, 0.95],
}

# Reuses the names p_grid and gs from the earlier searches
gs = GridSearchCV(AdaBoostClassifier(), p_grid, scoring="accuracy", cv=5)
gs.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [1.0, 0.5, 2, 0.001, 0.25, 0.1, 0.3,
                                           0.95],
                         'n_estimators': [10, 20, 50, 100, 150, 200]},
             scoring='accuracy')

In [None]:
results = pd.DataFrame(gs.cv_results_["params"])
results["mean score"] = gs.cv_results_["mean_test_score"]
results.sort_values(by="mean score", ascending=False)

Unnamed: 0,learning_rate,n_estimators,mean score
25,0.25,20,0.748154
37,0.3,20,0.743396
24,0.25,10,0.743155
33,0.1,100,0.741968
2,1.0,50,0.741967
23,0.001,200,0.741966
21,0.001,100,0.741966
20,0.001,50,0.741966
19,0.001,20,0.741966
18,0.001,10,0.741966
