# import libraries

In [115]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import pickle

# import all the data

In [108]:
# Targets: the 'Relations' column of the train / test files.
train_y = pd.read_csv('train.csv')['Relations']
test_y = pd.read_csv('test.csv')['Relations']

# Encodings for pos and enr
train_x_pos_enr = pd.read_csv('train_encodings_pos_enr.csv')
test_x_pos_enr = pd.read_csv('test_encodings_pos_enr.csv')

# Encodings for e1 and e2
train_x_e1_e2 = pd.read_csv('train_encodings_e1_e2.csv')
test_x_e1_e2 = pd.read_csv('test_encodings_e1_e2.csv')

# Encodings for the shortest dependency path (SDP)
train_x_SDP = pd.read_csv('train_encodings_SDP.csv')
test_x_SDP = pd.read_csv('test_encodings_SDP.csv')

# Encodings for the words in between
train_x_words_in_between = pd.read_csv('train_enc_words_in_between.csv')
test_x_words_in_between = pd.read_csv('test_enc_words_in_between.csv')

# Encodings for the root words
train_x_root = pd.read_csv('train_enc_root.csv')
test_x_root = pd.read_csv('test_enc_root.csv')

# Combined frame: pos/enr columns followed by e1/e2 columns
train_x_pos_enr_e1_e2 = pd.concat([train_x_pos_enr, train_x_e1_e2], axis='columns')
test_x_pos_enr_e1_e2 = pd.concat([test_x_pos_enr, test_x_e1_e2], axis='columns')

# Combined frame: pos/enr/e1/e2 columns followed by SDP columns
train_x_pos_enr_e1_e2_SDP = pd.concat([train_x_pos_enr_e1_e2, train_x_SDP], axis='columns')
test_x_pos_enr_e1_e2_SDP = pd.concat([test_x_pos_enr_e1_e2, test_x_SDP], axis='columns')

# Pre-built combined feature file: pos, enr, e1, e2, root and words in between
# (going by the file name).
# NOTE(review): the original comment also listed SDP here - confirm whether the
# file actually contains SDP columns.
train_pos_enr_e1e2_root_between = pd.read_csv('train_pos_enr_e1e2_root_between.csv')
test_pos_enr_e1e2_root_between = pd.read_csv('test_pos_enr_e1e2_root_between.csv')

# calculate testing accuracy, precision, recall, f1 score

In [4]:
def _macro_scores(y_true, y_pred):
    '''
    Return (accuracy, macro precision, macro recall, macro f1) for one pair of label series.
    '''
    # zero_division=0 is the value sklearn already substitutes (with a warning) when a
    # class is never predicted; passing it explicitly keeps the scores and drops the
    # UndefinedMetricWarning noise.
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='macro', zero_division=0),
        recall_score(y_true, y_pred, average='macro', zero_division=0),
        f1_score(y_true, y_pred, average='macro', zero_division=0),
    )


def test_metrics(model, test_x, test_y):
    '''
    Score a fitted model on the test set under 2 types of evaluation.
    1. test_y and the prediction match exactly, i.e. relation and direction are both correct.
    2. Only the relation matches, irrespective of direction.

    The relation is the part of a label before the first '(' character.

    Parameters
    ----------
    model : fitted estimator exposing predict(test_x) and returning string labels.
    test_x : features to predict on.
    test_y : pandas Series of true string labels.

    Returns
    -------
    tuple of 8 values:
    (accuracy, precision, recall, f1,
     accuracy_relation, precision_relation, recall_relation, f1_relation)
    where precision, recall and f1 are macro averages.
    '''
    test_predict = pd.Series(model.predict(test_x))

    # Keep only the relation part of every label (drop the '(...)' direction suffix).
    test_y_relation = test_y.apply(lambda y: y.split('(')[0])
    test_predict_relation = test_predict.apply(lambda y: y.split('(')[0])

    exact_scores = _macro_scores(test_y, test_predict)
    relation_scores = _macro_scores(test_y_relation, test_predict_relation)

    return exact_scores + relation_scores

# we try 3 models, decision tree, svm and xgboost on our training data and find metrics over the test data

In [None]:
def run_model(train_x, train_y, test_x, test_y, features):
    '''
    Fit a default decision tree, SVM and XGBoost classifier on the training data,
    score each one on the test data and print its metrics.

    features is a label for the feature set; each row of the returned frame is
    named '<model>_<features>'.

    Returns a DataFrame with one row per model: 'model_name' followed by the
    8 metric columns produced by test_metrics.
    '''
    metric_columns = ['accuracy', 'precision', 'recall', 'f1',
                      'accuracy_relation', 'precision_relation',
                      'recall_relation', 'f1_relation']
    candidates = {
        "Decision_Tree": DecisionTreeClassifier(),
        "SVM": SVC(),
        "XGBoost": XGBClassifier(),
    }

    row_labels = []
    row_scores = []
    for name, model in candidates.items():
        model.fit(train_x, train_y)
        result = test_metrics(model, test_x, test_y)
        row_scores.append(result)
        row_labels.append(name + '_' + features)

        print('Metrics for {} model is {}'.format(name, result))

    # Metric columns first, then put the model name in front of them.
    df = pd.DataFrame(row_scores, columns=metric_columns)
    df.insert(0, 'model_name', row_labels)

    return df

# try on only 4 features, pos1, pos2, enr1, enr2 

In [154]:
results= run_model(train_x_pos_enr, train_y, test_x_pos_enr, test_y, 'pos_enr')

Metrics for Decision_Tree model is (0.25878114680276193, 0.2251329320298018, 0.19850065630844363, 0.17944705828016588, 0.27409186430501353, 0.24183202315714317, 0.2117070026283371, 0.19818225060249664)


  _warn_prf(average, modifier, msg_start, len(result))


Metrics for SVM model is (0.2674872410687481, 0.24146031353392086, 0.19687882785394048, 0.16507494400543402, 0.2842990093065146, 0.34752433877102856, 0.2093120581707783, 0.18457175421791633)
Metrics for XGBoost model is (0.2653857700390273, 0.17080885609683144, 0.20205569884965893, 0.16424282287404113, 0.27979585709996996, 0.24082638024059758, 0.21705345720783117, 0.18872406887020274)


In [155]:
results

Unnamed: 0,model_name,accuracy,precision,recall,f1,accuracy_relation,precision_relation,recall_relation,f1_relation
0,Decision_Tree_pos_enr,0.258781,0.225133,0.198501,0.179447,0.274092,0.241832,0.211707,0.198182
1,SVM_pos_enr,0.267487,0.24146,0.196879,0.165075,0.284299,0.347524,0.209312,0.184572
2,XGBoost_pos_enr,0.265386,0.170809,0.202056,0.164243,0.279796,0.240826,0.217053,0.188724


# let's try to improve the SVM by tuning its hyperparameters with GridSearchCV

In [105]:
# Search over the SVC kernel and the regularization parameter C; every combination
# is scored by cross-validated accuracy on the pos/enr training features.
param= {'kernel':('linear', 'rbf'), 'C':[0.1, 1, 5, 10]}
clf= GridSearchCV(SVC(), param, scoring='accuracy')
clf.fit(train_x_pos_enr, train_y)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 5, 10], 'kernel': ('linear', 'rbf')},
             scoring='accuracy')

In [107]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [108]:
clf.best_score_

0.31266299623297594

In [109]:
# test the best SVM model on the test set
result= test_metrics(clf, test_x_pos_enr, test_y)

  _warn_prf(average, modifier, msg_start, len(result))


In [110]:
result

(0.2674872410687481,
 0.24146031353392086,
 0.19687882785394048,
 0.16507494400543402)

In [111]:
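# These test metrics are the same as the default SVC's: the grid search selected C=1 with the rbf kernel, which are SVC's default settings.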

# train model on e1 and e2

In [157]:
results1= run_model(train_x_e1_e2, train_y, test_x_e1_e2, test_y, 'e1_e2')

Metrics for Decision_Tree model is (0.280096067247073, 0.27038651457835416, 0.25189363511261564, 0.25855364194370684, 0.2957069948964275, 0.2920657495884922, 0.27349640375267836, 0.28101706512600527)


  _warn_prf(average, modifier, msg_start, len(result))


Metrics for SVM model is (0.2990093065145602, 0.36116882747954127, 0.1939022526109584, 0.2095955351365069, 0.322425697988592, 0.396871044592977, 0.22173749175690646, 0.24131607169820227)
Metrics for XGBoost model is (0.4344040828580006, 0.4558466163079409, 0.39395723597300886, 0.39413285243671825, 0.4485139597718403, 0.47844513549090983, 0.4105667130250874, 0.4148903120691334)


In [158]:
results2= pd.concat([results, results1])

In [159]:
results2

Unnamed: 0,model_name,accuracy,precision,recall,f1,accuracy_relation,precision_relation,recall_relation,f1_relation
0,Decision_Tree_pos_enr,0.258781,0.225133,0.198501,0.179447,0.274092,0.241832,0.211707,0.198182
1,SVM_pos_enr,0.267487,0.24146,0.196879,0.165075,0.284299,0.347524,0.209312,0.184572
2,XGBoost_pos_enr,0.265386,0.170809,0.202056,0.164243,0.279796,0.240826,0.217053,0.188724
0,Decision_Tree_e1_e2,0.280096,0.270387,0.251894,0.258554,0.295707,0.292066,0.273496,0.281017
1,SVM_e1_e2,0.299009,0.361169,0.193902,0.209596,0.322426,0.396871,0.221737,0.241316
2,XGBoost_e1_e2,0.434404,0.455847,0.393957,0.394133,0.448514,0.478445,0.410567,0.41489


# use pos, enr, e1, e2 as features

In [161]:
results3= run_model(train_x_pos_enr_e1_e2, train_y, test_x_pos_enr_e1_e2, test_y, 'pos_enr_e1_e2')

Metrics for Decision_Tree model is (0.3152206544581207, 0.31576533985388594, 0.30008778398017677, 0.30583899930161534, 0.3341338937256079, 0.3381872673200819, 0.3260379313455465, 0.3311275332145162)


  _warn_prf(average, modifier, msg_start, len(result))


Metrics for SVM model is (0.33383368357850496, 0.37902770747993636, 0.23094415560300605, 0.24115156336255905, 0.3464425097568298, 0.4264696144664362, 0.25505234574337715, 0.2745592177490983)
Metrics for XGBoost model is (0.4587211047733413, 0.4626393750727147, 0.4073446753379096, 0.4080917962728326, 0.473131191834284, 0.5066455799228233, 0.43082570043031426, 0.4370827719331995)


In [162]:
results4= pd.concat([results2, results3])

In [163]:
results4

Unnamed: 0,model_name,accuracy,precision,recall,f1,accuracy_relation,precision_relation,recall_relation,f1_relation
0,Decision_Tree_pos_enr,0.258781,0.225133,0.198501,0.179447,0.274092,0.241832,0.211707,0.198182
1,SVM_pos_enr,0.267487,0.24146,0.196879,0.165075,0.284299,0.347524,0.209312,0.184572
2,XGBoost_pos_enr,0.265386,0.170809,0.202056,0.164243,0.279796,0.240826,0.217053,0.188724
0,Decision_Tree_e1_e2,0.280096,0.270387,0.251894,0.258554,0.295707,0.292066,0.273496,0.281017
1,SVM_e1_e2,0.299009,0.361169,0.193902,0.209596,0.322426,0.396871,0.221737,0.241316
2,XGBoost_e1_e2,0.434404,0.455847,0.393957,0.394133,0.448514,0.478445,0.410567,0.41489
0,Decision_Tree_pos_enr_e1_e2,0.315221,0.315765,0.300088,0.305839,0.334134,0.338187,0.326038,0.331128
1,SVM_pos_enr_e1_e2,0.333834,0.379028,0.230944,0.241152,0.346443,0.42647,0.255052,0.274559
2,XGBoost_pos_enr_e1_e2,0.458721,0.462639,0.407345,0.408092,0.473131,0.506646,0.430826,0.437083


# use shortest dependency path as features

In [165]:
results5= run_model(train_x_SDP, train_y, test_x_SDP, test_y, 'SDP')

Metrics for Decision_Tree model is (0.17802461723206245, 0.1528206351454658, 0.13823521517904594, 0.133886970460789, 0.22425697988592014, 0.21297495193600846, 0.19231701236287133, 0.18692936030948412)


  _warn_prf(average, modifier, msg_start, len(result))


Metrics for SVM model is (0.1840288201741219, 0.09850905644499663, 0.09759853256492868, 0.0761310946799203, 0.22305613929750825, 0.1780145323683799, 0.13302665419624765, 0.10404084237426586)
Metrics for XGBoost model is (0.20894626238366856, 0.26213112824677204, 0.13896511801103018, 0.14192001507076255, 0.25427799459621736, 0.3705503740206382, 0.18510546647576648, 0.1887895264525192)


In [166]:
results6= pd.concat([results4, results5])

In [167]:
results6

Unnamed: 0,model_name,accuracy,precision,recall,f1,accuracy_relation,precision_relation,recall_relation,f1_relation
0,Decision_Tree_pos_enr,0.258781,0.225133,0.198501,0.179447,0.274092,0.241832,0.211707,0.198182
1,SVM_pos_enr,0.267487,0.24146,0.196879,0.165075,0.284299,0.347524,0.209312,0.184572
2,XGBoost_pos_enr,0.265386,0.170809,0.202056,0.164243,0.279796,0.240826,0.217053,0.188724
0,Decision_Tree_e1_e2,0.280096,0.270387,0.251894,0.258554,0.295707,0.292066,0.273496,0.281017
1,SVM_e1_e2,0.299009,0.361169,0.193902,0.209596,0.322426,0.396871,0.221737,0.241316
2,XGBoost_e1_e2,0.434404,0.455847,0.393957,0.394133,0.448514,0.478445,0.410567,0.41489
0,Decision_Tree_pos_enr_e1_e2,0.315221,0.315765,0.300088,0.305839,0.334134,0.338187,0.326038,0.331128
1,SVM_pos_enr_e1_e2,0.333834,0.379028,0.230944,0.241152,0.346443,0.42647,0.255052,0.274559
2,XGBoost_pos_enr_e1_e2,0.458721,0.462639,0.407345,0.408092,0.473131,0.506646,0.430826,0.437083
0,Decision_Tree_SDP,0.178025,0.152821,0.138235,0.133887,0.224257,0.212975,0.192317,0.186929


In [168]:
results7=results6.reset_index(drop=True)

In [172]:
results7.to_csv('results.csv', index=False)

# trying word embeddings using padding technique

In [173]:
train_x_e1_e2_padding= pd.read_csv('train_encodings_e1_e2_padding.csv')
test_x_e1_e2_padding= pd.read_csv('test_encodings_e1_e2_padding.csv')

In [175]:
results8= run_model(train_x_e1_e2_padding, train_y, test_x_e1_e2_padding, test_y, 'e1_e2_padding')

Metrics for Decision_Tree model is (0.28459921945361755, 0.27314792327170556, 0.2589556532116514, 0.2638655373567874, 0.30021014710297206, 0.30100922921122875, 0.2869332435278606, 0.2924721235605481)


  _warn_prf(average, modifier, msg_start, len(result))


Metrics for SVM model is (0.31702191534073854, 0.3870590376773628, 0.21637373676863197, 0.23565953515342064, 0.3308315821074752, 0.38790871504974483, 0.24204598943065986, 0.2595684661015117)
Metrics for XGBoost model is (0.4596217352146503, 0.467793366530815, 0.41453028214432325, 0.4155103153710309, 0.47343140198138695, 0.5075276664172016, 0.43043390807631693, 0.4366009984894648)


In [176]:
results9= pd.concat([results7, results8])

In [177]:
results9.to_csv('results.csv', index=False)

In [178]:
results9

Unnamed: 0,model_name,accuracy,precision,recall,f1,accuracy_relation,precision_relation,recall_relation,f1_relation
0,Decision_Tree_pos_enr,0.258781,0.225133,0.198501,0.179447,0.274092,0.241832,0.211707,0.198182
1,SVM_pos_enr,0.267487,0.24146,0.196879,0.165075,0.284299,0.347524,0.209312,0.184572
2,XGBoost_pos_enr,0.265386,0.170809,0.202056,0.164243,0.279796,0.240826,0.217053,0.188724
3,Decision_Tree_e1_e2,0.280096,0.270387,0.251894,0.258554,0.295707,0.292066,0.273496,0.281017
4,SVM_e1_e2,0.299009,0.361169,0.193902,0.209596,0.322426,0.396871,0.221737,0.241316
5,XGBoost_e1_e2,0.434404,0.455847,0.393957,0.394133,0.448514,0.478445,0.410567,0.41489
6,Decision_Tree_pos_enr_e1_e2,0.315221,0.315765,0.300088,0.305839,0.334134,0.338187,0.326038,0.331128
7,SVM_pos_enr_e1_e2,0.333834,0.379028,0.230944,0.241152,0.346443,0.42647,0.255052,0.274559
8,XGBoost_pos_enr_e1_e2,0.458721,0.462639,0.407345,0.408092,0.473131,0.506646,0.430826,0.437083
9,Decision_Tree_SDP,0.178025,0.152821,0.138235,0.133887,0.224257,0.212975,0.192317,0.186929


# When each word is encoded as a 100-dimensional vector with padding added, we get 2000 features for e1 and e2. Training XGBoost on them took a lot of time, and the improvement in accuracy was just 2%. So we stuck with summing up the encodings of the words to get a single 100-dimensional vector in total.

# xgboost gives the best performance, so will use that further

In [60]:
def run_xgboost(train_x, train_y, test_x, test_y):
    '''
    Fit a default XGBClassifier on the training data, score it on the test data
    and print the metrics.

    Returns a one-element list holding the metrics tuple from test_metrics.
    '''
    classifier = XGBClassifier()
    classifier.fit(train_x, train_y)

    # The metrics tuple is wrapped in a list, i.e. a single "row" of results.
    result = [test_metrics(classifier, test_x, test_y)]

    print('Metrics for {} model is {}'.format("XGBoost", result))
    return result

In [65]:
def add_result(result, features, results_path='results.csv'):
    '''
    Add one XGBoost result to the results CSV and return the updated table.

    Parameters
    ----------
    result : list holding the 8-value metrics tuple(s), e.g. the list returned by run_xgboost.
    features : label of the feature set; the row is stored as 'XGBoost_' + features.
    results_path : CSV file that accumulates the results. Defaults to 'results.csv'.

    Returns
    -------
    DataFrame with every stored result, including the new one, on a fresh 0..n-1 index.

    Notes
    -----
    A row already stored under the same model name is replaced, so re-running a
    notebook cell does not pile up duplicate rows. If the results file does not
    exist yet it is created with just the new row.
    '''
    import os

    metric_columns = ['accuracy', 'precision', 'recall', 'f1',
                      'accuracy_relation', 'precision_relation',
                      'recall_relation', 'f1_relation']
    model_name = 'XGBoost_' + features

    new_rows = pd.DataFrame(result, columns=metric_columns)
    new_rows.insert(0, 'model_name', model_name)

    if os.path.exists(results_path):
        previous = pd.read_csv(results_path)
        if 'model_name' in previous.columns:
            # Drop any earlier run of the same model so the new result replaces it.
            previous = previous[previous['model_name'] != model_name]
        # ignore_index avoids duplicate index labels in the returned frame.
        final_df = pd.concat([previous, new_rows], ignore_index=True)
    else:
        final_df = new_rows

    final_df.to_csv(results_path, index=False)
    return final_df

# use words in between with xgb

In [66]:
# Use the words-in-between encodings under the names the data-loading cell defines
# (train_x_words_in_between / test_x_words_in_between); the previous
# *_enc_words_in_between names are never defined and raise NameError on a fresh kernel.
result = run_xgboost(train_x_words_in_between, train_y, test_x_words_in_between, test_y)
add_result(result, 'words_in_between')

Metrics for XGBoost model is [(0.1570099069348544, 0.2392890988857448, 0.09280828788416151, 0.09045715155722138, 0.19933953767637347, 0.27316396437118334, 0.12620498194851806, 0.1181060537931425)]


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model_name,accuracy,precision,recall,f1,accuracy_relation,precision_relation,recall_relation,f1_relation
0,Decision_Tree_pos_enr,0.258781,0.225133,0.198501,0.179447,0.274092,0.241832,0.211707,0.198182
1,SVM_pos_enr,0.267487,0.24146,0.196879,0.165075,0.284299,0.347524,0.209312,0.184572
2,XGBoost_pos_enr,0.265386,0.170809,0.202056,0.164243,0.279796,0.240826,0.217053,0.188724
3,Decision_Tree_e1_e2,0.280096,0.270387,0.251894,0.258554,0.295707,0.292066,0.273496,0.281017
4,SVM_e1_e2,0.299009,0.361169,0.193902,0.209596,0.322426,0.396871,0.221737,0.241316
5,XGBoost_e1_e2,0.434404,0.455847,0.393957,0.394133,0.448514,0.478445,0.410567,0.41489
6,Decision_Tree_pos_enr_e1_e2,0.315221,0.315765,0.300088,0.305839,0.334134,0.338187,0.326038,0.331128
7,SVM_pos_enr_e1_e2,0.333834,0.379028,0.230944,0.241152,0.346443,0.42647,0.255052,0.274559
8,XGBoost_pos_enr_e1_e2,0.458721,0.462639,0.407345,0.408092,0.473131,0.506646,0.430826,0.437083
9,Decision_Tree_SDP,0.178025,0.152821,0.138235,0.133887,0.224257,0.212975,0.192317,0.186929


# Build model using root word

In [99]:
result = run_xgboost(train_x_root, train_y, test_x_root, test_y)
add_result(result, 'root')

Unnamed: 0,model_name,accuracy,precision,recall,f1,accuracy_relation,precision_relation,recall_relation,f1_relation
0,Decision_Tree_pos_enr,0.258781,0.225133,0.198501,0.179447,0.274092,0.241832,0.211707,0.198182
1,SVM_pos_enr,0.267487,0.24146,0.196879,0.165075,0.284299,0.347524,0.209312,0.184572
2,XGBoost_pos_enr,0.265386,0.170809,0.202056,0.164243,0.279796,0.240826,0.217053,0.188724
3,Decision_Tree_e1_e2,0.280096,0.270387,0.251894,0.258554,0.295707,0.292066,0.273496,0.281017
4,SVM_e1_e2,0.299009,0.361169,0.193902,0.209596,0.322426,0.396871,0.221737,0.241316
5,XGBoost_e1_e2,0.434404,0.455847,0.393957,0.394133,0.448514,0.478445,0.410567,0.41489
6,Decision_Tree_pos_enr_e1_e2,0.315221,0.315765,0.300088,0.305839,0.334134,0.338187,0.326038,0.331128
7,SVM_pos_enr_e1_e2,0.333834,0.379028,0.230944,0.241152,0.346443,0.42647,0.255052,0.274559
8,XGBoost_pos_enr_e1_e2,0.458721,0.462639,0.407345,0.408092,0.473131,0.506646,0.430826,0.437083
9,Decision_Tree_SDP,0.178025,0.152821,0.138235,0.133887,0.224257,0.212975,0.192317,0.186929


# use pos_enr, e1, e2, root_e1, root_e2, words_in_between 

In [109]:
result= run_xgboost(train_pos_enr_e1e2_root_between, train_y, test_pos_enr_e1e2_root_between, test_y)
add_result(result, 'pos_enr_e1e2_root_between')

Metrics for XGBoost model is [(0.48874211948363855, 0.5282345252927133, 0.4526012596555818, 0.46578577949035205, 0.5022515761032723, 0.5744090479329544, 0.4729410194896628, 0.4939676622081778)]


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model_name,accuracy,precision,recall,f1,accuracy_relation,precision_relation,recall_relation,f1_relation
0,Decision_Tree_pos_enr,0.258781,0.225133,0.198501,0.179447,0.274092,0.241832,0.211707,0.198182
1,SVM_pos_enr,0.267487,0.24146,0.196879,0.165075,0.284299,0.347524,0.209312,0.184572
2,XGBoost_pos_enr,0.265386,0.170809,0.202056,0.164243,0.279796,0.240826,0.217053,0.188724
3,Decision_Tree_e1_e2,0.280096,0.270387,0.251894,0.258554,0.295707,0.292066,0.273496,0.281017
4,SVM_e1_e2,0.299009,0.361169,0.193902,0.209596,0.322426,0.396871,0.221737,0.241316
5,XGBoost_e1_e2,0.434404,0.455847,0.393957,0.394133,0.448514,0.478445,0.410567,0.41489
6,Decision_Tree_pos_enr_e1_e2,0.315221,0.315765,0.300088,0.305839,0.334134,0.338187,0.326038,0.331128
7,SVM_pos_enr_e1_e2,0.333834,0.379028,0.230944,0.241152,0.346443,0.42647,0.255052,0.274559
8,XGBoost_pos_enr_e1_e2,0.458721,0.462639,0.407345,0.408092,0.473131,0.506646,0.430826,0.437083
9,Decision_Tree_SDP,0.178025,0.152821,0.138235,0.133887,0.224257,0.212975,0.192317,0.186929


In [121]:
# Save the column names of the combined training frame to 'col.bin'
# (presumably so other code can rebuild the same feature order - TODO confirm).
cols = train_pos_enr_e1e2_root_between.columns.tolist()
# The with-block closes the file handle even if pickling fails.
with open('col.bin', "wb") as col_file:
    pickle.dump(cols, col_file)

In [146]:
## training cross-validation precision, recall, f1-score
# def error_metrics(model, features_enc, target):
#     custom_scorer = ['precision_macro', 'recall_macro', 'f1_macro']
#     scores = cross_validate(model, features_enc, target, cv=10, scoring=custom_scorer, return_train_score=False)
#     return scores['test_precision_macro'].mean(), scores['test_recall_macro'].mean(), scores['test_f1_macro'].mean()

In [110]:
pd.read_csv('results.csv')

Unnamed: 0,model_name,accuracy,precision,recall,f1,accuracy_relation,precision_relation,recall_relation,f1_relation
0,Decision_Tree_pos_enr,0.258781,0.225133,0.198501,0.179447,0.274092,0.241832,0.211707,0.198182
1,SVM_pos_enr,0.267487,0.24146,0.196879,0.165075,0.284299,0.347524,0.209312,0.184572
2,XGBoost_pos_enr,0.265386,0.170809,0.202056,0.164243,0.279796,0.240826,0.217053,0.188724
3,Decision_Tree_e1_e2,0.280096,0.270387,0.251894,0.258554,0.295707,0.292066,0.273496,0.281017
4,SVM_e1_e2,0.299009,0.361169,0.193902,0.209596,0.322426,0.396871,0.221737,0.241316
5,XGBoost_e1_e2,0.434404,0.455847,0.393957,0.394133,0.448514,0.478445,0.410567,0.41489
6,Decision_Tree_pos_enr_e1_e2,0.315221,0.315765,0.300088,0.305839,0.334134,0.338187,0.326038,0.331128
7,SVM_pos_enr_e1_e2,0.333834,0.379028,0.230944,0.241152,0.346443,0.42647,0.255052,0.274559
8,XGBoost_pos_enr_e1_e2,0.458721,0.462639,0.407345,0.408092,0.473131,0.506646,0.430826,0.437083
9,Decision_Tree_SDP,0.178025,0.152821,0.138235,0.133887,0.224257,0.212975,0.192317,0.186929


In [112]:
train_pos_enr_e1e2_root_between.shape

(17255, 618)

In [113]:
model = XGBClassifier()
model.fit(train_pos_enr_e1e2_root_between, train_y)

AttributeError: 'XGBClassifier' object has no attribute 'save'

In [118]:
# Save the fitted model; the with-block closes the file handle even if pickling fails.
with open('xgboost_model.bin', "wb") as model_file:
    pickle.dump(model, model_file)