In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled training set and the scoring set from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train_data.csv", "test_data.csv"))

In [None]:
# Relative frequency of each target class (normalize=True returns fractions, not counts)
train.loc[:, 'target'].value_counts(normalize=True)

In [None]:
# Number of missing entries in each column of the training frame
train.isna().sum()

In [None]:
# Split the training frame into label (`target`) and feature frame (`train`).
# The label has to be captured BEFORE the column is dropped; reading
# train['target'] after the drop raises KeyError.
# NOTE: this cell consumes the 'target' column, so re-run the data-loading cell
# before re-running this one.
target = train['target']
train = train.drop(["target"], axis=1)

# Overwrite connection_id with a positional integer id (0..n-1).
# Presumably the raw ids are non-numeric — TODO confirm against the CSV.
train["connection_id"] = range(0, len(train))

# Model inputs: every remaining column except the row identifier.
feature_names = [x for x in train.columns if x not in ['connection_id', 'target']]

In [None]:
# Stratified 70/30 hold-out split with a fixed seed; the hold-out part is used
# to score the XGBoost model further down.
# NOTE(review): `train` still carries the positional connection_id column, so
# it is part of X_train / X_test as well.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    train_size=0.7,
    stratify=target,
    random_state=42,
)

1. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree fitted on all labelled rows.
# max_features='sqrt' makes every split look at a random subset of the
# features, so the seed is pinned to keep the fitted tree (and therefore the
# submission file) reproducible across kernel restarts.
dt_clf = DecisionTreeClassifier(
    max_depth=8,
    min_samples_split=7,
    max_features='sqrt',
    random_state=42,
)
# Train on the feature columns only (connection_id is excluded via feature_names)
dt_clf.fit(train[feature_names], target)
# Predict class labels for the scoring set
dt_prediction = dt_clf.predict(test[feature_names])

In [None]:
# Write the decision-tree predictions into the sample-submission layout.
# Assumes sample_submission.csv lists rows in the same order as `test` — TODO confirm.
sub = (
    pd.read_csv("sample_submission.csv")
    .assign(target=dt_prediction)
    .astype({'target': int})
)
sub.to_csv('DecisionTree.csv', index=False)

2. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults, fitted on all labelled rows
# (fit() returns the estimator itself, so construction and training are chained).
lr_clf = LogisticRegression().fit(train[feature_names], target)

# Class labels for the scoring set
lr_prediction = lr_clf.predict(test[feature_names])

In [None]:
# Write the logistic-regression predictions into the sample-submission layout.
# NOTE: `lr_prediction` is reassigned by the Linear Regression cell further
# down, so run this export right after the Logistic Regression cell.
sub = (
    pd.read_csv("sample_submission.csv")
    .assign(target=lr_prediction)
    .astype({'target': int})
)
sub.to_csv('LogisticRegression.csv', index=False)

3. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares without an intercept, used here as a regressor on the
# class label; the continuous output is converted to labels in the export cell.
# The former `normalize=True` argument is intentionally omitted: scikit-learn
# ignored it whenever fit_intercept=False, and the parameter was removed in
# scikit-learn 1.2, where passing it raises TypeError.
lr = LinearRegression(copy_X=False, fit_intercept=False, n_jobs=1)
# Train on the feature columns only
lr.fit(train[feature_names], target)
# Continuous predictions for the scoring set
lr_prediction = lr.predict(test[feature_names])

In [None]:
# Write the linear-regression predictions into the sample-submission layout.
sub = pd.read_csv("sample_submission.csv")
sub['target'] = lr_prediction
# The regressor emits continuous values. A bare astype(int) truncates toward
# zero (1.9 -> 1) and can produce labels outside the label set, so round to the
# nearest integer and clip to the label range seen in training first.
sub['target'] = (
    sub['target']
    .round()
    .clip(lower=target.min(), upper=target.max())
    .astype(int)
)
sub.to_csv('LinearRegression.csv', index=False)

4. KNN

In [None]:
from sklearn import neighbors

# 5-nearest-neighbour classifier fitted on all labelled rows; algorithm='auto'
# is passed through to scikit-learn unchanged.
# fit() returns the estimator, so construction and training are chained.
knn = neighbors.KNeighborsClassifier(
    n_neighbors=5,
    algorithm='auto',
).fit(train[feature_names], target)

# Class labels for the scoring set
knn_prediction = knn.predict(test[feature_names])

In [None]:
# Write the KNN predictions into the sample-submission layout.
# Assumes sample_submission.csv lists rows in the same order as `test` — TODO confirm.
sub = (
    pd.read_csv("sample_submission.csv")
    .assign(target=knn_prediction)
    .astype({'target': int})
)
sub.to_csv('KNN.csv', index=False)

5. XGBoost

In [None]:
# Booster parameters for a 3-class tree model that outputs hard class labels.
xgboost_params = {
   "objective": "multi:softmax",   # predict() returns the class label directly
   "booster": "gbtree",
   # Multiclass error rate (1 - accuracy), matching the accuracy reported after
   # training. "auc" was replaced: for multiclass it needs probability output
   # (multi:softprob), which multi:softmax does not provide.
   "eval_metric": "merror",
   "eta": 0.3,
   "gamma": 0,
   "subsample": 1,
   "colsample_bytree": 1,
   "num_class": 3,                 # required by the multi:* objectives
   "max_depth": 6
}

In [None]:
# Wrap the hold-out split in XGBoost's DMatrix container.
# The training matrix carries labels; the validation matrix is features only.
xgtrain = xgb.DMatrix(data=X_train.values, label=y_train.values)
xgtest = xgb.DMatrix(data=X_test.values)

In [None]:
# Number of boosting rounds. Raise it for a better fit; training time grows
# with the number of rounds.
boost_round = 15

# Train the booster on the 70% split. No `evals` / early stopping is used, so
# the former verbose_eval and maximize arguments had no effect and are omitted.
clf = xgb.train(xgboost_params, xgtrain, num_boost_round=boost_round)

# Predict on the hold-out split with ALL trained rounds.
# The previous `ntree_limit=clf.best_iteration` was dropped: best_iteration is
# a 0-based index (so it skipped the last round), it is only defined with early
# stopping on recent XGBoost, and `ntree_limit` was removed in XGBoost 2.0.
# multi:softmax returns labels as floats; cast them to int class labels.
prediction_XG = clf.predict(xgtest).astype(int)

# Hold-out accuracy
xgb_accuracy_score = accuracy_score(y_test, prediction_XG)
print("Accuracy with XGBoost Classifier-> %.2f%%" % (xgb_accuracy_score * 100.0))

In [None]:
# Prepare an id frame for scoring the new data (the `test` frame).
# NOTE(review): despite the "text ids" name, the original connection_id values
# are overwritten with positional ids 0..n-1 here, so they are not retained.
new_data_text_ids = test[["connection_id"]].copy()
new_data_text_ids["connection_id"] = range(0, len(new_data_text_ids))

In [None]:
# Overwrite connection_id with a positional integer id (0..n-1), mirroring what
# was done to `train`. The earlier `test.drop([...], axis=1)` call was removed:
# its result was discarded, so it never changed `test`.
test["connection_id"] = range(0, len(test))

In [None]:
# Convert the scoring set to XGBoost format.
# DMatrix built from a bare array has no column names, so features are matched
# purely by position; select the columns in exactly the order the booster was
# trained on instead of relying on the CSV column order.
xg_new_data = xgb.DMatrix(test[X_train.columns].values)

In [None]:
# Predict on the scoring set with all trained rounds. `ntree_limit` was removed
# in XGBoost 2.0, and best_iteration is only meaningful with early stopping.
xgb_prediction = clf.predict(xg_new_data)

# Export in the same layout as the other models' submissions: ids come from
# sample_submission.csv and labels go into an integer 'target' column, with no
# index column. The previous version wrote positional ids 0..n-1, a
# 'pred_target' column and a stray index column.
# Assumes sample_submission.csv lists rows in the same order as `test` — TODO confirm.
final_result = pd.read_csv("sample_submission.csv")
final_result["target"] = xgb_prediction.astype(int)
final_result.to_csv("XGBoost.csv", index=False)
