In [1]:
import numpy as np 
import pandas as pd

In [15]:
#version 1: modify dates + (null -> 0) + delete rows
def clean_data(dataset):
  # 1. converting date string to date time
  date_columns = dataset[['date_close', 'deal_start_date_express', 'plan_end_date_express', 'deal_start_date_product', 'plan_end_date_product']]
  for col in date_columns:
    # date_close, ...
    year_cols_name = col+'_year'
    year_cols = []
    month_cols_name = col+'_month'
    month_cols = []
    day_cols_name = col+'_day'
    day_cols = []
    for val in date_columns[col]:
        if type(val) == str:
            dtime = pd.to_datetime(val)
            year_cols.append(dtime.year)
            month_cols.append(dtime.month)
            day_cols.append(dtime.day)
        else:
            year_cols.append(val)
            month_cols.append(val)
            day_cols.append(val)
    
    dataset.insert(0, year_cols_name, year_cols)
    dataset.insert(1, month_cols_name, month_cols)
    dataset.insert(2, day_cols_name, day_cols) 
    dataset.drop(col, axis=1, inplace=True)
  # 2. dropping redundant rows
  dataset.dropna(axis=0, inplace=True, thresh = len(dataset.columns)*0.5)

  #dataset.dropna(axis=1, thresh = len(dataset)*0.7, inplace=True) # delete the columns > 80% of which are NULL
  dataset.fillna(0, inplace=True)
  #convert dates into pandas datetime and add columns for day, month, year

  return dataset

In [16]:
train_dataset = pd.read_csv('train_data.csv') # 47 features/columns, over 900k samples
# descriptions of columns can be found in 'info.json'

In [17]:
test_dataset = pd.read_csv('test_data.csv')

Cleaning the datasets

In [18]:
train_set = clean_data(train_dataset) # 89k rows left upon dropping

In [20]:
test_set = clean_data(test_dataset)

In [21]:
x_train = train_set.iloc[:, 0:-1] 
y_train = train_set.iloc[:, -1] 

In [22]:
x_test = train_set.iloc[:, 0:-1]
y_test = train_set.iloc[:, -1]

Training the models

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

Learning Hyperparameters for KNN

In [50]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
k_param = range(1, 26, 2)
k_results = []
for k in k_param:
  knn = KNeighborsClassifier(n_neighbors = k)
  scores = cross_val_score(knn, x_train, y_train, cv = 10, scoring = 'accuracy')
  k_results.append(scores.mean())
print(k_results)

In [None]:
plt.plot(k_param, k_results)
plt.xlabel('Value of "k" for KNN')
plt.ylabel('Cross-Validated Accuracy')

In [28]:
X_train_cr, x_validation, y_train_cr, y_validation_cr = train_test_split(x_train, y_train, test_size=0.2, random_state=1)

In [41]:
cv = KFold(n_splits=10)

In [42]:
model_KN = KNeighborsClassifier(n_neighbors=10)
cv_results_KN = cross_val_score(model_KN, X_train_cr, y_train_cr, cv=cv)
print(cv_results_KN)
print('%s: %f (%f)' % ("KNeighbors", cv_results_KN.mean(), cv_results_KN.std()))

[0.92669278 0.922216   0.92654261 0.92486358 0.92528334 0.92444382
 0.92332447 0.9250035  0.92262488 0.92514342]
KNeighbors: 0.924614 (0.001428)


In [43]:
model_RF = RandomForestClassifier(n_estimators = 100)
cv_results_RF = cross_val_score(model_RF, X_train_cr, y_train_cr, cv=cv)
print(cv_results_RF)
print('%s: %f (%f)' % ("RandomForest", cv_results_RF.mean(), cv_results_RF.std()))

[0.94935646 0.95075546 0.95186792 0.951728   0.95004897 0.9510284
 0.94753043 0.94976913 0.94487197 0.95060865]
RandomForest: 0.949757 (0.002019)


In [44]:
model_LR = LogisticRegression()
cv_results_LR = cross_val_score(model_LR, X_train_cr, y_train_cr, cv=cv)
print(cv_results_LR)
print('%s: %f (%f)' % ("LogReg", cv_results_LR.mean(), cv_results_LR.std()))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[0.94082261 0.9426413  0.94291311 0.93801595 0.93885546 0.93857563
 0.93549741 0.93955506 0.93745628 0.94011473]
LogReg: 0.939445 (0.002175)


In [45]:
model_NN = MLPClassifier()
cv_results_NN = cross_val_score(model_NN, X_train_cr, y_train_cr, cv=cv)
print(cv_results_NN)
print('%s: %f (%f)' % ("NeuralNetworks", cv_results_NN.mean(), cv_results_NN.std()))

[0.94012311 0.93774482 0.74352875 0.93857563 0.8938016  0.93535749
 0.93535749 0.93353855 0.93787603 0.93283895]
NeuralNetworks: 0.912874 (0.057901)


In [48]:
model_PLA = Perceptron()
cv_results_PLA = cross_val_score(model_PLA, X_train_cr, y_train_cr, cv=cv)
print(cv_results_PLA)
print('%s: %f (%f)' % ("PLA", cv_results_PLA.mean(), cv_results_PLA.std()))

[0.93928372 0.94124231 0.92766196 0.92864139 0.90121729 0.93102001
 0.93479782 0.9317196  0.93451798 0.92654261]
PLA: 0.929664 (0.010519)


Testing the Best Model on the test set

In [47]:
model_RF.fit(x_train, y_train)
print(model_RF.score(x_test, y_test))

0.9999552272218492
