# Machine learning [Kaggle](https://www.kaggle.com/c/python2020springtashkent/leaderboard) competition

In [0]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import uniform

### Getting competition data

In [0]:
X_train = pd.read_csv("trainX.csv", index_col = 'Id').to_numpy()
Y_train = pd.read_csv("trainY.csv", index_col = 'Id').Value.to_numpy()
X_test = pd.read_csv("testX.csv", index_col = 'Id').to_numpy()

### PolynomialFeatures definition

In [0]:
poly = PolynomialFeatures(2)

### LinearRegression
Kaggle public test score: 0.43746 👇

In [0]:
from sklearn.linear_model import LinearRegression

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

linear = LinearRegression()
linear.fit(X, Y_train)
y_pred = linear.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(linear, X, Y_train, cv=5).mean())
y_pred

Cross validation mean score: 0.3015211599362624


array([23.95507812, 32.609375  , 27.62890625, ..., 23.5703125 ,
       28.68164062, 27.42773438])

### AdaBoostRegressor from ensemble
Kaggle public test score: 0.45256 👇

In [0]:
from sklearn.ensemble import AdaBoostRegressor

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

adaboost = AdaBoostRegressor(random_state=1, n_estimators=10)
adaboost.fit(X, Y_train)
y_pred = adaboost.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(adaboost, X, Y_train, cv=5).mean())
y_pred

Cross validation mean score: 0.25955985392184266


array([29.09880147, 31.6754819 , 30.61175447, ..., 27.92263091,
       31.49051035, 29.03788317])

### GradientBoostingRegressor from ensemble
Kaggle public test score: 0.54379 👇

In [0]:
from sklearn.ensemble import GradientBoostingRegressor

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

gbr = GradientBoostingRegressor(
    **{
        'n_estimators': 50,
        'max_depth': 4,
        'min_samples_split': 5,
        'learning_rate': 0.01,
        'loss': 'ls'
    }
)
gbr.fit(X, Y_train)
y_pred = gbr.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(gbr, X, Y_train, cv=5).mean())
y_pred

Cross validation mean score: 0.16007057563783308


array([30.74411613, 33.26659839, 31.86835632, ..., 31.27416226,
       31.68948294, 31.00055132])

### PassiveAggressiveRegressor
Kaggle public test score: 0.64698 👇

In [0]:
from sklearn.linear_model import PassiveAggressiveRegressor

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

passive_aggressive = PassiveAggressiveRegressor(max_iter=10000, random_state=0)
passive_aggressive.fit(X, Y_train)
y_pred = passive_aggressive.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(passive_aggressive, X, Y_train, cv=3).mean())
y_pred

Cross validation mean score: -0.7212717507029321


array([19.72246023, 34.83825565, 29.16675255, ..., 26.77971688,
       32.18134777, 29.75120905])

### PassiveAggressiveRegressor with cross validation
Kaggle public test score: 0.42344 👇

In [0]:
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.model_selection import cross_validate

cv_num = 3

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)
passive_aggressive = PassiveAggressiveRegressor(max_iter=10000, random_state=0)
cv_results = cross_validate(passive_aggressive, X, Y_train, cv=cv_num, return_estimator=True)
cv_estimators = cv_results['estimator']
predictions = np.zeros(shape=(cv_num, 2000))
scores = np.zeros(cv_num)

for i, estimator in enumerate(cv_estimators):
    predictions[i] = estimator.predict(testX)
    scores[i] = estimator.score(X, Y_train)

y_pred = predictions.mean(axis=0)
print("Mean cv score:", scores.mean())
print("Prediction:", y_pred)

Mean cv score: -0.3362297465986151
Prediction: [22.46201319 25.68330453 24.69278945 ... 21.20174213 28.5082761
 27.00195835]


### ARDRegression
Kaggle public test score: 0.45476 👇

In [0]:
from sklearn.linear_model import ARDRegression

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

ard = ARDRegression(n_iter=50)
ard.fit(X, Y_train)
y_pred = ard.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(ard, X, Y_train, cv=5).mean())
y_pred

Cross validation mean score: -0.05196644525329823


array([34.89034626, 32.84161666, 33.81696983, ..., 33.45669655,
       32.03132115, 32.60130462])

### Elastic Net
Kaggle public test score: 0.46244 👇

In [0]:
from sklearn.linear_model import ElasticNet

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)
elastic = ElasticNet(max_iter=5000)

distributions = {
    'alpha': uniform(),
    'l1_ratio': uniform(),
}

rand_search = RandomizedSearchCV(elastic, distributions, n_iter=10, cv=5)
search_result = rand_search.fit(X, Y_train)

print('Best found parameters:', search_result.best_params_)
print('Best found score:', search_result.best_score_)

elastic.set_params(**search_result.best_params_)
elastic.fit(X, Y_train)
y_pred = elastic.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(elastic, X, Y_train, cv=5).mean())
y_pred

Best found parameters: {'alpha': 0.6195774747630026, 'l1_ratio': 0.061005161778817873}
Best found score: -0.01697850039573674
Cross validation mean score: -0.01697850039573674


array([32.71842963, 32.57878586, 32.80358982, ..., 32.57662461,
       32.43487262, 32.71616535])

### ElasticNetCV
Kaggle public test score: 0.45422 👇

In [0]:
from sklearn.linear_model import ElasticNetCV

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)
elastic_cv = ElasticNetCV(n_alphas=200, max_iter=5000)

distributions = {
    'l1_ratio': uniform(0, 1),
    'eps': uniform(1e-8, 1e-1),
    'tol': uniform(1e-8, 1e-3),
}

rand_search = RandomizedSearchCV(elastic_cv, distributions, n_iter=100, cv=2)
search_result = rand_search.fit(X, Y_train)

print('Best found parameters:', search_result.best_params_)
print('Best found score:', search_result.best_score_)

elastic_cv.set_params(**search_result.best_params_)
elastic_cv.fit(X, Y_train)
y_pred = elastic_cv.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(elastic_cv, X, Y_train, cv=5).mean())
y_pred

Best found parameters: {'eps': 0.010510664720959035, 'l1_ratio': 0.01762320360981562, 'tol': 0.000999149310780983}
Best found score: -0.014056040370265643
Cross validation mean score: -0.01871296272853633


array([32.6814757 , 32.57220592, 32.77577072, ..., 32.59034811,
       32.45719409, 32.68301173])

### HuberRegressor
Kaggle public test score: 0.45547 👇

In [0]:
from sklearn.linear_model import HuberRegressor

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)
huber = HuberRegressor(max_iter=5000)

distributions = {
    'epsilon': uniform(1, 2),
    'alpha': uniform(1e-8, 1e-2),
    'tol': uniform(1e-8, 1e-3),
}

rand_search = RandomizedSearchCV(huber, distributions, n_iter=20, cv=5)
search_result = rand_search.fit(X, Y_train)

print('Best found parameters:', search_result.best_params_)
print('Best found score:', search_result.best_score_)

huber.set_params(**search_result.best_params_)
huber.fit(X, Y_train)
y_pred = huber.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(huber, X, Y_train, cv=5).mean())
y_pred

Best found parameters: {'alpha': 0.00993503026872606, 'epsilon': 2.1983075952679583, 'tol': 0.0008828064754212744}
Best found score: -0.15846768902173114
Cross validation mean score: -0.15846768902173114


array([33.26883506, 30.85389352, 36.16050022, ..., 34.4372746 ,
       32.70028724, 35.28879432])

### LinearSVR
Kaggle public test score: - 👇

In [0]:
from sklearn.svm import LinearSVR

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

linear_svr = LinearSVR(max_iter=50000)

distributions = {
    'epsilon': uniform(0.0001, 1),
    'C': uniform(1, 1.5),
    'tol': uniform(1e-8, 1e-4),
}

rand_search = RandomizedSearchCV(linear_svr, distributions, n_iter=100, cv=5)
search_result = rand_search.fit(X, Y_train)

print('Best found parameters:', search_result.best_params_)
print('Best found score:', search_result.best_score_)

linear_svr.set_params(**search_result.best_params_)

linear_svr.fit(X, Y_train)
y_pred = linear_svr.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(passive_aggressive, X, Y_train, cv=5).mean())
y_pred

Best found parameters: {'C': 1.04022223043428, 'epsilon': 0.8109006958114781, 'tol': 1.201230247417471e-05}
Best found score: 0.34365902685672517
Cross validation mean score: -0.9277521496759714


array([24.09945927, 31.40321651, 27.51120302, ..., 24.044231  ,
       28.72179264, 27.22568184])

### Ensemble AdaBoostRegressor with PassiveAggressiveRegressor as base estimator
Kaggle public test score: 0.28720 👇

In [0]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import PassiveAggressiveRegressor

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)
passive_aggressive = PassiveAggressiveRegressor(max_iter=10000, random_state=0)

adaboost = AdaBoostRegressor(base_estimator=passive_aggressive, random_state=0, n_estimators=18)
adaboost.fit(X, Y_train)
y_pred = adaboost.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(adaboost, X, Y_train, cv=5).mean())
y_pred

Cross validation mean score: 0.1805607603637704


array([22.88865718, 29.94949951, 29.8929903 , ..., 24.55872556,
       26.65118825, 31.08964781])

### BaggingRegressor with PassiveAggressiveRegressor as base estimator
Kaggle public test score: 0.28275 👇

In [0]:
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import PassiveAggressiveRegressor

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)
passive_aggressive = PassiveAggressiveRegressor(max_iter=10000, random_state=0)

bagging = BaggingRegressor(base_estimator=passive_aggressive, random_state=0, n_estimators=20)
bagging.fit(X, Y_train)
y_pred = bagging.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(bagging, X, Y_train, cv=2).mean())
y_pred

Cross validation mean score: -0.13536868923336814


array([20.50034259, 29.56928272, 25.43463344, ..., 20.37647211,
       27.18622073, 25.98046165])

In [0]:
!pip install sklearn



### RANSACRegressor with PassiveAggressiveRegressor as base estimator
Kaggle public test score: 0.20414 👇

In [0]:
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import PassiveAggressiveRegressor

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

passive_aggressive = PassiveAggressiveRegressor(max_iter=10000)

ransac = RANSACRegressor(base_estimator=passive_aggressive, random_state=0)
ransac.fit(X, Y_train)
y_pred = ransac.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(ransac, X, Y_train, cv=3).mean())
y_pred

Cross validation mean score: -1.9837386268052946


array([28.91671581, 29.96678688, 29.74476394, ..., 22.89874605,
       29.8683492 , 20.89589256])

### KernelRidge
Kaggle public test score: 0.64749 👇

In [0]:
from sklearn.kernel_ridge import KernelRidge

kernel_ridge = KernelRidge(alpha=0.00001, kernel='laplacian', degree=2).fit(X_train, Y_train)
y_pred = kernel_ridge.predict(X_test).reshape(2000,)

print("Cross validation mean score:", cross_val_score(kernel_ridge, X, Y_train, cv=3).mean())
print(y_pred)

### KernelRidge with PassiveAggressiveRegressor
Kaggle score: 0.74813 👇

In [0]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import PassiveAggressiveRegressor

X = X_train
testX = X_test

distributions = {
    'gamma': uniform(),
}

# KernelRidge with randomized search

kernel_ridge = KernelRidge(alpha=0.00001, kernel='laplacian', degree=2)

rand_search = RandomizedSearchCV(kernel_ridge, distributions, n_iter=100, cv=5)
search_result = rand_search.fit(X, Y_train)

print('Best found parameters:', search_result.best_params_)
print('Best found score:', search_result.best_score_)

kernel_ridge.set_params(**search_result.best_params_)

kernel_ridge.fit(X, Y_train)
y_pred_1 = kernel_ridge.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(kernel_ridge, X, Y_train, cv=3).mean())
print(y_pred_1)

# PassiveAggressive

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

passive_aggressive = PassiveAggressiveRegressor(max_iter=10000, random_state=0)
passive_aggressive.fit(X, Y_train)
y_pred_2 = passive_aggressive.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(passive_aggressive, X, Y_train, cv=3).mean())
print(y_pred_2)

y_pred = (y_pred_1 + y_pred_2) / 2
y_pred

Best found parameters: {'gamma': 0.12164197894286488}
Best found score: 0.7088881839741811
Cross validation mean score: 0.70521542886137
[24.17586407 23.85556788 25.17076262 ... 20.42674991 27.86528831
 18.72382508]
Cross validation mean score: -0.7212717507029321
[19.72246023 34.83825565 29.16675255 ... 26.77971688 32.18134777
 29.75120905]


array([21.94916215, 29.34691176, 27.16875758, ..., 23.60323339,
       30.02331804, 24.23751706])

### KernelRidge with PassiveAggressiveRegressor
KernelRidge for different alphas mean

Kaggle score: 0.81800

In [54]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import PassiveAggressiveRegressor


# PassiveAggressive

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

passive_aggressive = PassiveAggressiveRegressor(max_iter=10000, random_state=0)
passive_aggressive.fit(X, Y_train)
y_pred_2 = passive_aggressive.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(passive_aggressive, X, Y_train, cv=3).mean())
print(y_pred_2)


# KernelRidge

y_pred = y_pred_2

kernel_ridge = KernelRidge()

alphas = [0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.05, 0.04, 0.06, 0.065, 0.07]

for alpha in alphas:
    kernel_ridge.set_params(alpha=alpha, kernel='laplacian', degree=2)
    kernel_ridge.fit(X_train, Y_train)
    y_pred  += kernel_ridge.predict(X_test).reshape(2000,)
    print(kernel, "kernel", "cross validation mean score:", cross_val_score(kernel_ridge, X_train, Y_train, cv=3).mean())
    # print(y_pred_1)

y_pred = y_pred / len(alphas) + 1

y_pred

Cross validation mean score: -0.7212717507029321
[19.72246023 34.83825565 29.16675255 ... 26.77971688 32.18134777
 29.75120905]
laplacian kernel cross validation mean score: 0.6862734666446514
laplacian kernel cross validation mean score: 0.6862737091575467
laplacian kernel cross validation mean score: 0.686276134664897
laplacian kernel cross validation mean score: 0.6863004269822724
laplacian kernel cross validation mean score: 0.6865465408378317
laplacian kernel cross validation mean score: 0.6890463926133839
laplacian kernel cross validation mean score: 0.7037815657239376
laplacian kernel cross validation mean score: 0.7142928252941898
laplacian kernel cross validation mean score: 0.7170250672402395
laplacian kernel cross validation mean score: 0.716256664156791
laplacian kernel cross validation mean score: 0.7171419939136449
laplacian kernel cross validation mean score: 0.7170259587835114
laplacian kernel cross validation mean score: 0.7168187852660424


array([28.01663878, 28.66619997, 29.12000313, ..., 23.78099076,
       31.36047422, 22.79485511])

In [65]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import PassiveAggressiveRegressor


# PassiveAggressive

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

passive_aggressive = PassiveAggressiveRegressor(max_iter=10000, random_state=0)
passive_aggressive.fit(X, Y_train)
y_pred_2 = passive_aggressive.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(passive_aggressive, X, Y_train, cv=3).mean())
print(y_pred_2)


# KernelRidge

y_pred = y_pred_2

kernel_ridge = KernelRidge()
# 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 
alphas = [0.001, 0.01, 0.1, 0.05, 0.04, 0.06, 0.065, 0.07, 0.075, 0.08, 0.09, .11]

for alpha in alphas:
    kernel_ridge.set_params(alpha=alpha, kernel='laplacian', degree=2)
    kernel_ridge.fit(X_train, Y_train)
    y_pred  += kernel_ridge.predict(X_test).reshape(2000,)
    print(kernel, "kernel", "cross validation mean score:", cross_val_score(kernel_ridge, X_train, Y_train, cv=3).mean())
    # print(y_pred_1)

y_pred = y_pred / len(alphas) + 1

y_pred

Cross validation mean score: -0.7212717507029321
[19.72246023 34.83825565 29.16675255 ... 26.77971688 32.18134777
 29.75120905]
laplacian kernel cross validation mean score: 0.6890463926133839
laplacian kernel cross validation mean score: 0.7037815657239376
laplacian kernel cross validation mean score: 0.7142928252941898
laplacian kernel cross validation mean score: 0.7170250672402395
laplacian kernel cross validation mean score: 0.716256664156791
laplacian kernel cross validation mean score: 0.7171419939136449
laplacian kernel cross validation mean score: 0.7170259587835114
laplacian kernel cross validation mean score: 0.7168187852660424
laplacian kernel cross validation mean score: 0.7165345749963626
laplacian kernel cross validation mean score: 0.7161847211187414
laplacian kernel cross validation mean score: 0.7153237186607734
laplacian kernel cross validation mean score: 0.7131322876640883


array([28.65577179, 30.11947859, 29.61372794, ..., 25.05875935,
       31.75722182, 24.50889078])

### Writing to file

In [0]:
Y_predicted = pd.DataFrame({'Value': y_pred, 'Id': range(len(y_pred))})
Y_predicted.to_csv('mullagaliev_amir.csv', index=False)