# Machine learning [Kaggle](https://www.kaggle.com/c/python2020springtashkent/leaderboard) competition

In [1]:
import pandas as pd

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import uniform

### Getting competition data

In [2]:
X_train = pd.read_csv("notebook_data/competition_input/testX.csv", index_col = 'Id').to_numpy()
Y_train = pd.read_csv("notebook_data/competition_input/trainY.csv", index_col = 'Id').Value.to_numpy()
X_test = pd.read_csv("notebook_data/competition_input/testX.csv", index_col = 'Id').to_numpy()

### PolynomialFeatures definition

In [3]:
poly = PolynomialFeatures(2)

### LinearRegression
Kaggle public test score: 0.43746 👇

In [4]:
from sklearn.linear_model import LinearRegression

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

linear = LinearRegression()
linear.fit(X, Y_train)
y_pred = linear.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(linear, X, Y_train, cv=5).mean())
y_pred

Cross validation mean score: -0.16521103479844174


array([33.125   , 30.640625, 35.984375, ..., 34.84375 , 32.484375,
       35.203125])

### AdaBoostRegressor
Kaggle public test score: 0.45256 👇

In [5]:
from sklearn.ensemble import AdaBoostRegressor

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

adaboost = AdaBoostRegressor(random_state=1, n_estimators=10)
adaboost.fit(X, Y_train)
y_pred = adaboost.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(adaboost, X, Y_train, cv=5).mean())
y_pred

Cross validation mean score: -0.037967005933217376


array([33.21082726, 32.12331659, 33.21082726, ..., 32.93807552,
       31.78211262, 33.21082726])

### GradientBoostingRegressor
Kaggle public test score: 0.54379 👇

In [6]:
from sklearn.ensemble import GradientBoostingRegressor

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

gbr = GradientBoostingRegressor(
    **{
        'n_estimators': 50,
        'max_depth': 4,
        'min_samples_split': 5,
        'learning_rate': 0.01,
        'loss': 'ls'
    }
)
gbr.fit(X, Y_train)
y_pred = gbr.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(gbr, X, Y_train, cv=5).mean())
y_pred

Cross validation mean score: -0.024473972817638013


array([33.58145805, 32.56702345, 32.63200975, ..., 32.84995487,
       32.4807738 , 32.61953046])

### PassiveAggressiveRegressor
Kaggle public test score: 0.64698 👇

In [7]:
from sklearn.linear_model import PassiveAggressiveRegressor

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

passive_aggressive = PassiveAggressiveRegressor(max_iter=100, random_state=0)
passive_aggressive.fit(X, Y_train)
y_pred = passive_aggressive.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(passive_aggressive, X, Y_train, cv=5).mean())
y_pred

Cross validation mean score: -1.2664023022754967


array([20.87751328, 31.70860569, 33.13074241, ..., 42.39652048,
       25.89452714, 34.03770407])

### ARDRegression
Kaggle public test score: 0.45476 👇

In [8]:
from sklearn.linear_model import ARDRegression

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)

ard = ARDRegression(n_iter=50)
ard.fit(X, Y_train)
y_pred = ard.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(ard, X, Y_train, cv=5).mean())
y_pred

Cross validation mean score: -0.05196644525329823


array([34.89034626, 32.84161666, 33.81696983, ..., 33.45669655,
       32.03132115, 32.60130462])

### Elastic Net
Kaggle public test score: 0.46244 👇

In [9]:
from sklearn.linear_model import ElasticNet

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)
elastic = ElasticNet(max_iter=5000)

distributions = {
    'alpha': uniform(),
    'l1_ratio': uniform(),
}

rand_search = RandomizedSearchCV(elastic, distributions, n_iter=10, cv=5)
search_result = rand_search.fit(X, Y_train)

print('Best found parameters:', search_result.best_params_)
print('Best found score:', search_result.best_score_)

elastic.set_params(**search_result.best_params_)
elastic.fit(X, Y_train)
y_pred = elastic.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(elastic, X, Y_train, cv=5).mean())
y_pred

Best found parameters: {'alpha': 0.6195774747630026, 'l1_ratio': 0.061005161778817873}
Best found score: -0.01697850039573674
Cross validation mean score: -0.01697850039573674


array([32.71842963, 32.57878586, 32.80358982, ..., 32.57662461,
       32.43487262, 32.71616535])

### ElasticNetCV
Kaggle public test score: 0.45422 👇

In [10]:
from sklearn.linear_model import ElasticNetCV

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)
elastic_cv = ElasticNetCV(n_alphas=200, max_iter=5000)

distributions = {
    'l1_ratio': uniform(0, 1),
    'eps': uniform(1e-8, 1e-1),
    'tol': uniform(1e-8, 1e-3),
}

rand_search = RandomizedSearchCV(elastic_cv, distributions, n_iter=100, cv=2)
search_result = rand_search.fit(X, Y_train)

print('Best found parameters:', search_result.best_params_)
print('Best found score:', search_result.best_score_)

elastic_cv.set_params(**search_result.best_params_)
elastic_cv.fit(X, Y_train)
y_pred = elastic_cv.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(elastic_cv, X, Y_train, cv=5).mean())
y_pred

Best found parameters: {'eps': 0.010510664720959035, 'l1_ratio': 0.01762320360981562, 'tol': 0.000999149310780983}
Best found score: -0.014056040370265643
Cross validation mean score: -0.01871296272853633


array([32.6814757 , 32.57220592, 32.77577072, ..., 32.59034811,
       32.45719409, 32.68301173])

### HuberRegressor
Kaggle public test score: 0.45547 👇

In [11]:
from sklearn.linear_model import HuberRegressor

X = poly.fit_transform(X_train)
testX = poly.fit_transform(X_test)
huber = HuberRegressor(max_iter=5000)

distributions = {
    'epsilon': uniform(1, 2),
    'alpha': uniform(1e-8, 1e-2),
    'tol': uniform(1e-8, 1e-3),
}

rand_search = RandomizedSearchCV(huber, distributions, n_iter=20, cv=5)
search_result = rand_search.fit(X, Y_train)

print('Best found parameters:', search_result.best_params_)
print('Best found score:', search_result.best_score_)

huber.set_params(**search_result.best_params_)
huber.fit(X, Y_train)
y_pred = huber.predict(testX).reshape(2000,)

print("Cross validation mean score:", cross_val_score(huber, X, Y_train, cv=5).mean())
y_pred

Best found parameters: {'alpha': 0.00993503026872606, 'epsilon': 2.1983075952679583, 'tol': 0.0008828064754212744}
Best found score: -0.15846768902173114
Cross validation mean score: -0.15846768902173114


array([33.26883506, 30.85389352, 36.16050022, ..., 34.4372746 ,
       32.70028724, 35.28879432])

### Writing to file

In [12]:
Y_predicted = pd.DataFrame({'Value': y_pred, 'Id': range(len(y_pred))})
Y_predicted.to_csv('notebook_data/mullagaliev_amir.csv', index=False)