# Machine learning [Kaggle](https://www.kaggle.com/c/python2020springtashkent/leaderboard) competition

In [None]:
import pandas as pd

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import uniform

### Getting competition data

In [49]:
# Load the competition data as NumPy arrays; the 'Id' column becomes the
# index and is therefore dropped from the resulting matrices.
# Training features must come from the training file so that their rows
# line up with the targets in trainY.csv.
# NOTE(review): assumes the training features are stored as trainX.csv next
# to trainY.csv — confirm the file name in competition_input/.
X_train = pd.read_csv("notebook_data/competition_input/trainX.csv", index_col='Id').to_numpy()
Y_train = pd.read_csv("notebook_data/competition_input/trainY.csv", index_col='Id').Value.to_numpy()
X_test = pd.read_csv("notebook_data/competition_input/testX.csv", index_col='Id').to_numpy()

### PolynomialFeatures definition

In [None]:
# Shared degree-2 polynomial feature expansion reused by every model cell.
poly = PolynomialFeatures(degree=2)

### LinearRegression
Kaggle public test score: 0.43746 👇

In [None]:
from sklearn.linear_model import LinearRegression

# Expand the features with the shared polynomial basis. The transformer is
# fitted on the training matrix only; the test matrix is just transformed so
# it always receives the same column layout.
X = poly.fit_transform(X_train)
testX = poly.transform(X_test)

linear = LinearRegression()
linear.fit(X, Y_train)
# ravel() flattens the predictions for any number of test rows instead of
# assuming exactly 2000 of them.
y_pred = linear.predict(testX).ravel()

# Mean of the estimator's default score over 5 cross-validation folds.
print(cross_val_score(linear, X, Y_train, cv=5).mean())
y_pred

### AdaBoostRegressor
Kaggle public test score: 0.45256 👇

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# Fit the polynomial expansion on the training matrix only and reuse it,
# unchanged, for the test matrix.
X = poly.fit_transform(X_train)
testX = poly.transform(X_test)

# Fixed random_state keeps the boosted ensemble reproducible between runs.
adaboost = AdaBoostRegressor(random_state=1, n_estimators=10)
adaboost.fit(X, Y_train)
# ravel() flattens the predictions for any number of test rows instead of
# assuming exactly 2000 of them.
y_pred = adaboost.predict(testX).ravel()

# Mean of the estimator's default score over 5 cross-validation folds.
print(cross_val_score(adaboost, X, Y_train, cv=5).mean())
y_pred

### GradientBoostingRegressor
Kaggle public test score: 0.54379 👇

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit the polynomial expansion on the training matrix only and reuse it,
# unchanged, for the test matrix.
X = poly.fit_transform(X_train)
testX = poly.transform(X_test)

# The loss is left at its default (least squares). The former explicit
# 'ls' spelling is a deprecated alias that newer scikit-learn releases
# reject, while the default selects the same loss on every version.
gbr = GradientBoostingRegressor(
    n_estimators=500,
    max_depth=4,
    min_samples_split=5,
    learning_rate=0.01,
)
gbr.fit(X, Y_train)
# ravel() flattens the predictions for any number of test rows instead of
# assuming exactly 2000 of them.
y_pred = gbr.predict(testX).ravel()

# Mean of the estimator's default score over 5 cross-validation folds.
print("Cross validation mean score:", cross_val_score(gbr, X, Y_train, cv=5).mean())
y_pred

### PassiveAggressiveRegressor
Kaggle public test score: 0.64698 👇

In [None]:
from sklearn.linear_model import PassiveAggressiveRegressor

# Fit the polynomial expansion on the training matrix only and reuse it,
# unchanged, for the test matrix.
X = poly.fit_transform(X_train)
testX = poly.transform(X_test)

# Fixed random_state keeps the fitted model reproducible between runs.
passive_aggressive = PassiveAggressiveRegressor(max_iter=100, random_state=0)
passive_aggressive.fit(X, Y_train)
# ravel() flattens the predictions for any number of test rows instead of
# assuming exactly 2000 of them.
y_pred = passive_aggressive.predict(testX).ravel()

# Mean of the estimator's default score over 5 cross-validation folds.
print("Cross validation mean score:", cross_val_score(passive_aggressive, X, Y_train, cv=5).mean())
y_pred

### ARDRegression
Kaggle public test score: 0.45476 👇

In [None]:
from sklearn.linear_model import ARDRegression

# Fit the polynomial expansion on the training matrix only and reuse it,
# unchanged, for the test matrix.
X = poly.fit_transform(X_train)
testX = poly.transform(X_test)

# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`
# for this estimator — confirm against the installed version.
ard = ARDRegression(n_iter=50)
ard.fit(X, Y_train)
# ravel() flattens the predictions for any number of test rows instead of
# assuming exactly 2000 of them.
y_pred = ard.predict(testX).ravel()

# Mean of the estimator's default score over 5 cross-validation folds.
print(cross_val_score(ard, X, Y_train, cv=5).mean())
y_pred

### ElasticNet
Kaggle public test score: 0.46244 👇

In [None]:
from sklearn.linear_model import ElasticNet

# Fit the polynomial expansion on the training matrix only and reuse it,
# unchanged, for the test matrix.
X = poly.fit_transform(X_train)
testX = poly.transform(X_test)
elastic = ElasticNet(max_iter=5000)

# uniform() with no arguments samples from [0, 1] for both hyper-parameters.
distributions = {
    'alpha': uniform(),
    'l1_ratio': uniform(),
}

# Randomized search: 10 sampled parameter settings, each scored with 5 folds.
rand_search = RandomizedSearchCV(elastic, distributions, n_iter=10, cv=5)
search_result = rand_search.fit(X, Y_train)

print('Best found parameters:', search_result.best_params_)
print('Best found score:', search_result.best_score_)

# Re-fit the stand-alone estimator on the full training set with the best
# parameters found by the search.
elastic.set_params(**search_result.best_params_)
elastic.fit(X, Y_train)
# ravel() flattens the predictions for any number of test rows instead of
# assuming exactly 2000 of them.
y_pred = elastic.predict(testX).ravel()

print("Cross validation mean score:", cross_val_score(elastic, X, Y_train, cv=5).mean())
y_pred

### ElasticNetCV
Kaggle public test score: 0.45422 👇

In [None]:
from sklearn.linear_model import ElasticNetCV

# Fit the polynomial expansion on the training matrix only and reuse it,
# unchanged, for the test matrix.
X = poly.fit_transform(X_train)
testX = poly.transform(X_test)
elastic_cv = ElasticNetCV(n_alphas=200, max_iter=5000)

# scipy's uniform(loc, scale) samples from [loc, loc + scale], so the second
# argument is the width of the interval, not its upper bound.
distributions = {
    'l1_ratio': uniform(0, 1),
    'eps': uniform(1e-8, 1e-1),
    'tol': uniform(1e-8, 1e-3),
}

# Randomized search: 100 sampled parameter settings, each scored with 2 folds.
rand_search = RandomizedSearchCV(elastic_cv, distributions, n_iter=100, cv=2)
search_result = rand_search.fit(X, Y_train)

print('Best found parameters:', search_result.best_params_)
print('Best found score:', search_result.best_score_)

# Re-fit the stand-alone estimator on the full training set with the best
# parameters found by the search.
elastic_cv.set_params(**search_result.best_params_)
elastic_cv.fit(X, Y_train)
# ravel() flattens the predictions for any number of test rows instead of
# assuming exactly 2000 of them.
y_pred = elastic_cv.predict(testX).ravel()

print("Cross validation mean score:", cross_val_score(elastic_cv, X, Y_train, cv=5).mean())
y_pred

### HuberRegressor
Kaggle public test score: 0.45547 👇

In [None]:
from sklearn.linear_model import HuberRegressor

# Fit the polynomial expansion on the training matrix only and reuse it,
# unchanged, for the test matrix.
X = poly.fit_transform(X_train)
testX = poly.transform(X_test)
huber = HuberRegressor(max_iter=5000)

# scipy's uniform(loc, scale) samples from [loc, loc + scale], so e.g.
# 'epsilon' is drawn from [1, 3].
distributions = {
    'epsilon': uniform(1, 2),
    'alpha': uniform(1e-8, 1e-2),
    'tol': uniform(1e-8, 1e-3),
}

# Randomized search: 20 sampled parameter settings, each scored with 5 folds.
rand_search = RandomizedSearchCV(huber, distributions, n_iter=20, cv=5)
search_result = rand_search.fit(X, Y_train)

print('Best found parameters:', search_result.best_params_)
print('Best found score:', search_result.best_score_)

# Re-fit the stand-alone estimator on the full training set with the best
# parameters found by the search.
huber.set_params(**search_result.best_params_)
huber.fit(X, Y_train)
# ravel() flattens the predictions for any number of test rows instead of
# assuming exactly 2000 of them.
y_pred = huber.predict(testX).ravel()

print("Cross validation mean score:", cross_val_score(huber, X, Y_train, cv=5).mean())
y_pred

### Writing to file

In [None]:
# Build the submission table from whichever model cell last assigned y_pred
# and write it out without the DataFrame index.
# NOTE(review): Ids are generated as 0..n-1 rather than taken from the 'Id'
# column of testX.csv — confirm this matches the competition's Ids.
submission_columns = {'Value': y_pred, 'Id': range(len(y_pred))}
Y_predicted = pd.DataFrame(submission_columns)
Y_predicted.to_csv('notebook_data/mullagaliev_amir.csv', index=False)