In [12]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import optuna

In [14]:
# Load the prepared train and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("final_train.csv", "final_test.csv"))

### Обучим линейную регрессию

In [20]:
# Linear-regression features: three tutor attributes followed by the
# 32 PCA components pca_0 ... pca_31.
lr_cols = ['tutor_rating', 'tutor_reviews', 'experience', *(f'pca_{i}' for i in range(32))]
y_train = train['mean_price']
X_train = train[lr_cols]

# Try every combination of the two boolean LinearRegression options,
# leaving all other GridSearchCV settings at their defaults.
lr_params = dict(fit_intercept=[True, False], positive=[True, False])
linear = GridSearchCV(estimator=LinearRegression(), param_grid=lr_params)
linear.fit(X_train, y_train)

In [21]:
# Read the submission template; its 'mean_price' column is overwritten by later cells.
sample_submit = pd.read_csv(filepath_or_buffer="sample_submit.csv")

In [22]:
# Predict on the test rows with the fitted linear grid search and store the
# result in the submission frame, then display it.
linear_test = test[lr_cols]
sample_submit['mean_price'] = linear.predict(linear_test)
sample_submit

Unnamed: 0,index,mean_price
0,0,16.985531
1,1,13.952870
2,2,17.359216
3,3,13.143922
4,4,14.691458
...,...,...
1511,1511,14.896787
1512,1512,17.530929
1513,1513,14.835021
1514,1514,15.729048


In [23]:
# Write the linear-regression submission without the DataFrame index.
# `index` is a boolean flag, so pass False explicitly instead of relying on
# None being falsy.
sample_submit.to_csv('linear_denis.csv', index=False)

### Обучим случайный лес

In [27]:
# Random forest on the linear-model features plus target-encoded categoricals.
# Every column of `test` that is neither a linear-model feature nor one of the
# two explicitly excluded columns is treated as categorical.
# NOTE(review): any identifier-like column present in `test` would also end up
# in cat_features and be target-encoded — verify this list against the data.
cat_features = [c for c in test.columns if c not in lr_cols + ['categories', 'tutor_head_tags']]
X_train = train[lr_cols + cat_features]
y_train = train['mean_price']
# NOTE(review): the encoder is fitted on the whole training set before the
# grid search, so the encodings used in each validation fold were computed
# with that fold's own targets and the CV scores are optimistic. Consider
# moving the encoder into a Pipeline that is passed to GridSearchCV.
encoder = TargetEncoder(cols=cat_features)
X_train = encoder.fit_transform(X_train, y_train)
forest_grid = {'n_estimators': [100, 1000, 1500], 'max_depth': [2, 3, 5]}
# A fixed random_state makes the forest, and therefore the selected
# hyper-parameters and predictions, reproducible across kernel restarts.
forest = GridSearchCV(param_grid=forest_grid,
                      estimator=RandomForestRegressor(random_state=42))
forest.fit(X_train, y_train)

In [31]:
# Encode the test features with the previously fitted `encoder`, then overwrite
# the submission's price column with the forest's predictions and display it.
forest_features = lr_cols + cat_features
forest_test = encoder.transform(test[forest_features])
sample_submit['mean_price'] = forest.predict(forest_test)
sample_submit

Unnamed: 0,index,mean_price
0,0,14.145234
1,1,14.156372
2,2,14.145234
3,3,14.145234
4,4,7.502170
...,...,...
1511,1511,14.145234
1512,1512,14.145234
1513,1513,14.145234
1514,1514,14.145234


In [None]:
# Write the random-forest submission without the DataFrame index.
# `index` is a boolean flag, so pass False explicitly instead of relying on
# None being falsy.
sample_submit.to_csv('forest_denis.csv', index=False)

### Обучим градиентный бустинг

In [34]:
# Gradient boosting on the linear-model features plus target-encoded
# categoricals. Every column of `test` that is neither a linear-model feature
# nor one of the two explicitly excluded columns is treated as categorical.
# NOTE(review): any identifier-like column present in `test` would also end up
# in cat_features and be target-encoded — verify this list against the data.
cat_features = [c for c in test.columns if c not in lr_cols + ['categories', 'tutor_head_tags']]
X_train = train[lr_cols + cat_features]
y_train = train['mean_price']
# NOTE(review): the encoder is fitted on the whole training set before the
# grid search, so the encodings used in each validation fold were computed
# with that fold's own targets and the CV scores are optimistic. Consider
# moving the encoder into a Pipeline that is passed to GridSearchCV.
encoder = TargetEncoder(cols=cat_features)
X_train = encoder.fit_transform(X_train, y_train)
# Dedicated name for the boosting grid so it does not overwrite the
# random-forest grid stored in `forest_grid`.
boosting_grid = {'n_estimators': [100, 250, 500], 'learning_rate': [0.001, 0.01, 0.1]}
# A fixed random_state keeps the fitted model reproducible across kernel restarts.
boosting = GridSearchCV(param_grid=boosting_grid,
                        estimator=GradientBoostingRegressor(random_state=42))
boosting.fit(X_train, y_train)

In [None]:
# Encode the test features with the previously fitted `encoder`, then overwrite
# the submission's price column with the boosting predictions and display it.
boosting_features = lr_cols + cat_features
forest_test = encoder.transform(test[boosting_features])
sample_submit['mean_price'] = boosting.predict(forest_test)
sample_submit

In [39]:
# Write the gradient-boosting submission without the DataFrame index.
# `index` is a boolean flag, so pass False explicitly instead of relying on
# None being falsy.
sample_submit.to_csv('boosting_denis.csv', index=False)