### Import Libraries

In [39]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RANSACRegressor
from sklearn.metrics import mean_squared_error

### Import Data and Get Features and Target

In [27]:
# Load the three CSV files. Every column is read as text (dtype=str); the
# feature columns are cast to float in the "Convert DataType" cell below.
df = pd.read_csv("train.csv", dtype=str)
test = pd.read_csv("test.csv", dtype=str)
sample = pd.read_csv("sample.csv", dtype=str)

# Column roles are positional: in train.csv column 1 is the target and every
# column from index 2 onwards is a feature (column 0 of train.csv is not used).
features = df.columns[2:].tolist()
target = df.columns[1:2].tolist()
# Column 0 of test.csv holds the row identifiers. The list is named `id_cols`
# rather than `id` so the built-in `id()` is not shadowed in the kernel.
id_cols = test.columns[0:1].tolist()

# Kept as a one-column DataFrame; it is flattened when the submission is built.
test_ids = test.loc[:, id_cols]

X_train = df.loc[:, features]
y_train = df.loc[:, target]

# Selecting by the training feature names means test.csv must contain the
# same feature columns, otherwise this lookup raises a KeyError.
X_test = test.loc[:, features]

# Submission header names come from the sample file: first column is the id,
# second column is the predicted target.
id_submission_col_name = sample.columns[0]
target_submission_col_name = sample.columns[1]

### Convert DataType

In [31]:
# The CSV files were read with dtype=str, so the numeric columns must be cast
# to float before modelling. `astype` on the whole frame converts every column
# in one call and returns a new frame, so re-running this cell is harmless.
X_train = X_train.astype(float)
X_test = X_test.astype(float)

# The target was loaded as text as well; convert it explicitly instead of
# relying on the estimator to coerce strings during fit.
y_train = y_train.astype(float)

### Check Missing Values

In [32]:
# Each flag is True if any cell of the corresponding feature matrix is NaN.
has_missing_values_train = X_train.isnull().values.any()
has_missing_values_test = X_test.isnull().values.any()

# Neither the train nor the test dataset is expected to contain missing values.
# Assert it so a re-run on different data fails here, rather than letting NaNs
# flow into scaling and model fitting unnoticed.
assert not has_missing_values_train, "X_train contains missing values"
assert not has_missing_values_test, "X_test contains missing values"

### Standardization

In [33]:
# Fit the scaler on the training features only; the same fitted scaler is
# reused later to transform the test features.
scaler = StandardScaler()
scaler.fit(X_train)

# Wrap the scaled array back into a DataFrame so the feature names are kept.
X_train_std = pd.DataFrame(
    scaler.transform(X_train),
    columns=list(X_train.columns),
)

### RANSAC model

In [34]:
# Box plots revealed the presence of outliers. Regression tasks are generally
# sensitive to outliers, so a robust model (RANSAC) is used here. Normally one
# would perform model selection (for example through cross-validation), but
# that is skipped for this toy task.
ransac = RANSACRegressor(random_state=42)
ransac = ransac.fit(X_train_std, y_train)

In [35]:
# Note: I won't perform feature selection, just for simplicity.

### Prediction

In [36]:
# Scale the test features with the scaler that was fitted on the training data
# (transform only: the scaler is not re-fitted on the test set).
X_test_scaled_values = scaler.transform(X_test)
X_test_std = pd.DataFrame(X_test_scaled_values, columns=X_test.columns)

# Predict the target for every test row.
y_pred = ransac.predict(X_test_std)

In [38]:
# Display the predictions array returned by ransac.predict.
y_pred

array([[ -66.00242349],
       [ 451.4065044 ],
       [-461.67641706],
       ...,
       [ -35.13540942],
       [-131.67918453],
       [ 417.26915462]])

In [40]:
# Note: normally the real model that generates y is not available, but here it
# has been provided. Per that model, the true target is the row-wise mean of
# the features, so it can be computed for X_test and used to measure the RMSE.
y_real = X_test.mean(axis=1)
mse = mean_squared_error(y_real, y_pred)
RMSE = mse ** 0.5
print(RMSE)

1.8675639608563783e-13


### Format submission

In [60]:
# Assemble the submission frame, using the header names taken from sample.csv.
# Both inputs are 2-D with a single column, so each is flattened to 1-D first.
submission_columns = {
    id_submission_col_name: test_ids.to_numpy().ravel(),
    target_submission_col_name: y_pred.ravel(),
}
submission = pd.DataFrame(submission_columns)
submission

Unnamed: 0,Id,y
0,10000,-66.002423
1,10001,451.406504
2,10002,-461.676417
3,10003,40.501209
4,10004,-126.744722
...,...,...
1995,11995,464.715255
1996,11996,496.485334
1997,11997,-35.135409
1998,11998,-131.679185


In [61]:
# Write the submission to disk as a comma-separated file, without the index.
submission.to_csv(
    "predictions.csv",
    sep=",",
    index=False,
)