In [110]:
import pathlib as pl
import pandas as pd

In [111]:
# Directory holding the competition CSV files, relative to the notebook's working directory.
DATASETS_DIR = "datasets"

# Input/output files, composed with the pathlib "/" operator.
_datasets_root = pl.Path(DATASETS_DIR)
TRAIN_PATH = _datasets_root / "train.csv"
TEST_PATH = _datasets_root / "test_features.csv"
SUBMISSION_PATH = _datasets_root / "sample_submission.csv"

In [112]:
# Read the labelled training data into memory.
df = pd.read_csv(filepath_or_buffer=TRAIN_PATH)

### Analysing data from the available dataset

In [113]:
# Show the training frame; pandas abbreviates the rendering to the first and last rows.
df

Unnamed: 0,Sex,Length,Diameter,Height,weight_1,weight_2,weight_3,weight_4,target
0,M,0.725,0.550,0.200,1.5100,0.8735,0.4265,0.5085,9
1,F,0.415,0.340,0.130,0.3675,0.1460,0.0885,0.1200,10
2,M,0.705,0.555,0.215,2.1410,1.0465,0.3830,0.5280,11
3,I,0.375,0.290,0.095,0.2875,0.1230,0.0605,0.0800,6
4,F,0.680,0.500,0.185,1.7410,0.7665,0.3255,0.4685,12
...,...,...,...,...,...,...,...,...,...
3131,F,0.575,0.480,0.165,1.0780,0.5110,0.2095,0.3060,9
3132,M,0.585,0.455,0.125,1.0270,0.3910,0.2120,0.2500,17
3133,I,0.395,0.270,0.100,0.2985,0.1445,0.0610,0.0820,5
3134,M,0.415,0.305,0.100,0.3250,0.1560,0.0505,0.0910,6


In [114]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Length,Diameter,Height,weight_1,weight_2,weight_3,weight_4,target
count,3136.0,3136.0,3136.0,3136.0,3136.0,3136.0,3136.0,3136.0
mean,0.524141,0.407868,0.139115,0.8293,0.35913,0.180738,0.239251,9.948023
std,0.120807,0.099984,0.039111,0.492789,0.22286,0.110205,0.139756,3.290532
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.34875,0.115,0.4405,0.184375,0.092875,0.13,8.0
50%,0.545,0.425,0.14,0.801,0.336,0.171,0.2345,9.0
75%,0.615,0.48,0.165,1.1565,0.503625,0.254,0.33,11.0
max,0.815,0.65,0.515,2.8255,1.488,0.76,0.897,29.0


In [115]:
# Per-column dtype and non-null count, used here to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3136 entries, 0 to 3135
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Sex       3136 non-null   object 
 1   Length    3136 non-null   float64
 2   Diameter  3136 non-null   float64
 3   Height    3136 non-null   float64
 4   weight_1  3136 non-null   float64
 5   weight_2  3136 non-null   float64
 6   weight_3  3136 non-null   float64
 7   weight_4  3136 non-null   float64
 8   target    3136 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 220.6+ KB


### Data preprocessing step

In [116]:
# Encode the categorical Sex column as integers (F -> 0, M -> 1, I -> 2).
# The result is assigned back to the frame instead of calling
# `df['Sex'].replace(..., inplace=True)`: that form mutates a temporary column
# selection, which warns in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
# Values outside the mapping are passed through untouched, so re-running the
# cell on an already-encoded column is a no-op.
sex_codes = {'F': 0, 'M': 1, 'I': 2}
df['Sex'] = df['Sex'].map(lambda value: sex_codes.get(value, value))

In [117]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation.
# A fixed random_state makes the split, and therefore the reported RMSE,
# identical on every Restart & Run All.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Separate the regression target from the feature columns for each partition.
y_train, X_train = train_df['target'], train_df.drop('target', axis=1)
y_test, X_test = test_df['target'], test_df.drop('target', axis=1)

### Model training and evaluation step

In [118]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit an ordinary least-squares baseline on the training partition.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Score on the hold-out partition.
# RMSE is taken as the square root of the MSE rather than via `squared=False`:
# that flag is deprecated in recent scikit-learn releases, so this form runs
# on both old and new versions.
y_pred = lr_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"RMSE: {rmse}")

RMSE: 2.164287126290977


### Applying the model to the test data

In [119]:
# Load the unlabelled test features and encode Sex as integers (F -> 0, M -> 1, I -> 2).
df = pd.read_csv(TEST_PATH)
# The result is assigned back to the frame instead of calling
# `df['Sex'].replace(..., inplace=True)`: that form mutates a temporary column
# selection, which warns in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
# Values outside the mapping are passed through untouched.
sex_codes = {'F': 0, 'M': 1, 'I': 2}
df['Sex'] = df['Sex'].map(lambda value: sex_codes.get(value, value))

In [120]:
# Select exactly the columns the model was fitted on, in the same order, so
# any extra column in the test file (e.g. an id) cannot reach the model.
# A missing feature column raises a KeyError here.
X_output = df[X_train.columns]
y_output = lr_model.predict(X_output)

### Writing test predictions to the sample submission file

In [121]:
# Start from the sample submission layout and fill its target column with the predictions.
df = pd.read_csv(SUBMISSION_PATH).assign(target=y_output)

In [122]:
# Write the predictions back to the submission file.
# index=False keeps the RangeIndex out of the CSV; without it an extra
# unnamed column is written and gets read back in on the next run.
df.to_csv(SUBMISSION_PATH, index=False)