In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

from sys import platform

# Instructions

1. Load the `train.csv` file
2. Explore the data, understand it
3. Process it for future training
4. Do train, test, split for your `train.csv` file
5. `fit/train` a model from your cleaned_train_df
-----
6. Load the `test.csv` file
7. Apply the same processing you did to `train.csv` into `test.csv`
8. `predict` the price for that file
9. Only keep the columns you need
10. Export
------
11. Repeat! 🚀🔥

# Import the csv files

In [22]:
df_train = pd.read_csv('data/train.csv') #, index_col=0)
df_test = pd.read_csv('data/test.csv') #, index_col=0)

# Cleaning, processing, feature selection, etc

In [23]:
# Processing is necessary, otherwise we won't be able to fit a model
# For the sake of the example, we'll just drop categorical columns

df_train_cleaned = df_train.select_dtypes(exclude='object')
print(df_train_cleaned.shape[0])
df_train_cleaned

40455


Unnamed: 0,id,carat,depth,table,x,y,z,price
0,0,1.02,63.2,58.0,6.36,6.40,4.03,8.928
1,1,0.35,61.0,57.0,4.54,4.57,2.77,6.477
2,2,0.31,60.5,58.0,4.43,4.40,2.67,6.810
3,3,0.38,61.4,56.0,4.66,4.69,2.87,6.824
4,4,1.64,61.8,56.0,7.59,7.60,4.69,9.776
...,...,...,...,...,...,...,...,...
40450,40450,1.20,62.2,55.0,6.77,6.81,4.23,9.149
40451,40451,1.50,64.2,56.0,7.30,7.09,4.62,9.077
40452,40452,1.06,61.9,55.0,6.54,6.58,4.06,8.892
40453,40453,0.31,60.1,58.0,4.40,4.38,2.64,6.385


# Train on train.csv

## Train, test split

In [24]:
X = df_train_cleaned.iloc[:,:-1]
y = df_train_cleaned['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

## Fit

In [25]:
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)

# Just for feedback
if platform == "darwin":
    os.system("say -v Monica ayam don treinin")

# Applying same cleaning & processing to my `test.csv`

In [26]:
df_test_cleaned = df_test.select_dtypes(exclude='object')
print(df_test_cleaned.shape[0])
df_test_cleaned.sample()

13485


Unnamed: 0,id,carat,depth,table,x,y,z
10268,10268,0.7,62.9,61.0,5.64,5.62,3.54


# Predict on the `test.csv`

In [27]:
y_pred = regressor.predict(df_test_cleaned)
y_pred

# Just for feedback
if platform == "darwin":
    os.system("say -v Monica ayam don predictin")

# DF with two columns

In [28]:
print(df_test_cleaned.shape[0])
df_test_cleaned.sample()

13485


Unnamed: 0,id,carat,depth,table,x,y,z
8105,8105,0.52,61.3,56.0,5.18,5.21,3.18


In [29]:
df_test_cleaned['price'] = y_pred # Adding the predicted price
df_for_submission = df_test_cleaned[["id", "price"]] # Modifying for subnmission

In [30]:
print(df_for_submission.shape[0])
df_for_submission.sample()

13485


Unnamed: 0,id,price
3041,3041,7.775104


# Export (index=False)

In [31]:
df_for_submission.to_csv("data/my_submission.csv", index=False)

# Just for feedback
if platform == "darwin":
    os.system("say -v Monica redi for submission")

In [32]:
!open .

'open' is not recognized as an internal or external command,
operable program or batch file.
