In [1]:
import sys
in_colab = 'google.colab' in sys.modules

if in_colab:
    # Install packages in Colab
    !pip install category_encoders==2.0.0
    !pip install pandas-profiling==2.3.0
    !pip install plotly==4.1.1

In [2]:
import pandas as pd
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter hides all warnings, including any raised
# by pandas or scikit-learn in later cells - consider narrowing it.
warnings.filterwarnings('ignore')

# Location of the raw camera CSV on GitHub
url = "https://raw.githubusercontent.com/strangelycutlemon/camera_prices/master/cameras.csv"

# Read the CSV straight from the URL (needs network access) and preview the first rows
df = pd.read_csv(url)
df.head()

Unnamed: 0,Model,Release date,Max resolution,Low resolution,Effective pixels,Zoom wide (W),Zoom tele (T),Normal focus range,Macro focus range,Storage included,Weight (inc. batteries),Dimensions,Price
0,Agfa ePhoto 1280,1997,1024.0,640.0,0.0,38.0,114.0,70.0,40.0,4.0,420.0,95.0,179.0
1,Agfa ePhoto 1680,1998,1280.0,640.0,1.0,38.0,114.0,50.0,0.0,4.0,420.0,158.0,179.0
2,Agfa ePhoto CL18,2000,640.0,0.0,0.0,45.0,45.0,0.0,0.0,2.0,0.0,0.0,179.0
3,Agfa ePhoto CL30,1999,1152.0,640.0,0.0,35.0,35.0,0.0,0.0,4.0,0.0,0.0,269.0
4,Agfa ePhoto CL30 Clik!,1999,1152.0,640.0,0.0,43.0,43.0,50.0,0.0,40.0,300.0,128.0,1299.0


In [3]:
# import seaborn as sns

# sns.pairplot(df)

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()

Unnamed: 0,Release date,Max resolution,Low resolution,Effective pixels,Zoom wide (W),Zoom tele (T),Normal focus range,Macro focus range,Storage included,Weight (inc. batteries),Dimensions,Price
count,1038.0,1038.0,1038.0,1038.0,1038.0,1038.0,1038.0,1037.0,1036.0,1036.0,1036.0,1038.0
mean,2003.590559,2474.672447,1773.936416,4.596339,32.963391,121.525048,44.145472,7.78785,17.447876,319.265444,105.363417,457.384393
std,2.724755,759.513608,830.897955,2.844044,10.333149,93.455422,24.141959,8.100081,27.440655,260.410137,24.262761,760.452918
min,1994.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0
25%,2002.0,2048.0,1120.0,3.0,35.0,96.0,30.0,3.0,8.0,180.0,92.0,149.0
50%,2004.0,2560.0,2048.0,4.0,36.0,108.0,50.0,6.0,16.0,226.0,101.0,199.0
75%,2006.0,3072.0,2560.0,7.0,38.0,117.0,60.0,10.0,20.0,350.0,115.0,399.0
max,2007.0,5616.0,4992.0,21.0,52.0,518.0,120.0,85.0,450.0,1860.0,240.0,7999.0


In [5]:
# (number of rows, number of columns) of the raw frame
df.shape

(1038, 13)

In [6]:
# Per-column overview, side by side: how many values are missing, how many
# distinct values there are, and the column dtype.
column_checks = {
    'isnull': df.isnull().sum(),
    'nunique': df.nunique(),
    'dtypes': df.dtypes,
}
pd.concat(list(column_checks.values()), axis=1, keys=list(column_checks))

Unnamed: 0,isnull,nunique,dtypes
Model,0,1038,object
Release date,0,14,int64
Max resolution,0,99,float64
Low resolution,0,70,float64
Effective pixels,0,16,float64
Zoom wide (W),0,25,float64
Zoom tele (T),0,100,float64
Normal focus range,0,32,float64
Macro focus range,1,29,float64
Storage included,2,44,float64


In [7]:
# Swap every space in the column labels for an underscore
df.columns = [label.replace(' ', '_') for label in df.columns]
df.head()

Unnamed: 0,Model,Release_date,Max_resolution,Low_resolution,Effective_pixels,Zoom_wide_(W),Zoom_tele_(T),Normal_focus_range,Macro_focus_range,Storage_included,Weight_(inc._batteries),Dimensions,Price
0,Agfa ePhoto 1280,1997,1024.0,640.0,0.0,38.0,114.0,70.0,40.0,4.0,420.0,95.0,179.0
1,Agfa ePhoto 1680,1998,1280.0,640.0,1.0,38.0,114.0,50.0,0.0,4.0,420.0,158.0,179.0
2,Agfa ePhoto CL18,2000,640.0,0.0,0.0,45.0,45.0,0.0,0.0,2.0,0.0,0.0,179.0
3,Agfa ePhoto CL30,1999,1152.0,640.0,0.0,35.0,35.0,0.0,0.0,4.0,0.0,0.0,269.0
4,Agfa ePhoto CL30 Clik!,1999,1152.0,640.0,0.0,43.0,43.0,50.0,0.0,40.0,300.0,128.0,1299.0


In [8]:
# Shorten the three labels that still carry parenthesised qualifiers
short_names = {
    'Zoom_wide_(W)': 'Zoom_wide',
    'Zoom_tele_(T)': 'Zoom_tele',
    'Weight_(inc._batteries)': 'Weight',
}
df = df.rename(columns=short_names)

In [9]:
# Drop every row that contains at least one null.
# In the outputs recorded in this notebook the 7 missing values sit in only
# 2 rows (1038 rows before, 1036 after), so very little data is lost.
df = df.dropna(axis=0)

In [10]:
# Create Brand feature: the first whitespace-separated token of the model
# name (e.g. 'Agfa ePhoto 1280' -> 'Agfa').
# The vectorised .str accessor replaces the row-wise apply(lambda ...) and
# yields NaN instead of raising IndexError if a model name is empty.
# assign() returns a new frame rather than writing into the result of
# dropna(), so the cell is safe to re-run.
df = df.assign(Brand=df['Model'].str.split().str[0])
df.head()

Unnamed: 0,Model,Release_date,Max_resolution,Low_resolution,Effective_pixels,Zoom_wide,Zoom_tele,Normal_focus_range,Macro_focus_range,Storage_included,Weight,Dimensions,Price,Brand
0,Agfa ePhoto 1280,1997,1024.0,640.0,0.0,38.0,114.0,70.0,40.0,4.0,420.0,95.0,179.0,Agfa
1,Agfa ePhoto 1680,1998,1280.0,640.0,1.0,38.0,114.0,50.0,0.0,4.0,420.0,158.0,179.0,Agfa
2,Agfa ePhoto CL18,2000,640.0,0.0,0.0,45.0,45.0,0.0,0.0,2.0,0.0,0.0,179.0,Agfa
3,Agfa ePhoto CL30,1999,1152.0,640.0,0.0,35.0,35.0,0.0,0.0,4.0,0.0,0.0,269.0,Agfa
4,Agfa ePhoto CL30 Clik!,1999,1152.0,640.0,0.0,43.0,43.0,50.0,0.0,40.0,300.0,128.0,1299.0,Agfa


In [11]:
# Feature list = every column except the two below:
#   Model - unique for every row, so too high-cardinality to encode
#   Price - the value being predicted
excluded = ['Model', 'Price']
features = df.columns.drop(excluded).tolist()

In [12]:
# Display the columns that will be fed to the model
features

['Release_date',
 'Max_resolution',
 'Low_resolution',
 'Effective_pixels',
 'Zoom_wide',
 'Zoom_tele',
 'Normal_focus_range',
 'Macro_focus_range',
 'Storage_included',
 'Weight',
 'Dimensions',
 'Brand']

In [13]:
# Three-way split: hold out a test set first, then carve a validation set
# out of what remains.
from sklearn.model_selection import train_test_split

target = 'Price'

X, y = df[features], df[target]

# 20% of all rows -> test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)
# 25% of the remaining 80% -> validation (20% of all rows)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, random_state=42, test_size=0.25)

# Report the shape of each piece as a sanity check
splits = (X_train, y_train, X_val, y_val, X_test, y_test)
print(*(part.shape for part in splits))

(621, 12) (621,) (207, 12) (207,) (208, 12) (208,)


In [20]:
!pip install category_encoders

Collecting category_encoders
  Downloading https://files.pythonhosted.org/packages/6e/a1/f7a22f144f33be78afeb06bfa78478e8284a64263a3c09b1ef54e673841e/category_encoders-2.0.0-py2.py3-none-any.whl (87kB)
Collecting scikit-learn>=0.20.0 (from category_encoders)
  Using cached https://files.pythonhosted.org/packages/76/79/60050330fe57fb59f2c53d0d11673df28c20ea9315da3652477429fc4949/scikit_learn-0.21.3-cp36-cp36m-win_amd64.whl
Collecting joblib>=0.11 (from scikit-learn>=0.20.0->category_encoders)
  Using cached https://files.pythonhosted.org/packages/cd/c1/50a758e8247561e58cb87305b1e90b171b8c767b15b12a1734001f41d356/joblib-0.13.2-py2.py3-none-any.whl
Installing collected packages: joblib, scikit-learn, category-encoders


Exception:
Traceback (most recent call last):
  File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pip\basecommand.py", line 215, in main
    status = self.run(options, args)
  File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pip\commands\install.py", line 342, in run
    prefix=options.prefix_path,
  File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pip\req\req_set.py", line 784, in install
    **kwargs
  File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pip\req\req_install.py", line 851, in install
    self.move_wheel_files(self.source_dir, root=root, prefix=prefix)
  File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pip\req\req_install.py", line 1064, in move_wheel_files
    isolated=self.isolated,
  File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
# from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
# from xgboost import XGBRFRegressor
import category_encoders as ce

# Step 1: one-hot encode, naming the new columns after the category values.
encoder = ce.OneHotEncoder(use_cat_names=True)

# Step 2: random forest with a fixed seed, using all available cores.
# Alternative tried: XGBRFRegressor(n_estimators=1000, random_state=42)
forest = RandomForestRegressor(n_jobs=-1, n_estimators=1000, random_state=42)

pipeline = make_pipeline(encoder, forest)

ModuleNotFoundError: No module named 'category_encoders'

In [None]:
# Fit the encoder + random forest pipeline on the training split only
pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np

# Baseline: predict the mean training price for every validation row.
# The mean is taken from y_train (not y_val) so the baseline, like the model,
# never sees the validation target.
baseline_guess = np.full(len(y_val), y_train.mean())
print(mean_absolute_error(y_val, baseline_guess))

# Model: mean absolute error of the fitted pipeline on the validation split
y_val_hat = pipeline.predict(X_val)
print(mean_absolute_error(y_val, y_val_hat))


In [None]:
# R^2 of the pipeline's predictions on the validation split
print(r2_score(y_val, y_val_hat))

In [None]:
from joblib import dump
# Serialize the pipeline to 'pipeline.joblib' in the current working directory
dump(pipeline, 'pipeline.joblib')

In [None]:
# regressor = pipeline.named_steps['randomforestregressor']

# regressor

In [None]:
import plotly.express as px

# Smaller model for plotting: only two numeric predictors, fitted on every row
plot_features = ['Release_date', 'Max_resolution']
X = df[plot_features]
y = df['Price']

# Same steps and hyper-parameters as the main pipeline
plot_encoder = ce.OneHotEncoder(use_cat_names=True)
plot_forest = RandomForestRegressor(n_jobs=-1, n_estimators=1000, random_state=42)
plot_pipeline = make_pipeline(plot_encoder, plot_forest)

plot_pipeline.fit(X, y)

In [None]:
# Serialize the two-feature plotting pipeline to 'plot_pipeline.joblib'
dump(plot_pipeline, 'plot_pipeline.joblib')