In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor

In [2]:
data = pd.read_csv("data/hack_ru.csv",
                   index_col=0,
                   dtype={
                       'country': 'category',
                       'company': 'category',
                       'total_pack_size_ml_g': 'float',
                       'unit_pack_size_ml_g': 'float',
                       'price_per_100g_ml_dollars': 'float',
                       'ingredients': 'object'
                   },
                   skip_blank_lines=False
                   )
data.dropna(inplace=True)
data['no_of_ingredients'] = data['ingredients'].apply(lambda x : len(x.split(', ')))
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14741 entries, 0 to 14846
Data columns (total 7 columns):
country                      14741 non-null category
company                      14741 non-null category
total_pack_size_ml_g         14741 non-null float64
unit_pack_size_ml_g          14741 non-null float64
price_per_100g_ml_dollars    14741 non-null float64
ingredients                  14741 non-null object
no_of_ingredients            14741 non-null int64
dtypes: category(2), float64(3), int64(1), object(1)
memory usage: 832.3+ KB


In [3]:
vectorizer = TfidfVectorizer(max_features=9000)
ingredients_vector = pd.DataFrame(vectorizer.fit_transform(data['ingredients']).toarray())
ingredients_vector.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14741 entries, 0 to 14740
Columns: 4363 entries, 0 to 4362
dtypes: float64(4363)
memory usage: 490.7 MB


In [4]:
data = pd.concat([data,
                  ingredients_vector],
                 axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14847 entries, 0 to 14846
Columns: 4370 entries, country to 4362
dtypes: category(2), float64(4367), object(1)
memory usage: 495.0+ MB


In [5]:
X = data.copy(deep=True)
X.shape

(14847, 4370)

In [6]:
X.drop(['ingredients', 'country', 'company'], axis=1, inplace=True)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14847 entries, 0 to 14846
Columns: 4367 entries, total_pack_size_ml_g to 4362
dtypes: float64(4367)
memory usage: 494.8 MB


In [7]:
if np.any(pd.isnull(X)):
    X.dropna(inplace=True)
X.shape

(14635, 4367)

In [8]:
Y = X.pop('price_per_100g_ml_dollars')

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [10]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(11708, 4366) (11708,)
(2927, 4366) (2927,)


In [11]:
X = data.copy(deep=True)
X.shape

(14847, 4370)

In [12]:
X = pd.get_dummies(X,
                   drop_first=True,
                   columns=['country', 'company'],
                   )
X.drop(['ingredients'], axis=1, inplace=True)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14847 entries, 0 to 14846
Columns: 6367 entries, total_pack_size_ml_g to company_Brand 999
dtypes: float64(4367), uint8(2000)
memory usage: 523.1 MB


In [13]:
if np.any(np.isnan(X)):
    X.dropna(inplace=True)
X.shape

(14635, 6367)

In [14]:
Y = X.pop('price_per_100g_ml_dollars')

In [15]:
pca = PCA(n_components=1000)
X_pca = pd.DataFrame(pca.fit_transform(X))

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, Y, test_size=0.2, random_state=42)

In [17]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(11708, 1000) (11708,)
(2927, 1000) (2927,)


Support Vector Regressor (total_pack_size_ml_g, unit_pack_size_ml_g, no_of_ingredients, one_hot_encoded_country, one_hot_encoded_company)

In [18]:
svr = SVR(gamma='scale', C=1.0, epsilon=0.2)
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
mean_squared_error(y_test, y_pred)

44.81758439203953

Linear Support Vector Regressor (total_pack_size_ml_g, unit_pack_size_ml_g, no_of_ingredients, one_hot_encoded_country, one_hot_encoded_company)

In [19]:
lsvr = LinearSVR(random_state=0, tol=1e-5, loss='squared_epsilon_insensitive')
lsvr.fit(X_train, y_train)
y_pred = lsvr.predict(X_test)
mean_squared_error(y_test, y_pred)



53.95966636278959

Linear Regression (total_pack_size_ml_g, unit_pack_size_ml_g, no_of_ingredients, one_hot_encoded_country, one_hot_encoded_company)

In [20]:
lreg = LinearRegression()
lreg.fit(X_train, y_train)
y_pred = lreg.predict(X_test)
mean_squared_error(y_test, y_pred)

40.641970125807916

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

Random Forest Regressor


In [22]:
rfregr = RandomForestRegressor(max_depth=2, random_state=0,n_estimators=1000, n_jobs=-1)
rfregr.fit(X_train, y_train)
y_pred = rfregr.predict(X_test)
mean_squared_error(y_test, y_pred)



36.39143174297115

In [23]:
print(rfregr.feature_importances_)


[0.80081524 0.08521055 0.0009534  ... 0.         0.         0.        ]
