/
main_day3.py
71 lines (55 loc) · 2.85 KB
/
main_day3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
# Load the training and evaluation tables; missing values are replaced with 0.
train = pd.read_csv('train_tmp.csv').fillna(0)
test = pd.read_csv('test.csv').fillna(0)

# Quick look at the training data: first rows, then the row/column axes.
print(train.head())
print(train.axes)

# The same three numeric columns are used as features for both tables;
# `deal_probability` is the regression target.
feature_columns = ['price', 'item_seq_number', 'image_top_1']
x_train, y_train = train[feature_columns], train['deal_probability']
x_test, y_test = test[feature_columns], test['deal_probability']

# Baseline: a single linear regression fitted on the whole training set.
lrFull = linear_model.LinearRegression().fit(x_train, y_train)
predictions_full = lrFull.predict(x_test)

# Root mean squared error of the baseline (linear regression) on the test table.
residuals_full = y_test - predictions_full
meanSquaredError = np.sum(np.square(residuals_full)) / predictions_full.size
rootMeanSquaredError = math.sqrt(meanSquaredError)
print('Full dataset root mean squared error: ', rootMeanSquaredError)
# Categorical columns used to partition the training data; every distinct
# value of each column gets its own regression model.
groupableColumnLabels = ['region', 'city', 'parent_category_name', 'category_name', 'user_type']

# Maps column label -> {column value -> LinearRegression fitted on the
# training rows that carry that value}.
groupModelsDict = {}
for groupableColumnLabel in groupableColumnLabels:
    print('Building regression models ' + groupableColumnLabel)
    # Iterating the GroupBy yields (value, sub-frame) pairs directly, so there
    # is no need to materialise an intermediate dict of groups first.
    regression_models_dict = {}
    for key, train_group in train.groupby(groupableColumnLabel):
        lr = linear_model.LinearRegression()
        lr.fit(train_group[['price', 'item_seq_number', 'image_top_1']], train_group.deal_probability)
        regression_models_dict[key] = lr
    groupModelsDict[groupableColumnLabel] = regression_models_dict
# For every test row, collect one prediction per grouping column (using the
# model fitted for that row's value of the column), then average them.
print('Making predictions')
predictions = []
for _, row in test.iterrows():
    # Single-sample feature matrix of shape (1, 3) in the order used for fitting.
    row_data = np.reshape([row['price'], row['item_seq_number'], row['image_top_1']], (1, -1))
    rowPredictions = []
    for groupableColumnLabel in groupableColumnLabels:
        # Look the value up among the models built for *this* column. Values
        # that have no model (for example, ones never seen in the training
        # data) fall back to the full-dataset model so a number is still
        # produced.
        model = groupModelsDict[groupableColumnLabel].get(row[groupableColumnLabel], lrFull)
        prediction = model.predict(row_data)
        # Linear regression is unbounded; clamp to the [0, 1] probability range.
        rowPredictions.append(min(max(0, prediction[0]), 1))
    predictions.append(rowPredictions)

# Average across however many grouping columns are configured.
meanPredictions = [sum(p) / len(groupableColumnLabels) for p in predictions]

# Root mean squared error of the averaged per-group linear regression predictions.
meanSquaredError = np.sum(np.square(y_test - meanPredictions)) / len(meanPredictions)
rootMeanSquaredError = math.sqrt(meanSquaredError)
print('Mean predicition root mean squared error: ', rootMeanSquaredError)