In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
COLORS = sns.color_palette()
import matplotlib.pylab as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
def plot_bar(df, xlabel, ylabel, title, color=COLORS[0], figsize=(20,10), rotation=45):
    plot = df.plot(kind='bar',color=color,figsize=figsize)
    plot.set_xlabel(xlabel,fontsize=11)
    plot.set_ylabel(ylabel,fontsize=11)
    plot.set_title(title,fontsize=13)
    plot.set_xticklabels(labels=df.index, rotation=rotation)

In [3]:
dtypes = {
    'UnitPrice' : np.float32,
    'CustomerID' : np.int32,
    'Quantity' : np.int32
}
retail = pd.read_csv('./OnlineRetailClean.csv', dtype= dtypes)
retail['InvoiceDate'] = pd.to_datetime(retail['InvoiceDate'], infer_datetime_format=True)
retail.head()

Unnamed: 0.1,Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,CheckoutPrice
0,0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3
1,1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
2,2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0
3,3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
4,4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34


In [4]:
ranks=retail.groupby('CustomerID').sum()['CheckoutPrice'].sort_values( ascending=False)

In [5]:
ranks=ranks.reset_index()
member=ranks.shape[0]

In [6]:
total = ranks['CheckoutPrice'].sum()
print(total)
total/ 100

8911407.904


89114.07904

In [7]:
# Svip = 1, vip =2, Gold =3, silver =4, family= 5
def add_rank(x):
    if x >= total * 0.005:
        return 1
    elif x >= total * 0.008:
        return 2
    elif x >= total * 0.0001:
        return 3
    elif x >= total * 0.00001:
        return 4
    else: 
        return 5

ranks['rank'] = ranks['CheckoutPrice'].apply(add_rank)
ranks

Unnamed: 0,CustomerID,CheckoutPrice,rank
0,14646,280206.02,1
1,18102,259657.30,1
2,17450,194550.79,1
3,16446,168472.50,1
4,14911,143825.06,1
...,...,...,...
4333,16878,13.30,5
4334,17956,12.75,5
4335,16454,6.90,5
4336,14792,6.20,5


In [8]:
ranks['rank'].value_counts()

4    2420
3    1781
5     117
1      20
Name: rank, dtype: int64

In [9]:
ranks.to_csv('./OnlineTest.csv', index = False)

In [10]:
# plt.scatter( vip['CheckoutPrice'], vip['CustomerID'], label = "data")

# plt.legend(loc = "best")
# plt.xlabel('CheckoutPrice')
# # plt.xlim(left =0 ,right=10000)
# plt.ylabel('CustomerID')
# plt.show()

In [11]:
df = pd.read_csv('./OnlineTest.csv')
df

Unnamed: 0,CustomerID,CheckoutPrice,rank
0,14646,280206.02,1
1,18102,259657.30,1
2,17450,194550.79,1
3,16446,168472.50,1
4,14911,143825.06,1
...,...,...,...
4333,16878,13.30,5
4334,17956,12.75,5
4335,16454,6.90,5
4336,14792,6.20,5


In [12]:
minmax = MinMaxScaler()

In [13]:
xy = np.array(df.iloc[:, [1]], dtype=np.float32)
yx = np.array(df.iloc[:, [-1]], dtype=np.float32)

In [14]:
xy = minmax.fit_transform(xy)
yx = minmax.fit_transform(yx)

In [15]:
model = tf.global_variables_initializer()

In [16]:
X_train, X_test, y_train, y_test = train_test_split(xy,yx,random_state=0)

In [17]:
X_train.shape

(3253, 1)

In [18]:
y_train.shape

(3253, 1)

In [47]:
# 플레이스 홀더를 설정합니다.
X = tf.placeholder(tf.float32, shape=[None, 1])
Y = tf.placeholder(tf.float32, shape=[None, 1])

In [48]:
# y = wX+b
W = tf.Variable(tf.random_normal([1, 1]), name="weight")
b = tf.Variable(tf.random_normal([1]), name="bias")

In [49]:
# 가설을 설정합니다. y = wX+b
# tensorflow 행렬곱 합수 matmul
hypothesis = tf.matmul(X, W) + b
# hypothesis = tf.nn.softmax(tf.matmul(X, W) + b)

In [50]:
# 비용 함수를 설정합니다. (예측값 - 실제값)제곱의 평균
# reduce_mean은 특정 차원을 제거하고 평균을 구한다.
cost = tf.reduce_mean(tf.square(hypothesis - Y))
correct_prediction = tf.equal(tf.argmax(hypothesis,1),tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
mae = tf.reduce_mean(tf.abs(hypothesis - Y))

In [51]:
# 최적화 함수를 설정합니다.
optimizer = tf.train.MomentumOptimizer(learning_rate=0.0005,momentum=0.9)
train = optimizer.minimize(mae)

In [52]:
# 세션을 생성합니다.
sess = tf.Session()

In [53]:
# 글로벌 변수를 초기화합니다.
sess.run(tf.global_variables_initializer())

In [54]:
# 학습을 수행합니다.
for step in range(100001):
    mae_, hypo_, _ = sess.run([mae, hypothesis, train], feed_dict={X: X_train, Y: y_train})
    if step % 1000 == 0:
        print("#", step, " 등급차이: ", mae_)
        print("- 등급: ", hypo_[0])
        
# 손실비용 대신 MAE(Mean absolute error) 사용해보기  

# 0  등급차이:  2.5945966
- 등급:  [-1.9004954]
# 1000  등급차이:  0.1184401
- 등급:  [0.79785824]
# 2000  등급차이:  0.118255615
- 등급:  [0.7964765]
# 3000  등급차이:  0.11807132
- 등급:  [0.7950955]
# 4000  등급차이:  0.11788721
- 등급:  [0.7937166]
# 5000  등급차이:  0.11770305
- 등급:  [0.7923376]
# 6000  등급차이:  0.117518954
- 등급:  [0.79095894]
# 7000  등급차이:  0.117334865
- 등급:  [0.7895795]
# 8000  등급차이:  0.117150724
- 등급:  [0.78820086]
# 9000  등급차이:  0.116966635
- 등급:  [0.7868221]
# 10000  등급차이:  0.11678256
- 등급:  [0.7854435]
# 11000  등급차이:  0.11659839
- 등급:  [0.7840642]
# 12000  등급차이:  0.116414346
- 등급:  [0.78268546]
# 13000  등급차이:  0.11623026
- 등급:  [0.78130656]
# 14000  등급차이:  0.116046146
- 등급:  [0.7799275]
# 15000  등급차이:  0.11586206
- 등급:  [0.7785486]
# 16000  등급차이:  0.115677975
- 등급:  [0.7771698]
# 17000  등급차이:  0.115493834
- 등급:  [0.77579117]
# 18000  등급차이:  0.115309745
- 등급:  [0.77441174]
# 19000  등급차이:  0.1151256
- 등급:  [0.7730332]
# 20000  등급차이:  0.11494154
- 등급:  [0.771654]
# 21000  등급차이:  0.1147574
- 등급:  

In [55]:
# 학습된 모델을 저장합니다.
saver = tf.train.Saver()
save_path = saver.save(sess, "./saved.cpkt")
print('학습된 모델을 저장했습니다.')

학습된 모델을 저장했습니다.
