In [3]:
# 导包
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [4]:
df = pd.read_csv('./data/orderHistory_train.csv')
print(df.head(5))

         userid  orderid   orderTime  orderType city country continent
0  100000000013  1000015  1481714516          0   柏林      德国        欧洲
1  100000000013  1000014  1501959643          0  旧金山      美国       北美洲
2  100000000393  1000033  1499440296          0   巴黎      法国        欧洲
3  100000000459  1000036  1480601668          0   纽约      美国       北美洲
4  100000000459  1000034  1479146723          0  巴厘岛   印度尼西亚        亚洲


In [5]:
userAndType = df[['userid','orderType']].values

userIdSetWithQualityService = set()  # 有精品服务的客户
userIdSet = set()

for line in userAndType:
    if (line[1] == 1):
        userIdSetWithQualityService.add(line[0])
    userIdSet.add(line[0])

    
print('精品服务用户数： ' , len(userIdSetWithQualityService))
print('总用户数： ', len(userIdSet), ', 精品服务数占比： ', len(userIdSetWithQualityService) / len(userIdSet))

精品服务用户数：  1745
总用户数：  10637 , 精品服务数占比：  0.1640500141017204


In [6]:
# 预测订单
orderFutureTrain = pd.read_csv('./data/orderFuture_train.csv')
print(orderFutureTrain.head(5))

         userid  orderType
0  100000000013          0
1  100000000111          0
2  100000000127          0
3  100000000231          0
4  100000000379          0


In [7]:
# 历史交易中有精品交易的用户占比多少
userAndType = orderFutureTrain[['userid','orderType']].values

all = 0  # 总的future订单数
count = 0 # 买过精品的人再买精品
countQualityService = 0 # 总精品的数量

for line in userAndType:
    user = line[0]
    type = line[1]
    if (type == 1): countQualityService += 1
    if (user in userIdSetWithQualityService and type == 1): count += 1
    all += 1
print(count)
print(countQualityService)

print(countQualityService / all) # 购买精品的占比也在16%左右
print(count / all) 

# 买过精品交易的人， 一定在future会买精品

1745
6625
0.1643635100602873
0.04329272831021907


In [11]:
# 参与精品服务的用户集合
userWithQ = set() 
for line in userAndType: 
    if line[1] == 1: userWithQ.add(line[0])
        
print('购买精品用户数： ', len(userWithQ))

# 用户ID 交易笔数 精品服务的占比 标签
def totalTransCount(fileName, userWithQ = set()):
    df = pd.read_csv(fileName)
    user = df[['userid', 'orderType']].values
    userSet = set()
    
    from collections import defaultdict
    # 用户id 购买精品服务的历史数量
    map1 = defaultdict(int)
    # 用户id 未购买精品服务的历史数量
    map2 = defaultdict(int)
    for u in user: 
        type = u[1]
        userSet.add(u[0])
        if type == 1:
            map1[u[0]] += 1
        else:
            map2[u[0]] += 1
    
    print(len(userSet))
    totals = []
    ratios = []
    labels = []
    userss = []
    for u in userSet:
        total = map1[u] + map2[u]
        ratio = map1[u] / total
        label = 0
        if (u in userWithQ): label = 1
        totals.append(total)
        labels.append(label)
        ratios.append(ratio)
        userss.append(str(u))
        
    output = pd.DataFrame({
        "userid" : userss,
        "total" : totals,
        "ratio" : ratios,
        "label" : labels
    }, columns = ['userid', 'total', 'ratio', 'label'])
    return output
    
    
train = totalTransCount('./data/orderHistory_train.csv', userWithQ)
print(train.head(5))

train.to_csv('./data/train.csv', index = False)

购买精品用户数：  6625
10637
         userid  total  ratio  label
0  114740494336      3    0.0      1
1  114803867648      1    0.0      0
2  111840919554      2    0.0      0
3  110305771527      1    1.0      1
4  110834417672      1    0.0      0


In [9]:
# 模型训练
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

examples = train[['total', 'ratio']]
labels   = train[['label']]

cls = RandomForestClassifier(n_estimators = 100)
cls.fit(examples, labels)

cls.score(examples, labels)



0.89348500517063079

In [12]:
# 模型预测
test = totalTransCount('./data/test/orderHistory_test.csv')
test_data = test[['userid', 'total', 'ratio']].values
test_userMap = {}

for instance in test_data:
    test_userMap[instance[0]] = (instance[1], instance[2])

orderFutureTest = pd.read_csv('./data/test/orderFuture_test.csv')
test_user2 = orderFutureTest['userid'].values

userss = []
totals = []
ratios = []

for u in test_user2:
    userss.append(str(u))
    if (str(u) in test_userMap):
        value = test_userMap[str(u)]
        totals.append(value[0])
        ratios.append(value[1])
    else:
        totals.append(0)
        ratios.append(0)
        
test_data_set = pd.DataFrame({
        "userid" : userss,
        "total" : totals,
        "ratio" : ratios
})

test_data_set.to_csv('./data/test/test.csv', index = False, columns = ['userid', 'total', 'ratio'])


x_test = test_data_set[['total', 'ratio']]
predicts  = cls.predict(x_test)

submission = pd.DataFrame({
    "orderType" : predicts,
    "userid" : test_data_set["userid"]
}, columns = ['userid', 'orderType'])

submission.to_csv('test.csv',index = False)


2686
