In [1]:
import pandas as pd
import lightgbm as lgb
from datetime import datetime # 高级封装接口
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import accuracy_score
le=LabelEncoder()
train=pd.read_csv('data/data71852/train.csv')
test1=pd.read_csv('data/data71852/test1.csv')

features=train.drop(['Unnamed: 0','label'],axis=1)
labels=train['label']

# 不参与建模的特征['os','lan','sid']
# os  为数据集中默认版本
# sid 为唯一值
# lan 为默认数值
remove_list=['os','lan','sid']
col=features.columns.tolist()
for i in remove_list:
    col.remove(i)




# 特征变换,对于数值过大的异常值 设置为0
features['fea_hash']=features['fea_hash'].map(lambda x:0 if len(str(x))>16 else int(x))
features['fea1_hash']=features['fea1_hash'].map(lambda x:0 if len(str(x))>16 else int(x))
# 针对version,非数值类型 设置0
features['version']=features['version'].map(lambda x:int(x) if str(x).isdigit() else 0)

# 特征筛选
features=features[col]

# 数据探索，找到导致1的关键特征值
def find_key_feature(train,selected):
    temp0=train[train['label']==0]
    temp=pd.DataFrame(columns=[0,1])
    temp[0]=temp0[selected].value_counts()/len(temp0)*100
    temp1=train[train['label']==1]
    temp[1]=temp1[selected].value_counts()/len(temp1)*100
    temp[2]=temp[1]/temp[0]
    # 选出大于10倍的特征
    result=temp[temp[2]>10].sort_values(2,ascending=False).index
    return result
key_feature={}

selected_cols=['osv', 'apptype', 'carrier', 'dev_height', 'dev_ppi','dev_width',  'media_id', 'ntt',  'package','version', 'fea_hash', 'location', 'fea1_hash','cus_type']
for selected in selected_cols:
    key_feature[selected]=find_key_feature(train,selected)

# 构造新特征，新特征字段=原始特征字段+1
def f(x,selected):
    # 判断是否在关键特征值里，是1，否0
    if x in key_feature[selected]:
        return 1
    else:
        return 0

for selected in selected_cols:
    if len(key_feature[selected])>0:
        features[selected+'1']=features[selected].apply(f,args=(selected, ))
        test1[selected+'1']=test1[selected].apply(f,args=(selected, ))

# 确定类别特征
cate_features=['apptype','carrier','ntt','version','location','cus_type']


# 增加TimeStamp
def get_date(features):
    features['timestamp']=features['timestamp'].apply(lambda x:datetime.fromtimestamp(x/1000))
    temp=pd.DatetimeIndex(features['timestamp'])
    features['year']=temp.year
    features['month']=temp.month
    features['day']=temp.day
    features['week_day']=temp.weekday
    features['hour']=temp.hour
    features['minute']=temp.minute

    # 添加time_diff
    start_time=features['timestamp'].min()
    features['time_diff']=features['timestamp']-start_time
    features['time_diff']=features['time_diff'].dt.days*24+features['time_diff'].dt.seconds/3600
    # 使用day,time_diff
    features.drop(['timestamp','year','month','week_day','minute'],axis=1,inplace=True)
    return features

# 对训练集提取时间多尺度
features=get_date(features)
# 对测试集提取时间多尺度
test1=get_date(test1)


# 需要将训练集和测试集合并，然后统一做LabelEncoder
all_df=pd.concat([train,test1])
all_df['osv']=all_df['osv'].astype('str')
all_df['osv']=le.fit_transform(all_df['osv'])
features['osv']=all_df[all_df['label'].notnull()]['osv']


# 采用交叉验证 ensemble model
def ensemble_model(clf,train_x,train_y,test,cate_features):
    num=10
    sk=StratifiedKFold(n_splits=num,shuffle=True,random_state=2021)
    prob=[]   # 记录最终结果
    mean_acc=0   #记录平均准确率
    for k,(train_index,val_index) in enumerate(sk.split(train_x,train_y)):
        train_x_real=train_x.iloc[train_index]
        train_y_real=train_y.iloc[train_index]
        val_x=train_x.iloc[val_index]
        val_y=train_y.iloc[val_index]
        # 子模型训练
        clf=clf.fit(train_x_real,train_y_real,categorical_feature=cate_features)
        val_y_pred=clf.predict(val_x)
        acc_val=accuracy_score(val_y,val_y_pred)
        # 子模型评估
        print('第{}个子模型acc:{}'.format(k+1,acc_val))
        mean_acc+=acc_val/num
        # 子模型预测0，1
        test_y_pred=clf.predict_proba(test)[:,-1]  # soft 得到概率值
        prob.append(test_y_pred)
    print(mean_acc)
    mean_prob=sum(prob)/num
    return mean_prob

# 测试集预测，保持与features中的columns一致
test_features=test1[features.columns]

# 特征变换,对于数值过大的异常值 设置为0
test_features['fea_hash']=test_features['fea_hash'].map(lambda x:0 if len(str(x))>16 else int(x))
test_features['fea1_hash']=test_features['fea1_hash'].map(lambda x:0 if len(str(x))>16 else int(x))
# 对数据清洗，将V3=>3,V1=>1,V6=>6,V2=>2
# 针对version,非数值类型 设置0
test_features['version']=test_features['version'].map(lambda x:int(x) if str(x).isdigit() else 0)
test_features['osv']=all_df[all_df['label'].isnull()]['osv']


# 使用LightGBM训练
clf=lgb.LGBMClassifier(
    num_leaves=2**7-1,
    reg_alpha=0.5,
    reg_lambda=0.5,
    objective='binary',
    max_depth=-1,
    learning_rate=0.005,
    min_child_samples=3,
    random_state=2021,
    n_estimators=10000,
    subsample=0.5,
    colsample_bytree=0.5,
)
result=ensemble_model(clf,features,labels,test_features,cate_features)

# 保存结果
a=pd.DataFrame(test1['sid'])
a['label']=result
# 转换为二分类
a['label']=a['label'].apply(lambda x:0 if x<0.9 else 1)
a.to_csv('baseline.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




第1个子模型acc:0.89048




第2个子模型acc:0.88896




第3个子模型acc:0.88898




第4个子模型acc:0.88968




第5个子模型acc:0.8874




第6个子模型acc:0.88978




第7个子模型acc:0.89156




第8个子模型acc:0.88802




第9个子模型acc:0.8881




第10个子模型acc:0.89038


0.8893340000000001
