## 依赖的第三方库

In [14]:
# 读取数据
import pandas as pd
import numpy as np
# import seaborn as sns
import codecs
# 建模
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

## 数据处理

### 重新实现的数据读取函数

In [15]:
def robust_readcsv(path, sep=','):
    """Read a delimited text file whose last column may itself contain ``sep``.

    The first line is the header. Every following line is split into at most
    ``len(header)`` fields, so separators beyond that stay inside the last
    field; when that happens, leading/trailing separator characters are
    stripped from the last field. Lines with fewer fields than the header are
    skipped. All values are kept as strings.

    Args:
        path: Path of the file to read.
        sep: Field separator (default ``','``).

    Returns:
        pandas.DataFrame with one column per header field and one row per
        retained line. An empty file yields an empty DataFrame.
    """
    # Try the platform default encoding first; latin-1 assigns a character to
    # every byte value, so the fallback cannot fail to decode.
    try:
        with open(path) as fh:
            lines = fh.readlines()
    except UnicodeDecodeError:
        with open(path, encoding='latin-1') as fh:
            lines = fh.readlines()

    if not lines:
        return pd.DataFrame()

    header = lines[0].strip().split(sep)
    n_cols = len(header)
    content = []
    for raw_line in lines[1:]:
        # maxsplit keeps any surplus separators inside the last field.
        fields = raw_line.strip().split(sep, n_cols - 1)
        if len(fields) < n_cols:
            # Not enough separators to fill every column: skip (best effort).
            continue
        if sep in fields[-1]:
            fields[-1] = fields[-1].strip(sep)
        content.append(fields)
    return pd.DataFrame(content, columns=header)

### 读取了全部数据集中的数据并存入了DataFrame表格中

In [16]:
# Customer tables are read with pandas directly. `ans_cus` is a second,
# independent read of the test-set customer file.
train_cus, test_cus, ans_cus = (
    pd.read_csv(cus_path, sep=',')
    for cus_path in (
        '智能家居使用场景识别挑战赛数据集/训练集/cus.csv',
        '智能家居使用场景识别挑战赛数据集/测试集/cus.csv',
        '智能家居使用场景识别挑战赛数据集/测试集/cus.csv',
    )
)

# The remaining tables go through robust_readcsv, train split first, then test.
train_devupdate, test_devupdate = (
    robust_readcsv(devupdate_path, sep=',')
    for devupdate_path in (
        '智能家居使用场景识别挑战赛数据集/训练集/devUpdata.csv',
        '智能家居使用场景识别挑战赛数据集/测试集/devUpdata.csv',
    )
)

train_control, test_control = (
    robust_readcsv(control_path, sep=',')
    for control_path in (
        '智能家居使用场景识别挑战赛数据集/训练集/control.csv',
        '智能家居使用场景识别挑战赛数据集/测试集/control.csv',
    )
)

train_devlist, test_devlist = (
    robust_readcsv(devlist_path, sep=',')
    for devlist_path in (
        '智能家居使用场景识别挑战赛数据集/训练集/devList.csv',
        '智能家居使用场景识别挑战赛数据集/测试集/devList.csv',
    )
)

### 数据分析
将训练数据和预测数据的各个表中的数据按照 'uid' 来分组 获得特征，并重置索引 index，重新设置列名

In [17]:
def _nunique_by_uid(events, columns, prefix):
    """Count the distinct values of each column in ``columns`` per 'uid'.

    Args:
        events: DataFrame containing a 'uid' column and every column in ``columns``.
        columns: Column names to count distinct values of, in output order.
        prefix: Prefix of the generated feature names.

    Returns:
        DataFrame with a 'uid' column followed by one
        ``{prefix}_{column}_count`` column per entry of ``columns``.
    """
    feat = events.groupby('uid').agg({col: 'nunique' for col in columns}).reset_index()
    feat.columns = ['uid'] + [f'{prefix}_{col}_count' for col in columns]
    return feat


def _devlist_by_uid(devlist):
    """Aggregate the device list per 'uid'.

    Returns:
        DataFrame with columns 'uid', 'didnunique', 'typenunique',
        'areaunique' (array of the distinct area values), 'areanunique'
        and 'areacount'.
    """
    feat = devlist.groupby('uid').agg({
        'did': 'nunique',
        'type': 'nunique',
        'area': ['unique', 'nunique', 'count'],
    }).reset_index()
    # The list-valued spec above produces two-level column labels; flatten
    # them by plain concatenation, e.g. ('area', 'unique') -> 'areaunique'.
    feat.columns = [top + stat for top, stat in feat.columns]
    return feat


# Device-update log: distinct devices and distinct payloads per user.
train_devupdate_feat = _nunique_by_uid(train_devupdate, ['did', 'data'], 'devupdate')
test_devupdate_feat = _nunique_by_uid(test_devupdate, ['did', 'data'], 'devupdate')

# Control log: distinct devices, forms and payloads per user.
train_control_feat = _nunique_by_uid(train_control, ['did', 'form', 'data'], 'devcontrol')
test_control_feat = _nunique_by_uid(test_control, ['did', 'form', 'data'], 'devcontrol')

# Device list: device/type counts plus the per-user set of areas.
train_devlist_feat = _devlist_by_uid(train_devlist)
test_devlist_feat = _devlist_by_uid(test_devlist)

### 拼接数据
拼接上面得到的数据，按照 'uid' 进行整合, 即 同一个 uid 的数据才会合并到一起，放置到同一张DataFrame表格中，形成feature表

In [18]:

# Build one feature row per customer. The first merge uses the default
# (inner) join, so customers without device-list features are dropped; the
# control / devupdate features are left-joined and their gaps filled with 0.
train_feat = (
    train_cus
    .merge(train_devlist_feat, on='uid')
    .merge(train_control_feat, on='uid', how='left')
    .merge(train_devupdate_feat, on='uid', how='left')
    .fillna(0)
)

# Same pipeline for the test split.
test_feat = (
    test_cus
    .merge(test_devlist_feat, on='uid')
    .merge(test_control_feat, on='uid', how='left')
    .merge(test_devupdate_feat, on='uid', how='left')
    .fillna(0)
)


### 对字符串数据'areaunique'进行的特殊处理
聚合得到的 'areaunique' 列中，每个元素是该 'uid' 下所有不同 'area' 取值组成的字符串数组，无法直接作为数值特征输入模型，因此需要对该列进行处理。我的处理方法是：把每个 'uid' 的 'area' 取值用空格拼接成一段文本，再用 TfidfVectorizer（最多保留 400 个词项）提取每个词项的 TF-IDF 权重作为特征。

In [19]:
def _areas_to_text(areas):
    """Join one user's area values into a single space-separated string."""
    return ' '.join(areas)


# Fit the vocabulary on the training split only, then reuse it for the test split.
tfidf = TfidfVectorizer(max_features=400)
train_area_text = train_feat['areaunique'].apply(_areas_to_text)
train_dev_tfidf = tfidf.fit_transform(train_area_text)
test_area_text = test_feat['areaunique'].apply(_areas_to_text)
test_dev_tfidf = tfidf.transform(test_area_text)

将上述得到的稀疏矩阵转换为稠密的 ndarray，并做成 DataFrame 的格式（列名为 TF-IDF 的词项）

In [20]:
# Densify the TF-IDF matrices and label the columns with the fitted terms.
# NOTE: this rebinds both names from matrix to DataFrame, so the cell cannot
# be re-run on its own (a DataFrame has no .toarray()).
tfidf_terms = tfidf.get_feature_names_out()
train_dev_tfidf = pd.DataFrame(train_dev_tfidf.toarray(), columns=tfidf_terms)
test_dev_tfidf = pd.DataFrame(test_dev_tfidf.toarray(), columns=tfidf_terms)

同样，将文本数据拼接到feature表中，形成一张包含所有数据的表

In [21]:
# Put the TF-IDF columns in front of the existing feature columns.
# concat aligns rows on the index — presumably both frames carry the same
# default 0..n-1 index here; verify if the upstream cells change.
train_feat = pd.concat((train_dev_tfidf, train_feat), axis='columns')
test_feat = pd.concat((test_dev_tfidf, test_feat), axis='columns')

### 算法模型
主要训练了决策树相关模型对预测数据进行分类。
运用了三个模型算法：随机森林，XGBoost以及LightGBM。

#### 随机森林模型

In [22]:
# Features: every column except the identifier, the target and the raw area arrays.
rf_train_X = train_feat.drop(['uid', 'label', 'areaunique'], axis=1)

# Predictions are written back to test_cus by position, so both frames must
# have the same number of rows.
if len(test_feat) != len(test_cus):
    raise ValueError('test_feat and test_cus have different row counts; '
                     'predictions cannot be assigned by position')

# random_state is pinned so a fresh Restart & Run All reproduces the same forest.
# NOTE(review): class 0 is weighted 0.89 and class 1 only 0.11 — confirm this
# is the intended direction for the class imbalance.
clf = RandomForestClassifier(n_estimators=2500, class_weight={0: 0.89, 1: 0.11},
                             bootstrap=False, random_state=42)
clf.fit(rf_train_X, train_feat['label'])

# Select the test columns from the training columns so both matrices are
# guaranteed to share the same features in the same order.
test_cus['label'] = clf.predict(test_feat[rf_train_X.columns])

#### XGBoost模型

In [23]:
# Features: every column except the identifier, the target and the raw area arrays.
xgb_train_X = train_feat.drop(['uid', 'label', 'areaunique'], axis=1)

# Predictions are written back to test_cus by position, so both frames must
# have the same number of rows.
if len(test_feat) != len(test_cus):
    raise ValueError('test_feat and test_cus have different row counts; '
                     'predictions cannot be assigned by position')

# random_state is pinned so reruns are reproducible.
clf = XGBClassifier(n_estimators=2000, random_state=42)
clf.fit(xgb_train_X, train_feat['label'])

# Select the test columns from the training columns so both matrices are
# guaranteed to share the same features in the same order.
test_cus['label_xgb'] = clf.predict(test_feat[xgb_train_X.columns])

#### LightGBM模型

In [24]:
# Features: every column except the identifier, the target and the raw area arrays.
lgbm_train_X = train_feat.drop(['uid', 'label', 'areaunique'], axis=1)

# Predictions are written back to test_cus by position, so both frames must
# have the same number of rows.
if len(test_feat) != len(test_cus):
    raise ValueError('test_feat and test_cus have different row counts; '
                     'predictions cannot be assigned by position')

# random_state is pinned so reruns are reproducible.
# NOTE(review): class 0 is weighted 0.89 and class 1 only 0.11 — confirm this
# is the intended direction for the class imbalance.
clf = LGBMClassifier(n_estimators=2500, class_weight={0: 0.89, 1: 0.11}, random_state=42)
clf.fit(lgbm_train_X, train_feat['label'])

# Select the test columns from the training columns so both matrices are
# guaranteed to share the same features in the same order.
test_cus['label_lgbm'] = clf.predict(test_feat[lgbm_train_X.columns])

### 模型融合
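利用 Linear Blending 融合方式对模型进行线性加权融合：取上述三个模型的分类结果（0/1），分别赋予权重 1（随机森林）、1.4（LightGBM）和 0.6（XGBoost）并求和，加权和不小于 0.6 时预测为 1，否则为 0，形成最终的预测结果。由于最小的权重恰好等于阈值，只要有任意一个模型预测为 1，最终结果即为 1。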

In [25]:
print(test_cus)

# Weighted vote over the three hard 0/1 predictions, computed for however many
# rows test_cus has (no hard-coded row count). Weights: random forest 1,
# LightGBM 1.4, XGBoost 0.6.
blend_score = (1 * test_cus['label'].astype(int)
               + 1.4 * test_cus['label_lgbm'].astype(int)
               + 0.6 * test_cus['label_xgb'].astype(int))

# A row is labelled 1 when its weighted vote reaches 0.6. Because the smallest
# weight equals the threshold, a single positive vote is already enough.
ans = (blend_score >= 0.6).astype(int).to_numpy()

# Assigned by position: ans_cus was read from the same file as test_cus.
ans_cus['label'] = ans

                                  uid  label  label_xgb  label_lgbm
0    002026a27f38d9a3203eef2b8b7142fc      0          0           0
1    017b9b51c033453de17d8aeab2de99c1      0          0           0
2    030be6e9263ecfaca79598bd411eafd3      0          0           0
3    0382f49c67a49252fd7baf7699cb2a6a      0          0           0
4    050581a0e3a1593626c533a9e2f1577d      0          0           1
..                                ...    ...        ...         ...
262  f7f0d2d16e25cdfb09cac75d320c057d      0          0           0
263  fa23bd332abee9a88e2d9a012083d50b      0          0           0
264  fc3a5fcf2c41ac7e702a72a60c8c7deb      0          0           0
265  fdfd6981bee55bb7ad89156e3e558a2c      0          0           0
266  fed0db6a42c31cf9727ee372d0198921      0          0           0

[267 rows x 4 columns]


#### 将预测结果存储到csv格式文件中，形成结果文件

In [26]:
# Write the submission file without the DataFrame index column.
ans_cus.to_csv('submit_file.csv', index=False)