In [16]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import jieba
import re
import psutil
import os
from joblib import dump,load
from sklearn.model_selection import train_test_split


### 加载数据

In [6]:
# 加载分词模型
vec = load('vectorizer.model')
# 加载训练数据
train = pd.read_csv('train1.csv')

In [12]:
train.head(3)

Unnamed: 0,ID,Age,Gender,Education,Query_List
0,22DD920316420BE2DF8D6EE651BA174B,1,1,4,柔和 双沟 女生 中财网 首页 财经 pan 周公 解梦 大全 查询 2345 曹云金 ...
1,43CC3AF5A8D6430A3B572337A889AFE4,2,1,3,广州 厨宝 烤箱 世情 薄 人情 恶 雨送 黄昏 花易落 晓 风干 泪痕 厦门 酒店用品 批...
2,E97654BFF5570E2CCD433EA6128EAC19,4,1,0,钻石 之泪 耳机 盘锦 沈阳 旅顺 公交 辽宁 阜新 车牌 baidu k715 k716 ...


### 查看内存情况

In [40]:
info = psutil.virtual_memory()
print (u'内存使用：',psutil.Process(os.getpid()).memory_info().rss)
print (u'剩余内存：',info.total-psutil.Process(os.getpid()).memory_info().rss)

内存使用： 1504088064
剩余内存： 6763470848


### 构建贝叶斯模型

In [14]:
from sklearn.naive_bayes import MultinomialNB
age_classifier = MultinomialNB()
gender_classifier = MultinomialNB()
education_classifier = MultinomialNB()


#### 性别

In [21]:
# 切分数据
X_train,X_test,y_train,y_test = train_test_split(train[train['Gender']>0].Query_List.tolist(),\
                                                 train[train['Gender']>0].Gender.tolist())
# 训练
gender_classifier.fit(vec.transform(X_train), y_train)
# 检测效果
s_g = gender_classifier.score(vec.transform(X_test), y_test)
s_g

0.7906548933038999

####  年龄

In [23]:
# 切分数据
X_train,X_test,y_train,y_test = train_test_split(train[train['Age']>0].Query_List.tolist(),\
                                                 train[train['Age']>0].Age.tolist())
# 训练
age_classifier.fit(vec.transform(X_train), y_train)
# 检测效果
s_a = age_classifier.score(vec.transform(X_test), y_test)
s_a

0.5043524243410348

#### 学历

In [24]:
# 切分数据
X_train,X_test,y_train,y_test = train_test_split(train[train['Education']>0].Query_List.tolist(),\
                                                 train[train['Education']>0].Education.tolist())
# 训练
education_classifier.fit(vec.transform(X_train), y_train)
# 检测效果
s_e = education_classifier.score(vec.transform(X_test), y_test)
s_e

0.4829805996472663

In [27]:
np.mean([s_g,s_a,s_e])

0.5926626390974004

In [38]:
### 保存模型
path = 'NB.model'
dump((gender_classifier,age_classifier,education_classifier), path)

['NB.model']

## 逻辑回归

In [31]:
from sklearn.linear_model import LogisticRegression
age_classifier = LogisticRegression(C=1000.0, random_state=0)
gender_classifier = LogisticRegression(C=1000.0, random_state=0)
education_classifier = LogisticRegression(C=1000.0, random_state=0)

#### 性别

In [32]:
# 切分数据
X_train,X_test,y_train,y_test = train_test_split(train[train['Gender']>0].Query_List.tolist(),\
                                                 train[train['Gender']>0].Gender.tolist())
# 训练
gender_classifier.fit(vec.transform(X_train), y_train)
# 检测效果
s_g = gender_classifier.score(vec.transform(X_test), y_test)
s_g

0.7922083231134004

####  年龄

In [33]:
# 切分数据
X_train,X_test,y_train,y_test = train_test_split(train[train['Age']>0].Query_List.tolist(),\
                                                 train[train['Age']>0].Age.tolist())
# 训练
age_classifier.fit(vec.transform(X_train), y_train)
# 检测效果
s_a = age_classifier.score(vec.transform(X_test), y_test)
s_a

0.5335584119752684

#### 学历

In [34]:
# 切分数据
X_train,X_test,y_train,y_test = train_test_split(train[train['Education']>0].Query_List.tolist(),\
                                                 train[train['Education']>0].Education.tolist())
# 训练
education_classifier.fit(vec.transform(X_train), y_train)
# 检测效果
s_e = education_classifier.score(vec.transform(X_test), y_test)
s_e

0.5675925925925925

In [35]:
np.mean([s_g,s_a,s_e])

0.6311197758937538

In [37]:
### 保存模型
path = 'LC.model'
dump((gender_classifier,age_classifier,education_classifier), path)

['LC.model']

## 支持向量机

In [42]:
from sklearn.svm import SVC
age_classifier = SVC(kernel='linear', C=1.0, random_state=0)
gender_classifier = SVC(kernel='linear', C=1.0, random_state=0)
education_classifier = SVC(kernel='linear', C=1.0, random_state=0)

#### 性别

In [None]:
# 切分数据
X_train,X_test,y_train,y_test = train_test_split(train[train['Gender']>0].Query_List.tolist(),\
                                                 train[train['Gender']>0].Gender.tolist())
# 训练
gender_classifier.fit(vec.transform(X_train), y_train)
# 检测效果
s_g = gender_classifier.score(vec.transform(X_test), y_test)
s_g

####  年龄

In [33]:
# 切分数据
X_train,X_test,y_train,y_test = train_test_split(train[train['Age']>0].Query_List.tolist(),\
                                                 train[train['Age']>0].Age.tolist())
# 训练
age_classifier.fit(vec.transform(X_train), y_train)
# 检测效果
s_a = age_classifier.score(vec.transform(X_test), y_test)
s_a

0.5335584119752684

#### 学历

In [34]:
# 切分数据
X_train,X_test,y_train,y_test = train_test_split(train[train['Education']>0].Query_List.tolist(),\
                                                 train[train['Education']>0].Education.tolist())
# 训练
education_classifier.fit(vec.transform(X_train), y_train)
# 检测效果
s_e = education_classifier.score(vec.transform(X_test), y_test)
s_e

0.5675925925925925