In [1]:
import random
import fasttext
import psutil
import os
from joblib import dump,load
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import random

## 查看内存

In [2]:
info = psutil.virtual_memory()
print (u'内存使用：',psutil.Process(os.getpid()).memory_info().rss)
print (u'剩余内存：',info.total-psutil.Process(os.getpid()).memory_info().rss)

内存使用： 126476288
剩余内存： 8141082624


### 加载数据

In [12]:
train = pd.read_csv('train1.csv',index_col=0)
train.head(3)

Unnamed: 0,ID,Age,Gender,Education,Query_List
0,22DD920316420BE2DF8D6EE651BA174B,1,1,4,柔和 双沟 女生 中财网 首页 财经 pan 周公 解梦 大全 查询 2345 曹云金 ...
1,43CC3AF5A8D6430A3B572337A889AFE4,2,1,3,广州 厨宝 烤箱 世情 薄 人情 恶 雨送 黄昏 花易落 晓 风干 泪痕 厦门 酒店用品 批...
2,E97654BFF5570E2CCD433EA6128EAC19,4,1,0,钻石 之泪 耳机 盘锦 沈阳 旅顺 公交 辽宁 阜新 车牌 baidu k715 k716 ...


### 构造格式化文本  
需要将文本转化为：```__label__2 , birchas chaim yeshiva birchas chaim ...```

In [47]:
def preprocess_text0(content_lines, sentences, category):
    for line in content_lines:
        sentences.append("__label__"+str(category)+" , "+ line)    
def preprocess_text(name,data):
    sentences = []
    for i in set(data.loc[:,name].values):
        preprocess_text0(data[data[name]==i].loc[:,'Query_List'].tolist(), sentences, i)
    random.shuffle(sentences)
    return sentences

In [33]:
def save_data(path,sentences):
    # 将数据保存到txt中
    print("writing data to fasttext format...")
    out = open(path, 'w', encoding='utf-8')
    for sentence in sentences:
        out.write(sentence+"\n")
    print("done!")

In [52]:
# 转化年龄数据
train_set, test_set = train_test_split(train[train['Age']>0],\
                                       stratify=train[train['Age']>0].Age,\
                                       random_state=42)
#训练数据
sentences = preprocess_text('Age',train_set)
# 写txt文件
path = 'age_train.txt'
save_data(path,sentences)
# 测试数据
sentences = preprocess_text('Age',test_set)
# 写txt文件
path = 'age_test.txt'
save_data(path,sentences)

writing data to fasttext format...
done!
writing data to fasttext format...
done!


In [53]:
# 转化性别数据
train_set, test_set = train_test_split(train[train['Gender']>0],\
                                       stratify=train[train['Gender']>0].Gender,\
                                       random_state=42)
#训练数据
sentences = preprocess_text('Gender',train_set)
# 写txt文件
path = 'gender_train.txt'
save_data(path,sentences)
# 测试数据
sentences = preprocess_text('Gender',test_set)
# 写txt文件
path = 'gender_test.txt'
save_data(path,sentences)

writing data to fasttext format...
done!
writing data to fasttext format...
done!


In [54]:
# 转化教育程度数据
train_set, test_set = train_test_split(train[train['Education']>0],\
                                       stratify=train[train['Education']>0].Education,\
                                       random_state=42)
#训练数据
sentences = preprocess_text('Education',train_set)
# 写txt文件
path = 'education_train.txt'
save_data(path,sentences)
# 测试数据
sentences = preprocess_text('Education',test_set)
# 写txt文件
path = 'education_test.txt'
save_data(path,sentences)

writing data to fasttext format...
done!
writing data to fasttext format...
done!


## 模型训练

In [2]:
# 年龄
age_classifier = fasttext.train_supervised(input='age_train.txt', dim=100, epoch=10,
                                         lr=0.1, wordNgrams=2, loss='softmax')
age_classifier.save_model('age_classifier.model')

In [3]:
# 性别
gender_classifier = fasttext.train_supervised(input='gender_train.txt', dim=100, epoch=10,
                                         lr=0.1, wordNgrams=2, loss='softmax')
gender_classifier.save_model('gender_classifier.model')

In [5]:
# 教育程度
education_classifier = fasttext.train_supervised(input='Education_train.txt', dim=100, epoch=10,
                                         lr=0.1, wordNgrams=2, loss='softmax')
education_classifier.save_model('education_classifier.model')

In [7]:
#模型检验
s_a = age_classifier.test('age_test.txt')
print('age_P@1:', s_a[1])
print('age_R@1:', s_a[2])
print('Number of age_examples:', s_a[0])
s_g = gender_classifier.test('gender_test.txt')
print('gender_P@1:', s_g[1])
print('gender_R@1:', s_g[2])
print('Number of gender_examples:', s_g[0])
s_e = education_classifier.test('education_test.txt')
print('education_P@1:', s_e[1])
print('education_R@1:', s_e[2])
print('Number of education_examples:', s_e[0])

age_P@1: 0.5748454279205988
age_R@1: 0.5748454279205988
Number of age_examples: 24584
gender_P@1: 0.8038999264164827
gender_R@1: 0.8038999264164827
Number of gender_examples: 24462
education_P@1: 0.6089506172839506
education_R@1: 0.6089506172839506
Number of education_examples: 22680


In [10]:
np.mean([s_g[1],s_a[1],s_e[1]])

0.6625653238736774