In [1]:
import re
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import jieba
from collections import Counter,defaultdict
import math
import json

# 数据清洗

In [5]:
dir_path = "E:/code/jupyterNotebook/统计求职就业数据分析/test_data"
file_list = os.listdir(dir_path)
data_ok = []
add_words = ['大数据','机器学习','深度学习']
for word in add_words:
    jieba.add_word(word)
#字符串清洗
def clean_string(x):
    pattern1 = r'[，。；、 . - ·]'
    x = re.sub(pattern1,' ',x)
    x = re.sub('[^\u4e00-\u9fa5 ^A-Z ^a-z]+', ' ', x)
    return x
#分词并剔除停用词
def cut(x):
    x = jieba.cut(x,cut_all=False)
    x = " ".join(x)
    result = []
    stop_words = []
    with open('./stopwords.txt',encoding='utf-8') as f:
        temp = f.readlines()
        for word in temp:
            stop_words.append(word.strip())        
    for word in x.split(' '):
        word = word.strip().lower()
        if len(word)>1 and word not in stop_words :
            result.append(word)
    return result
#导入数据
for file_path in file_list:
    data = pd.read_csv(os.path.join(dir_path,file_path),encoding='gbk')
    data_ok.append(data)
data = pd.concat(data_ok)
#data.drop('Unnamed: 11',inplace=True,axis=1)
data = data.drop_duplicates(['name','salary','experience','edu_bg','company_name'])
data = DataFrame(data.values,columns = data.columns)
#data = data[data['adapt']==1]
string_feature = data[['responsibility','demand']].apply(lambda x:x.sum(),axis=1)
string_feature = string_feature.map(clean_string)
data['string_feature'] = string_feature.map(cut)


# 抽取tf-idf词并组合形成关键词

In [6]:
##实现TF-IDF
#统计所有词的IDF
##实现所有词的词频统计
def tf(words):
    outputwords = dict(Counter(words))
    return outputwords
def idf(text_list):#输入多个文本的分词列表
    words_count = defaultdict(int)
    idf = defaultdict(float)
    sample_num = len(text_list)
    for words in text_list:
        words = set(words)
        for word in words:
            words_count[word] +=1
    #计算IDF
    for word,count in words_count.items():
        idf[word] = math.log(sample_num/count+1)
    return idf
def tf_idf(tf,idf):
    tf = Series(tf)
    idf = Series(idf)
    tf_idf = tf*idf
    return tf_idf
words =np.hstack(data['string_feature'].values)
outputwords = tf(words)#输入分词列表
idf_ = idf(data['string_feature'])
tf_idf_= tf_idf(outputwords,idf_)
#print(tf_idf_.sort_values(ascending=False)[:20])
print(len(idf_))

7446


In [350]:
words_count=defaultdict(int)
string_feature_grouped_by_key = data[['key','string_feature',]].groupby('key')
for name,group in string_feature_grouped_by_key:
    count = tf(np.hstack(group['string_feature']))
    for key,value in count.items():
        words_count[key] += value/group.shape[0]

In [351]:

idf_ = idf(data['string_feature'])
tf_idf_= tf_idf(words_count,idf_)

In [424]:
#从tf-idf前500个词中抽出合适的词
#进行外连接，并筛选出100个合适的组合词
words = pd.read_table('拓展关键词.txt')
words_merge = np.array([0,0])
for first in words['数据'].values:
    for second in words['数据'].values:
        if first != second:
            words_merge = np.vstack([words_merge,np.array([first,second])])
words_merge = words_merge[1:]
words_merge = DataFrame(words_merge)

# 使用似然比检验搭建模型
$$p(x|y)=\frac {p(xy)} {p(y)}=\frac {\prod_{i=0}^n p(x_i y)} {p(y)}$$
使用比例法定义概率
$$p(x_i|y_0) = \frac{x_i \cap y_0} {n_0}$$
$$p(x_i|y_1) = \frac{x_i \cap y_1} {n_1}$$
其中 $x_i \cap y_1$是参数y=1且词$x_i$出现的样本数,n_1是y=1的样本数.
从而可以使用对数似然比进行分类，
$$ if  {\sum log((x_i \cap y_1)+1)} - { \sum log((x_i \cap y_0)+1)} \ge log \frac {n_1}{n_0},y=1$$
在实际使用中发现，将$log \frac {n_1}{n_0}$调整为超参数$\theta$，且修改上式，并使用交叉验证进行优化，分类效果更佳
$$ if  \frac {\sum log((x_i \cap y_1)+1)}{ \sum log((x_i \cap y_0)+1)} \ge 1+\theta,y=1$$

In [7]:
#生成词矩阵
outputwords_sorted = sorted(outputwords.items(), key= lambda x : x[1], reverse=True)
word_index = defaultdict()
for index,value in enumerate(outputwords_sorted):
    word_index[value[0]] = index
#稀疏矩阵
words_matrix = np.zeros((data.shape[0],len(word_index)))
for index,words in enumerate(data.loc[:,'string_feature']):
    for word in words:
        if word in word_index.keys():
            words_matrix[index,word_index[word]]=1
#X是词矩阵,Y=1表示合适，Y=0表示不合适
X_Y= np.hstack([words_matrix,data['adapt'].values.reshape(-1,1)])

In [8]:
X_Y1 = X_Y[X_Y[:,-1]==1]
X_Y0 = X_Y[X_Y[:,-1]==0]
X_1_sum = X_Y1[:,:-1].sum(axis=0)#xi and y=1的事件发生次数
X_0_sum = X_Y0[:,:-1].sum(axis=0)#xi and y=0的事件发生次数
print(X_Y1.shape,X_Y0.shape)

(659, 7447) (309, 7447)


In [9]:
class LikehoodModel:
    def train(self,X_Y):
        X_Y1 = X_Y[X_Y[:,-1]==1]
        X_Y0 = X_Y[X_Y[:,-1]==0]
        self.X_1_sum = X_Y1[:,:-1].sum(axis=0)#xi and y=1的事件发生次数
        self.X_0_sum = X_Y0[:,:-1].sum(axis=0)#xi and y=0的事件发生次数
        
    def generate_X(self,word_count,words_ser):#word_count是词频统计结果,words是所有监督数据的词列表Series
        word_count = sorted(word_count.items(), key= lambda x : x[1], reverse=True)
        word_index = defaultdict()
        for index,value in enumerate(word_count):
            word_index[value[0]] = index
        #稀疏矩阵
        words_matrix = np.zeros((words_ser.shape[0],len(word_index)))
        for index,words in enumerate(words_ser):
            for word in words:
                if word in word_index.keys():
                    words_matrix[index,word_index[word]]=1
        return words_matrix
    def predict(self,X):
        Y_hat = np.zeros((X.shape[0],1))
        for i in range(X.shape[0]):
            loglikehood_1 = (np.log((self.X_1_sum[X[i,:]==1]+1).astype('float'))).sum()
            loglikehood_0 = (np.log((self.X_0_sum[X[i,:]==1]+1).astype('float'))).sum()
            
            if loglikehood_1/loglikehood_0>1.18:
                Y_hat[i,0] = 1
        return Y_hat


In [10]:
model = LikehoodModel()
model.train(X_Y)
y_hat = model.predict(X_Y[:,:-1])
contrast = np.hstack([y_hat,X_Y[:,-1].reshape(-1,1).astype('float')])
print(y_hat.sum())
print(len(y_hat))
print('accuracy score:{0:0.2f}'.format((contrast[:,0]==contrast[:,1]).sum()/contrast.shape[0]))

#计算
from sklearn.metrics import recall_score,precision_score
recall = recall_score(contrast[:,1].reshape(-1), contrast[:,0].reshape(-1))
precision = precision_score(contrast[:,1].reshape(-1), contrast[:,0].reshape(-1))
print('recall score: {0:0.2f}'.format(
      recall))
print('precision score: {0:0.2f}'.format(
      precision))

  if loglikehood_1/loglikehood_0>1.18:


720.0
968
accuracy score:0.91
recall score: 0.98
precision score: 0.90


In [11]:
#交叉验证
from sklearn.model_selection import KFold
kf = KFold(n_splits=10,shuffle=True,random_state=20211208)
scores = np.zeros([3,10])
i = 0
for train_index, test_index in kf.split(X_Y):
    train_XY,test_XY = X_Y[train_index],X_Y[test_index]
    model = LikehoodModel()
    model.train(train_XY)
    y_hat = model.predict(test_XY[:,:-1])
    contrast = np.hstack([y_hat,test_XY[:,-1].reshape(-1,1).astype('float')])
    scores[0,i] = ((contrast[:,0]==contrast[:,1]).sum()/contrast.shape[0])
    scores[1,i] = recall_score(contrast[:,1].reshape(-1), contrast[:,0].reshape(-1))
    scores[2,i] = precision_score(contrast[:,1].reshape(-1), contrast[:,0].reshape(-1))
    i += 1
mean_scores = scores.mean(axis=1).reshape(-1)
print('accuracy score:{0:0.2f},recall score: {1:0.2f},precision score: {2:0.2f}'.format(mean_scores[0],mean_scores[1],mean_scores[2]))

  if loglikehood_1/loglikehood_0>1.18:


accuracy score:0.81,recall score: 0.94,precision score: 0.81


In [12]:
print(scores)
print(mean_scores)

[[0.75257732 0.84536082 0.8556701  0.84536082 0.75257732 0.82474227
  0.73195876 0.78350515 0.875      0.80208333]
 [0.82539683 0.90140845 0.93055556 0.96875    0.93220339 0.94285714
  0.95081967 0.98305085 0.97183099 1.        ]
 [0.8        0.88888889 0.88157895 0.82666667 0.73333333 0.83544304
  0.71604938 0.74358974 0.87341772 0.78409091]]
[0.80688359 0.94068729 0.80830586]


In [16]:
#导入待判断数据
# destination_file='data.json'
# with open(destination_file,'r') as f:
#     info_dict = json.load(f)
#     data_test = DataFrame(info_dict).T
data_test = data_test.drop_duplicates(['id'])
data_test = DataFrame(data_test.values,columns = data_test.columns)
#data = data[data['adapt']==1]
data_test = data_test#.iloc[:2000,:]
string_feature = data_test[['responsibility','demand']].apply(lambda x:x.sum(),axis=1)
string_feature = string_feature.map(str)
string_feature = string_feature.map(clean_string)
data_test['string_feature'] = string_feature.map(cut)


TypeError: predict() takes 2 positional arguments but 4 were given

In [19]:
words =np.hstack(data_test['string_feature'].values)
model = LikehoodModel()
model.train(X_Y)
X = model.generate_X(outputwords,data_test['string_feature'])
y_hat = model.predict(X)

print(y_hat.sum())
print(X.shape[0])

  if loglikehood_1/loglikehood_0>1.18:


20540.0
25131


In [21]:
#将结果存入csv
data_test['adapt']=y_hat
data_test.to_csv('work_data.csv')