In [1]:
'''
Data analysis
'''
import pandas as pd


# Load the training and test splits from the local twitter dataset folder.
train_data = pd.read_csv(filepath_or_buffer='../Datasets/twitter/train.csv')
test_data = pd.read_csv(filepath_or_buffer='../Datasets/twitter/test.csv')

In [2]:
# Print a summary of the training frame: columns, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
# Print a summary of the test frame: columns, non-null counts and dtypes.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [4]:
'''
Data preprocessing
'''
# Pull the label column out of the training frame to use as the target.
y_train = train_data.loc[:, 'target']

In [5]:
# Replace every missing value in both frames with the placeholder 'UNK'.
train_data, test_data = (
    frame.fillna('UNK') for frame in (train_data, test_data)
)

In [6]:
from sklearn.preprocessing import OneHotEncoder


# One-hot encode the categorical `location` column.
# handle_unknown='ignore' lets categories that were not seen during fit pass
# through transform (encoded as all zeros) instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

# Densify with .toarray() so the result is a plain ndarray. .todense() would
# give an np.matrix, which NumPy discourages and which recent scikit-learn
# releases refuse as estimator input.
loc_X_train = ohe.fit_transform(train_data[['location']].values).toarray()
loc_X_test = ohe.transform(test_data[['location']].values).toarray()

In [7]:
from sklearn.feature_extraction.text import CountVectorizer


# Bag-of-words encoding of the two free-text columns.
# Each column gets its own vectorizer so both fitted vocabularies stay
# available after this cell runs; sharing one instance would discard the
# keyword vocabulary as soon as it is re-fitted on `text`.
kw_vec = CountVectorizer(lowercase=True, stop_words='english')
text_vec = CountVectorizer(lowercase=True, stop_words='english')

# Densify with .toarray() (plain ndarray) rather than .todense() (np.matrix),
# because recent scikit-learn releases refuse np.matrix as estimator input.
kw_X_train = kw_vec.fit_transform(train_data['keyword'].values).toarray()
kw_X_test = kw_vec.transform(test_data['keyword'].values).toarray()

text_X_train = text_vec.fit_transform(train_data['text'].values).toarray()
text_X_test = text_vec.transform(test_data['text'].values).toarray()

# Backward-compatible alias: `vec` used to end this cell fitted on `text`.
vec = text_vec

In [8]:
import numpy as np


# Join the location, keyword and text encodings column-wise (axis=1) into a
# single feature matrix per split.
# np.asarray guarantees a plain ndarray: if any input is an np.matrix the
# concatenation is an np.matrix too, which recent scikit-learn releases
# reject when fitting or predicting.
X_train = np.asarray(
    np.concatenate([loc_X_train, kw_X_train, text_X_train], axis=1)
)
X_test = np.asarray(
    np.concatenate([loc_X_test, kw_X_test, text_X_test], axis=1)
)

In [9]:
'''
Naive Bayes classifier with cross-validated hyper-parameter search.
'''
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV


# Candidate values for the `alpha` hyper-parameter of MultinomialNB.
parameters = {'alpha': [0.5, 0.8, 1.0]}

mnb = MultinomialNB()

# Candidates are scored with F1 (scoring='f1'); n_jobs=4 runs the search
# with four parallel jobs.
clf = GridSearchCV(mnb, parameters, scoring='f1', n_jobs=4)

clf.fit(X_train, y_train)

print('最优超参数设定为：%s' %clf.best_params_)
# best_score_ is reported in the metric chosen via `scoring`, i.e. F1 here,
# so the label says F1 rather than accuracy.
print('交叉验证得到的最佳F1分数为：%f' %clf.best_score_)

最优超参数设定为：{'alpha': 1.0}
交叉验证得到的最佳准确率为：0.620542


In [10]:
'''
Predict the test-set labels with the best model found by the search.
'''
y_predict = clf.predict(X_test)

# Pair each test id with its predicted label and write the submission file
# without the DataFrame index column.
submission = pd.DataFrame(data={'id': test_data['id'], 'target': y_predict})
submission.to_csv(
    path_or_buf='../Kaggle_submissions/twitter_submission.csv',
    index=False,
)