In [297]:
import pandas as pd
import numpy as np
from sklearn import tree
import jieba
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [298]:
dataset = pd.read_csv("train.csv",encoding="utf-8",sep=",",delimiter="\t")
dataset.head()

Unnamed: 0,target,text,stance
0,开放二胎,刚回家几天就迫不及待的赶到了小舅家，看着乖巧懂事的表妹和可爱的小表弟，心情格外舒畅！这个画面...,FAVOR
1,俄罗斯在叙利亚的反恐行动,俄罗斯就是流氓,AGAINST
2,春节放鞭炮,#春节放鞭炮#【中央气象台首次发布烟花爆竹燃放气象指数】明天就是除夕了，年味越发浓郁。今早，...,AGAINST
3,IphoneSE,iPhoneSE貌似摄像头不外突了，普天同庆,FAVOR
4,春节放鞭炮,千万人口级城市北京，原住民在庆祝第一大节日春节，大家伙都心照不宣的想到：别再给伤痕累累的家乡...,AGAINST


In [299]:
def stopwordslist():
    stopwords = [line.strip() for line in open('cn_stopwords.txt',encoding='UTF-8').readlines()]
    stopwords.extend([line.strip() for line in open('scu_stopwords.txt',encoding='UTF-8').readlines()])
    stopwords.extend([line.strip() for line in open('baidu_stopwords.txt',encoding='UTF-8').readlines()])
    stopwords.extend([line.strip() for line in open('hit_stopwords.txt',encoding='UTF-8').readlines()])
    return stopwords
        
def seg_depart(sentence):
     # 对文档中的每一行进行中文分词
    sentence_depart = jieba.cut(sentence.strip())
     # 创建一个停用词列表
    stopwords = stopwordslist()
    # 输出结果为outstr
    outstr = ''
   # 去停用词
    for word in sentence_depart:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr        
        

In [300]:
dataset["target"].unique()

array(['开放二胎', '俄罗斯在叙利亚的反恐行动', '春节放鞭炮', 'IphoneSE', '深圳禁摩限电'],
      dtype=object)

In [301]:
dataset["target"] = dataset["target"].map({'开放二胎':0,'俄罗斯在叙利亚的反恐行动':1,'春节放鞭炮':2,'IphoneSE':3, '深圳禁摩限电':4})

In [302]:
dataset["stance"] = dataset["stance"].map({"FAVOR":0,"AGAINST":1,"NONE":2})
dataset["cut"] = dataset["text"].apply(seg_depart)

In [303]:
testset = pd.read_csv("test.csv",delimiter="\t")
testset.head()

Unnamed: 0,target,text
0,IphoneSE,讲真，对iphoneSE很心动，但是又很期待iphone7，心塞
1,春节放鞭炮,传统春节来临传统的拜神，放鞭炮，烧纸，热闹呢
2,俄罗斯在叙利亚的反恐行动,俄罗斯在战争状态下的紧急动员能力，这不是土耳其可以低估的。
3,深圳禁摩限电,珠海要是有这么高强度，市区就不会有那么多摩托车横冲直撞了
4,深圳禁摩限电,#深圳禁摩限电# 早该整了，快递开电车，真把马路当成他们家开的一样...不过他们工作压力大，...


In [304]:
testset["target"] = testset["target"].map({'开放二胎':0,'俄罗斯在叙利亚的反恐行动':1,'春节放鞭炮':2,'IphoneSE':3, '深圳禁摩限电':4})

In [305]:
testset["cut"] = testset["text"].apply(seg_depart)

In [306]:
x_0 = dataset[dataset["target"]==0]
x_1 = dataset[dataset["target"]==1]
x_2 = dataset[dataset["target"]==2]
x_3 = dataset[dataset["target"]==3]
x_4 = dataset[dataset["target"]==4]

In [307]:
x_0

Unnamed: 0,target,text,stance,cut
0,0,刚回家几天就迫不及待的赶到了小舅家，看着乖巧懂事的表妹和可爱的小表弟，心情格外舒畅！这个画面...,0,刚 回家 几天 迫不及待 赶到 小舅 家 看着 乖巧 懂事 表妹 可爱 小表弟 心情 舒畅 ...
9,0,#姚晨怀二胎#恭喜，挑对了时间，又省了罚款。,0,姚晨怀 二胎 恭喜 挑对 时间 省 罚款
15,0,全面开放二胎政策总让人觉得“得不偿失”。优秀的因为这个政策被挤压了发展空间于是愤而起义坚决不...,1,开放 二胎 政策 总让 得不偿失 优秀 政策 挤压 发展 空间 愤而 起义 不生 不养 少生...
17,0,不等到歌会结束啦~俺去补作业了，还是感谢奇然让我一本满足呐！最后的最后的最后，祝二胎茁壮成长...,2,歌会 结束 ~ 补 作业 感谢 奇然 一本 呐 祝 二胎 茁壮成长 ฅ ω ฅ 嘿嘿嘿 ~ ...
19,0,国际知名咨询公司凯度发布研究称，此前，婴儿奶粉业内预估，受二胎政策推动，2015年~2018...,2,国际 知名 咨询 公司 凯度 发布 研究 称 此前 婴儿 奶粉 业内 预估 受 二胎 政策 ...
...,...,...,...,...
2378,0,很多人都在释放什么北上广的房子合起来可以把美国买下之类的危机大预言术，用以佐证中国房价高。那...,0,释放 北上 广 房子 合 美国 买下 危机 预言 术 用以 佐证 中国 房价 高 中国 人口...
2383,0,发表了博文《试管婴儿二胎》您是否也在纠结要二胎？担心二胎成功率，费用？柒月健康小奎详细的开一...,0,发表 博文 试管婴儿 二胎 纠结 二胎 担心 二胎 成功率 费用 柒月 健康 小奎 详细 开...
2384,0,连续三天奔松江，在成就感满满的今天画上句号。七点出发去给本科小朋友上形势政策课，4个小时，讲...,0,连续 三天 奔 松江 成就感 满满的 画上 句号 七点 出发 本科 小朋友 形势 政策 课 ...
2387,0,口味独特不反对！男的都跟男的搞女的都跟女的搞了！怪不得开放二胎,0,口味 独特 反对 男 男 搞 女 女 搞 开放 二胎


In [308]:
y_0 = testset[testset["target"]==0]
y_1 = testset[testset["target"]==1]
y_2 = testset[testset["target"]==2]
y_3 = testset[testset["target"]==3]
y_4 = testset[testset["target"]==4]

In [309]:
y_0

Unnamed: 0,target,text,cut
5,0,#和颐酒店女生遇袭# 我真的对TC有点失望了，开放二胎号召女性回归家庭，这些都是潜意识的洗脑...,颐 酒店 女生 遇袭 真的 TC 失望 开放 二胎 号召 女性 回归 家庭 潜意识 洗脑...
6,0,【北京交通委：油价大跌和网络约车加剧道路拥堵】北京市交通委主任周正宇今天说，八项规定出台后，...,北京 交通委 油价 大跌 网络 约车 加剧 道路 拥堵 北京市 交通委 主任 周正 宇 说 ...
15,0,【女孩遭遇准婆婆奇葩要求 考上研究生才能结婚】准婆婆要求一对90后小情侣写下保证书，保证二人...,女孩 遭遇 准 婆婆 奇葩 考上 研究生 结婚 准 婆婆 一对 90 情侣 写下 保证书...
32,0,虽然开放二胎我觉得是个好事，但是能别再特么煽情说独生子女孤独了吗？老子一点也不孤独啊，就特么...,开放 二胎 好事 能别 特 煽情 说 独生子女 孤独 老子 一点 孤独 特 代表
38,0,4月24日，姚晨在电影《梦想合伙人》首映礼上，宣布怀二胎，十一月将生下二宝，给小土豆添一个小...,月 24 日 姚晨 电影 梦想 合伙人 首映礼 怀 二胎 十一月 将生 下二宝 小土豆 添 ...
...,...,...,...
571,0,【每日话题】随着“二胎政策”开放，生宝宝扎堆。为避免员工集中怀孕，吉林长春一单位定新规：想要...,每日 话题 二胎 政策 开放 生 宝宝 扎堆 员工 怀孕 吉林长春 单位 定新规 想要 孩子...
581,0,办公室瑕姐刚生完小布丁，最近办公室天天聊育儿话题。没想到今天早上老大来超平静的宣布她怀上二胎...,办公室 瑕姐 刚生 完小 布丁 办公室 天天 聊 育儿 话题 没想到 早上 老大 来超 平静...
583,0,#90后断崖式减少#【震惊！中国90后数量现“断崖式减少”，已经成珍惜物种?】“少壮不养孩，...,90 断崖 式 减少 震惊 中国 90 数量 现 断崖 式 减少 成 珍惜 物种 少壮 不养...
587,0,开放二胎后舆论导向越来越没谱了。刚中央七居然在播江西的添丁仪式，杀公鸡，祈求添丁。周围抱着的...,开放 二胎 舆论导向 越来越 没谱 刚 中央 七 在播 江西 添丁 仪式 杀 公鸡 祈求 添...


In [310]:
result = {}

###### 开放二胎 0

In [311]:
X_0 = x_0.drop("stance",axis=1)
Y_0 = x_0["stance"]

In [312]:
def tf_idf(contents):
    # 提取文本特征tf-idf
    vectorizer = CountVectorizer(min_df=1e-5,max_features=1400)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(contents))
    return tfidf

In [313]:
tf_idf(X_0["cut"])

<493x1400 sparse matrix of type '<class 'numpy.float64'>'
	with 6248 stored elements in Compressed Sparse Row format>

In [314]:
x_train, x_test, y_train, y_test = train_test_split(tf_idf(X_0["cut"]),Y_0,random_state=1,test_size =0.2)

In [315]:
x_train.shape

(394, 1400)

贝叶斯

In [316]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(0,fit_prior=False)
for i in np.arange(0.0,1.1,0.1):
    classifier = MultinomialNB(i,fit_prior=True)
#模型训练
    classifier.fit(x_train, y_train)
    scores = classifier.score(x_test, y_test)
    print("alph is {0},score is {1}".format(i,scores))

alph is 0.0,score is 0.5656565656565656
alph is 0.1,score is 0.5454545454545454
alph is 0.2,score is 0.5858585858585859
alph is 0.30000000000000004,score is 0.5959595959595959
alph is 0.4,score is 0.5858585858585859
alph is 0.5,score is 0.5858585858585859
alph is 0.6000000000000001,score is 0.5858585858585859
alph is 0.7000000000000001,score is 0.5858585858585859
alph is 0.8,score is 0.6161616161616161
alph is 0.9,score is 0.6161616161616161
alph is 1.0,score is 0.6161616161616161


  'setting alpha = %.1e' % _ALPHA_MIN)


knn

In [317]:
from sklearn.neighbors import KNeighborsClassifier
for i in range(1,10):
    knc = KNeighborsClassifier(n_neighbors=i)
    knc.fit(x_train, y_train)
    y_predict = knc.predict(x_test)
    score = np.mean(y_predict == y_test)
    print("n_neighbours is {0}, score is {1}".format(i,score))

n_neighbours is 1, score is 0.494949494949495
n_neighbours is 2, score is 0.5454545454545454
n_neighbours is 3, score is 0.5050505050505051
n_neighbours is 4, score is 0.5252525252525253
n_neighbours is 5, score is 0.5555555555555556
n_neighbours is 6, score is 0.5050505050505051
n_neighbours is 7, score is 0.5050505050505051
n_neighbours is 8, score is 0.5252525252525253
n_neighbours is 9, score is 0.5151515151515151


In [318]:
from sklearn.svm import SVC
svm_model = SVC(kernel="linear")
svm_model.fit(x_train, y_train)

preds = svm_model .predict(x_test)
np.mean( preds == y_test)

0.5656565656565656

xgboost

In [319]:
from xgboost import XGBRegressor

my_model = XGBRegressor()
# Add silent=True to avoid printing out updates with each cycle
my_model.fit(x_train, y_train, verbose=False)
predictions = my_model.predict(x_test)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, y_test)))

Mean Absolute Error : 0.6136006258352839


predict


In [320]:
####读取测试集  预测
classifier = MultinomialNB(1,fit_prior=False)
classifier.fit(x_train, y_train)
res = classifier.predict(tf_idf(y_0["cut"]))

In [321]:
for count,index in enumerate(y_0["cut"].index):
    result[index] = res[count] 

### 1 俄罗斯

In [322]:
X_1 = x_1.drop("stance",axis=1)
Y_1 = x_1["stance"]

In [332]:
def tf_idf(contents):
    # 提取文本特征tf-idf
    vectorizer = CountVectorizer(min_df=1e-5,max_features=910)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(contents))
    return tfidf

In [333]:
x_train, x_test, y_train, y_test = train_test_split(tf_idf(X_1["cut"]),Y_1,random_state=1,test_size =0.2)

In [334]:
###贝叶斯
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(0,fit_prior=False)
for i in np.arange(0.0,1.1,0.1):
    classifier = MultinomialNB(i,fit_prior=True)
#模型训练
    classifier.fit(x_train, y_train)
    scores = classifier.score(x_test, y_test)
    print("alph is {0},score is {1}".format(i,scores))

alph is 0.0,score is 0.47959183673469385
alph is 0.1,score is 0.5204081632653061
alph is 0.2,score is 0.5408163265306123
alph is 0.30000000000000004,score is 0.5510204081632653
alph is 0.4,score is 0.5510204081632653
alph is 0.5,score is 0.5510204081632653
alph is 0.6000000000000001,score is 0.5408163265306123
alph is 0.7000000000000001,score is 0.5306122448979592
alph is 0.8,score is 0.5306122448979592
alph is 0.9,score is 0.5306122448979592
alph is 1.0,score is 0.5204081632653061


  'setting alpha = %.1e' % _ALPHA_MIN)


In [335]:
##knn
from sklearn.neighbors import KNeighborsClassifier
for i in range(1,10):
    knc = KNeighborsClassifier(n_neighbors=i)
    knc.fit(x_train, y_train)
    y_predict = knc.predict(x_test)
    score = np.mean(y_predict == y_test)
    print("n_neighbours is {0}, score is {1}".format(i,score))

n_neighbours is 1, score is 0.30612244897959184
n_neighbours is 2, score is 0.3673469387755102
n_neighbours is 3, score is 0.30612244897959184
n_neighbours is 4, score is 0.3979591836734694
n_neighbours is 5, score is 0.35714285714285715
n_neighbours is 6, score is 0.37755102040816324
n_neighbours is 7, score is 0.2857142857142857
n_neighbours is 8, score is 0.47959183673469385
n_neighbours is 9, score is 0.4387755102040816


In [336]:
##逻辑回归
lr_model = LogisticRegression( multi_class='multinomial', n_jobs=-1,random_state=1)
lr_model.fit(x_train, y_train)

preds = lr_model.predict(x_test)
np.mean( preds == y_test)

0.5612244897959183

In [337]:
##支持向量机
from sklearn.svm import SVC
svm_model = SVC(kernel="linear")
svm_model.fit(x_train, y_train)

preds = svm_model .predict(x_test)
np.mean( preds == y_test)

0.5714285714285714

In [338]:

lr_model = LogisticRegressionCV(solver='newton-cg', multi_class='multinomial', cv=5, n_jobs=-1)
lr_model.fit(x_train, y_train)

preds = lr_model.predict(x_test)
np.mean( preds == y_test)

0.5510204081632653

In [339]:
tf_idf(y_1["cut"]).shape

(110, 910)

In [340]:
preds = svm_model .predict(tf_idf(y_1["cut"]))

In [342]:
for count,index in enumerate(y_1["cut"].index):
    result[index] = preds[count] 

In [291]:
for count,index in enumerate(y_1["cut"].index):
    result[index] = res[count] 

In [343]:
print(len(result))
len(y_0)+len(y_1)

217

217

In [371]:
###2

X_2 = x_2.drop("stance",axis=1)
Y_2 = x_2["stance"]

def tf_idf(contents):
    # 提取文本特征tf-idf
    vectorizer = CountVectorizer(min_df=1e-5,max_features=1900)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(contents))
    return tfidf


In [373]:
x_train, x_test, y_train, y_test = train_test_split(tf_idf(X_2["cut"]),Y_2,random_state=1,test_size =0.2)

In [374]:
###贝叶斯
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(0,fit_prior=False)
for i in np.arange(0.0,1.1,0.1):
    classifier = MultinomialNB(i,fit_prior=True)
#模型训练
    classifier.fit(x_train, y_train)
    scores = classifier.score(x_test, y_test)
    print("alph is {0},score is {1}".format(i,scores))

alph is 0.0,score is 0.6404494382022472
alph is 0.1,score is 0.7303370786516854
alph is 0.2,score is 0.7415730337078652
alph is 0.30000000000000004,score is 0.7415730337078652
alph is 0.4,score is 0.7415730337078652
alph is 0.5,score is 0.7303370786516854
alph is 0.6000000000000001,score is 0.7191011235955056
alph is 0.7000000000000001,score is 0.7078651685393258
alph is 0.8,score is 0.6966292134831461
alph is 0.9,score is 0.6853932584269663
alph is 1.0,score is 0.6853932584269663


  'setting alpha = %.1e' % _ALPHA_MIN)


In [375]:
from sklearn.neighbors import KNeighborsClassifier
for i in range(1,10):
    knc = KNeighborsClassifier(n_neighbors=i)
    knc.fit(x_train, y_train)
    y_predict = knc.predict(x_test)
    score = np.mean(y_predict == y_test)
    print("n_neighbours is {0}, score is {1}".format(i,score))

n_neighbours is 1, score is 0.5955056179775281
n_neighbours is 2, score is 0.5617977528089888
n_neighbours is 3, score is 0.6179775280898876
n_neighbours is 4, score is 0.5955056179775281
n_neighbours is 5, score is 0.5955056179775281
n_neighbours is 6, score is 0.6292134831460674
n_neighbours is 7, score is 0.6853932584269663
n_neighbours is 8, score is 0.6741573033707865
n_neighbours is 9, score is 0.6741573033707865


In [376]:
##逻辑回归
lr_model = LogisticRegression( multi_class='multinomial', n_jobs=-1,random_state=1)
lr_model.fit(x_train, y_train)

preds = lr_model.predict(x_test)
np.mean( preds == y_test)

0.7191011235955056

In [377]:
##支持向量机
from sklearn.svm import SVC
svm_model = SVC(kernel="linear")
svm_model.fit(x_train, y_train)

preds = svm_model .predict(x_test)
np.mean( preds == y_test)

0.7415730337078652

In [378]:

lr_model = LogisticRegressionCV(solver='newton-cg', multi_class='multinomial', cv=5, n_jobs=-1)
lr_model.fit(x_train, y_train)

preds = lr_model.predict(x_test)
np.mean( preds == y_test)

0.7303370786516854

In [379]:
tf_idf(y_2["cut"]).shape

(155, 1900)

In [382]:
preds = svm_model .predict(tf_idf(y_2["cut"]))

In [383]:
for count,index in enumerate(y_2["cut"].index):
    result[index] = preds[count] 

In [384]:
print(len(result))
len(y_0)+len(y_1)+len(y_2)

372


372

### 朴素贝叶斯

In [51]:
for i in np.arange(0.0,1.1,0.1):
    classifier = MultinomialNB(i,fit_prior=False)
#模型训练
    classifier.fit(vectorizer.transform(x_train["cut"] ) + vectorizer.transform(fenci(x_train['target'])), y_train)
    scores = classifier.score(vectorizer.transform(x_test["cut"])+vectorizer.transform(fenci(x_test['target'])), y_test)
    print("alph is {0},score is {1}".format(i,scores))

  'setting alpha = %.1e' % _ALPHA_MIN)


alph is 0.0,score is 0.55
alph is 0.1,score is 0.5770833333333333
alph is 0.2,score is 0.5833333333333334
alph is 0.30000000000000004,score is 0.5833333333333334
alph is 0.4,score is 0.5854166666666667
alph is 0.5,score is 0.5875
alph is 0.6000000000000001,score is 0.5895833333333333
alph is 0.7000000000000001,score is 0.5854166666666667
alph is 0.8,score is 0.58125
alph is 0.9,score is 0.5854166666666667
alph is 1.0,score is 0.5854166666666667


In [52]:
for i in np.arange(0.0,1.1,0.1):
    classifier = MultinomialNB(i,fit_prior=True)
#模型训练
    classifier.fit(vectorizer.transform(x_train["cut"] ) + vectorizer.transform(fenci(x_train['target'])), y_train)
    scores = classifier.score(vectorizer.transform(x_test["cut"])+vectorizer.transform(fenci(x_test['target'])), y_test)
    print("alph is {0},score is {1}".format(i,scores))

  'setting alpha = %.1e' % _ALPHA_MIN)


alph is 0.0,score is 0.55
alph is 0.1,score is 0.5833333333333334
alph is 0.2,score is 0.5833333333333334
alph is 0.30000000000000004,score is 0.5833333333333334
alph is 0.4,score is 0.5833333333333334
alph is 0.5,score is 0.5791666666666667
alph is 0.6000000000000001,score is 0.5854166666666667
alph is 0.7000000000000001,score is 0.5833333333333334
alph is 0.8,score is 0.5875
alph is 0.9,score is 0.58125
alph is 1.0,score is 0.5791666666666667


In [19]:
classifier = MultinomialNB(1,fit_prior=False)
#模型训练
classifier.fit(vectorizer.transform(x_train["cut"] ) + vectorizer.transform(fenci(x_train['target'])), y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=False)

In [20]:
predicted = classifier.predict(vectorizer.transform(x_test["cut"] ) + vectorizer.transform(fenci(x_test['target'])))

In [21]:
test = pd.read_csv("test.csv",delimiter="\t")

In [54]:
test["target"].unique()

array(['IphoneSE', '春节放鞭炮', '俄罗斯在叙利亚的反恐行动', '深圳禁摩限电', '开放二胎'],
      dtype=object)

In [23]:
test["cut"] = test["text"].apply(seg_depart)

In [24]:
test_input  = vectorizer.transform(test["cut"] ) + vectorizer.transform(fenci(test['target']))

In [25]:
import csv

In [26]:
#['FAVOR', 'AGAINST', 'NONE']

In [27]:
with open('key.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, dialect='excel')
    for index,res in enumerate(classifier.predict(test_input)):
        fin = ""
        if res == 0:
            fin = "FAVOR"
        if res == 1:
            fin = 'AGAINST'
        if res == 2:
            fin = 'NONE'
        writer.writerow([index,fin])





    
    