In [7]:
import gensim
import MeCab
import sys
import numpy as np
import pandas as pd
import codecs
import math
from sklearn.svm import SVC

In [8]:
def pre_process(train_file):
    """Load the labelled training CSV and split it into inputs and labels.

    The file is decoded as UTF-8 with undecodable bytes skipped, parsed as
    comma-separated text, and every row that has a missing value in any
    column is discarded.

    Args:
        train_file: Path to a CSV file containing at least the columns
            "title" and "target".

    Returns:
        A ``(titles, targets)`` tuple of NumPy arrays holding the "title" and
        "target" columns of the rows that survived the missing-value filter.
    """
    # errors="ignore" drops bytes that are not valid UTF-8 rather than raising
    # (the original author noted "Shift-JIS" as an alternative encoding).
    with codecs.open(train_file, mode="r", encoding="UTF-8", errors="ignore") as handle:
        parsed = pd.read_table(handle, delimiter=",")

    complete_rows = parsed.dropna()
    titles = complete_rows["title"].values
    targets = complete_rows["target"].values
    return titles, targets

In [9]:
def vectorize_data(input_data, model_gensim=None, tagger=None):
    """Turn each sentence into one fixed-length vector by averaging word vectors.

    Every sentence is split into tokens with MeCab (wakati output), the
    embedding of each token known to the model is summed, and the sum is
    divided by the total number of tokens in the sentence. Tokens missing
    from the model therefore contribute a zero vector to the average, which
    matches how the vectors were computed before this function was hardened.

    Sentences that are not strings (e.g. NaN from a CSV), that tokenise to
    nothing, or whose tokens are all unknown yield an all-zero vector, so the
    result is always a rectangular 2-D array.

    Args:
        input_data: Iterable of sentences (strings).
        model_gensim: Optional pre-loaded word-vector model supporting
            ``model[word]`` (raising ``KeyError`` for unknown words) and a
            ``vector_size`` attribute. When omitted, the KeyedVectors stored
            at "models/w2v_model.bin" are loaded. Pass a model in to avoid
            reloading it on every call.
        tagger: Optional tokenizer exposing ``parse(str) -> str`` that returns
            whitespace-separated tokens. Defaults to ``MeCab.Tagger("-Owakati")``.

    Returns:
        NumPy array of shape ``(len(input_data), model_gensim.vector_size)``.
    """
    if model_gensim is None:
        model_gensim = gensim.models.KeyedVectors.load("models/w2v_model.bin")
    if tagger is None:
        tagger = MeCab.Tagger("-Owakati")

    dim = model_gensim.vector_size
    vect_data = []
    for sentence in input_data:
        # Only real strings are handed to the tokenizer; anything else is
        # treated as an empty sentence instead of crashing inside MeCab.
        parsed = tagger.parse(sentence) if isinstance(sentence, str) else None
        tokens = parsed.split() if parsed else []

        # Start from a zero vector so the result keeps its shape even when no
        # token is found in the model.
        total = np.zeros(dim, dtype=np.float32)
        for word in tokens:
            try:
                total = total + model_gensim[word]
            except KeyError:
                # Out-of-vocabulary token: skip it, but let other errors surface.
                continue

        # Guard against division by zero for sentences without tokens.
        if tokens:
            total = total / len(tokens)
        vect_data.append(total)

    # reshape keeps an empty input 2-D: (0, dim) rather than (0,).
    return np.array(vect_data).reshape(-1, dim)

In [10]:
def build_model(vector_data300, target_data):
    """Fit a support-vector classifier on the given vectors and labels.

    The classifier is created with ``gamma='auto'``, probability estimates
    enabled (so ``predict_proba`` is available), and a fixed ``random_state``
    of 43.

    Args:
        vector_data300: Feature matrix, one row per training sample.
        target_data: Labels aligned with the rows of ``vector_data300``.

    Returns:
        The fitted ``SVC`` instance.
    """
    svc_options = {"gamma": "auto", "probability": True, "random_state": 43}
    return SVC(**svc_options).fit(vector_data300, target_data)

In [17]:
def predict(test_file, svc1, threshold=0.6, result_file="data/result_0427.csv"):
    """Score every title in a CSV file and return the high-scoring ones.

    The full ranking (all titles, sorted by descending score) is written to
    ``result_file``; only rows whose score exceeds ``threshold`` are returned.

    Args:
        test_file: Path to a CSV file with a "title" column.
        svc1: Fitted classifier exposing ``predict_proba``.
        threshold: Rows are returned only when their score is strictly
            greater than this value. Defaults to 0.6.
        result_file: Destination CSV for the full ranking. Defaults to the
            path used historically, "data/result_0427.csv".

    Returns:
        DataFrame with columns "title" and "score", sorted by descending
        score and restricted to rows with ``score > threshold``.
    """
    df_test = pd.read_csv(test_file)
    # Rows without a title cannot be tokenised, so leave them out instead of
    # letting vectorisation fail on a NaN.
    test_data = df_test["title"].dropna().values

    if len(test_data) > 0:
        vect_test_data = vectorize_data(test_data)
        # The score is column index 1 of predict_proba.
        # NOTE(review): presumably that is the positive class; confirm
        # against svc1.classes_.
        scores = svc1.predict_proba(vect_test_data)[:, 1]
    else:
        # Nothing to score: skip the classifier, which rejects empty input.
        scores = np.array([], dtype=float)

    test_pred = pd.DataFrame({"title": test_data, "score": scores})
    test_pred = test_pred.sort_values("score", ascending=False)

    # Record the full ranking.
    test_pred.to_csv(result_file, mode="w", encoding="utf_8_sig")

    return test_pred[test_pred["score"] > threshold]

In [18]:
# Input CSV paths read by main(): train_file is passed to pre_process()
# (which reads its "title" and "target" columns) and test_file is passed to
# predict() (which reads its "title" column).
train_file = "data/train_target_data03010426.csv"
test_file = "data/matome_rss_20200427.csv"
    
def main():
    """Train on ``train_file``, score ``test_file``, and print the top results.

    Reads the module-level ``train_file`` and ``test_file`` paths, fits the
    classifier on the vectorised training titles, and prints at most ten
    rows of the prediction result as ``<score rounded to 4 places> <title>``.
    """
    titles, labels = pre_process(train_file)
    classifier = build_model(vectorize_data(titles), labels)
    result = predict(test_file, classifier)

    # Only the first ten rows of the result are shown, in the order returned.
    for label in result.index[:10]:
        print(result.loc[label, "score"].round(4), result.title[label])

# Run the whole train -> predict -> print pipeline when this cell executes.
main()

0.8611 アメリカ太平洋空軍初のF-35A部隊、アラスカに誕生…第354戦闘航空団！
0.8039 日銀が追加金融緩和 上限なく国債購入 政府の経済対策に連動
0.7688 【新型コロナ】インターハイ史上初の中止
0.7629 イエメン停戦を1カ月延長、サウジアラビア主導のアラブ連合軍…新型コロナ確認後も衝突続く！
0.7472 【速報】中国で甲殻類に感染する謎のウイルス蔓延開始ｗｗｗｗ
0.7427 強襲揚陸艦「アメリカ」の飛行甲板では海兵隊員らがゴルフ練習で休息！
0.7203 氷河期世代とかいう日本の闇ｗｗｗ
0.7112 米ボーイング社、ブラジルの航空機大手エンブラエルとの事業統合を中止！
0.7015 【新型コロナ】緊急事態宣言　政府内「来月6日全面解除は困難」
0.6852 地球を侵略しにきた宇宙人←こいつら


### model_gensimについて

In [17]:
# One-time setup, kept commented out: read the word2vec text-format vectors in
# "cc.ja.300.vec.gz" and save them in gensim's native format as
# "models/w2v_model.bin".
# model_gensim = gensim.models.KeyedVectors.load_word2vec_format("cc.ja.300.vec.gz",binary=False)
# model_gensim.save("models/w2v_model.bin")
### After saving model_gensim into the models folder, load it from there for use.
# model_gensim = gensim.models.KeyedVectors.load("models/w2v_model.bin")
