In [1]:
import jieba
import re
import copy
from datetime import datetime
from pymongo import MongoClient
import pprint
import jieba.posseg as pseg

In [2]:
### stopword list
# Read inside a context manager so the file handle is closed as soon as the
# list is built (a bare open(...).readlines() leaves the handle open).
# NOTE(review): no explicit encoding is passed, so the platform default is
# used for both text files — confirm the encoding the files are stored in.
with open('stopwords.txt', "r") as f:
    stopwords_list = [line.strip() for line in f]

### syn dict
# Each line of syn.txt is tab-separated: the first field is the replacement
# word and every following field is a word that maps to it.
syn_dict = {}
with open("syn.txt", "r") as f:
    for line in f:
        fields = line.strip("\n").split("\t")  # split once per line, not once per word
        for word in fields[1:]:
            syn_dict[word] = fields[0]

### Chinese word-segmentation dictionary
jieba.set_dictionary("dict.txt")

In [3]:
def stopwords(w):
    """Filter a token against the module-level ``stopwords_list``.

    Returns ``w`` unchanged when it is not contained in ``stopwords_list``,
    and ``None`` when it is.
    """
    return None if w in stopwords_list else w

def syn(w):
    """Replace a token through the module-level ``syn_dict``.

    Returns ``syn_dict[w]`` when ``w`` is a key of ``syn_dict``; otherwise
    returns ``w`` unchanged.
    """
    return syn_dict.get(w, w)

def cut(news):
    """Segment ``news`` with ``jieba.cut``.

    The call is made with ``cut_all=False`` and ``HMM=True``; whatever
    ``jieba.cut`` returns (an iterable of tokens, as consumed by
    ``text_cleaning``) is handed back unchanged.
    """
    return jieba.cut(news, cut_all=False, HMM=True)

def regular(w):
    """Extract the runs of characters in the range U+4E00..U+9FA5 from ``w``.

    Returns the list of matched runs in order of appearance, or ``None``
    when ``w`` contains no such character.
    """
    chinese_runs = re.findall('[\u4e00-\u9fa5]+', w)
    return chinese_runs if chinese_runs else None

In [4]:
def text_cleaning(paragraph):
    """Segment ``paragraph`` and return the surviving tokens joined by spaces.

    Every token produced by ``cut`` is passed through three steps:
    ``regular`` (extract Chinese runs), ``stopwords`` (drop stop words) and
    ``syn`` (synonym replacement). Tokens rejected by any step are skipped.
    """
    kept_tokens = []
    for token in cut(paragraph):
        if len(token) < 1:
            continue
        chinese_runs = regular(token)
        if chinese_runs is None:
            continue
        # Only the first Chinese run of a token is used; later runs inside the
        # same token are ignored.
        # NOTE(review): confirm this is intended for tokens mixing Chinese
        # and non-Chinese characters.
        candidate = stopwords(chinese_runs[0])
        if candidate is None:
            continue
        kept_tokens.append(syn(candidate))
    return " ".join(kept_tokens)

In [5]:
def cleaned_news(news_list):
    """Return cleaned deep copies of the items that carry a ``'message'`` key.

    ``news_list`` is deep-copied first, so the caller's objects are never
    modified. Items without a ``'message'`` key are left out of the result;
    for the others, ``item["message"]`` is replaced by
    ``text_cleaning(item["message"])``.
    """
    snapshot = copy.deepcopy(news_list)
    with_message = [item for item in snapshot if 'message' in item]
    for item in with_message:
        item["message"] = text_cleaning(item["message"])
    return with_message

# data

In [1]:
# Read the whole CSV into one string; `r` is parsed again further down to
# extract the labels.
with open("sentiment.csv", "r") as f:
    r = f.read()

# Keep the first comma-separated field of every line, skipping lines whose
# first field is empty.
first_fields = (row.split(",")[0] for row in r.split("\n"))
data = [field for field in first_fields if len(field) != 0]
len(data)

14712

In [2]:
from sklearn.feature_extraction.text  import  CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

# Fixed seed so the train/test split — and every accuracy printed from it —
# is the same after a kernel restart.
RANDOM_STATE = 42

### X data (message)
# token_pattern matches runs of one or more word characters, so
# single-character tokens are kept as features.
vectorizer = CountVectorizer(min_df=1, token_pattern='(?u)\\b\\w+\\b')
X = vectorizer.fit_transform(data)
# Convert the vectorizer output to a dense array before splitting.
X = X.toarray()

## y data (sentiment)
# Second comma-separated field of every line, using the same "first field is
# non-empty" filter as `data` so that X and y stay aligned row by row.
y = [m.split(",")[1] for m in r.split("\n") if len(m.split(",")[0]) != 0]
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE
)

# Naive Bayes

In [8]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes trained on the training split of the count features.
clf1 = MultinomialNB()
clf1.fit(X_train, y_train);

In [9]:
from sklearn.metrics import accuracy_score

# Accuracy of the naive Bayes model on the data it was trained on.
prediction = clf1.predict(X_train)
# accuracy_score is documented as accuracy_score(y_true, y_pred); pass the
# ground truth first to follow that signature.
print('train_Accuracy: %.2f' % accuracy_score(y_train, prediction))

train_Accuracy: 0.98


In [10]:
# Accuracy of the naive Bayes model on the held-out split.
prediction_test = clf1.predict(X_test)
# accuracy_score is documented as accuracy_score(y_true, y_pred); pass the
# ground truth first to follow that signature.
print('test_Accuracy: %.2f' % accuracy_score(y_test, prediction_test))

test_Accuracy: 0.82


In [11]:
# Encode one message with the fitted vocabulary, then print the predicted
# label followed by the per-class probabilities.
# NOTE(review): the string is vectorized as-is, without text_cleaning —
# confirm the training messages were prepared the same way.
sample_counts = vectorizer.transform(["""你好"""])
input_message = sample_counts.toarray()

for scorer in (clf1.predict, clf1.predict_proba):
    print(scorer(input_message))

['0']
[[ 0.56869983  0.05407292  0.37722725]]


# Standardization

In [3]:
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)



# Logistic regression

In [13]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default settings, trained on the
# standardized training features.
classifier = LogisticRegression()  
# Left as the cell's last expression so the notebook displays the fitted
# estimator's parameters.
classifier.fit(X_train_std, y_train)  

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
from sklearn.metrics import accuracy_score

# Score the logistic-regression model on the standardized train and test splits.
pred = classifier.predict(X_train_std)
pred_test = classifier.predict(X_test_std)
# accuracy_score is documented as accuracy_score(y_true, y_pred); pass the
# ground truth first to follow that signature.
print('train_Accuracy: %.2f' % accuracy_score(y_train, pred))
print('test_Accuracy: %.2f' % accuracy_score(y_test, pred_test))

train_Accuracy: 1.00
test_Accuracy: 0.83


# PCA

In [5]:
from sklearn.decomposition import PCA

# Learn a 3-component projection from the standardized training features.
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_std)
# Project the test split with the components learned from the training split.
# Calling fit_transform here would refit PCA on the test data, so train and
# test would end up in different coordinate systems.
X_test_pca = pca.transform(X_test_std)

# SVM

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier trained on the standardized features.
# NOTE(review): X_train_pca / X_test_pca from the PCA cell are not used here —
# confirm whether the SVM was meant to be trained on the PCA projection.
svm = SVC(kernel='rbf', C=1.0, random_state=0, gamma=1)
svm.fit(X_train_std, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Score the SVM on the standardized train and test splits.
pred = svm.predict(X_train_std)
pred_test = svm.predict(X_test_std)
# accuracy_score is documented as accuracy_score(y_true, y_pred); pass the
# ground truth first to follow that signature.
print('train_Accuracy: %.2f' % accuracy_score(y_train, pred))
print('test_Accuracy: %.2f' % accuracy_score(y_test, pred_test))