In [106]:
import jieba
from sklearn.naive_bayes import MultinomialNB as MNB
import re
import pandas as pd
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR
from matplotlib import pyplot as plt

----

## ReadCSV

In [4]:
train_data = pd.read_csv('train.csv', lineterminator='\n') # 行分隔符 = '\n'
test_data = pd.read_csv('20190520_test.csv', lineterminator='\n')
train_data.head()

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,Negative
1,2,ya Allah meri sister Affia ki madad farma,Positive
2,3,Yeh khud chahta a is umar main shadi krna. ha...,Negative
3,4,Tc ? Apky mun xe exe alfax achy nae lgty 😒💃,Negative
4,5,Good,Positive


----

## Data Cleaning

In [5]:
train_data['label'] = train_data['label'].map({'Negative':0, 'Positive':1})

In [6]:
rmSignal = ['.', '?', '!', ':', '-', '+', '/', '"', ',']

In [26]:
def review_to_words(data):
    
    # 正则去除表情
    emoji_pattern = re.compile(u'[\U00010000-\U0010ffff]')
    data = emoji_pattern.sub(u'', data)
    
    # 正则去除标点
    fuhao_pattern = re.compile(u'\.*')
    data = fuhao_pattern.sub(u'', data)
    
    # 正则去除数字
    digit_pattern = re.compile(u'\d+')
    data = digit_pattern.sub(u'', data)
    
    # 空格拆分词语
    words = data.lower().split()
    
    # 去掉rmSignal
    meaningful_words = [w for w in words if not w in rmSignal]
    
    # 将筛分好的词合成一个字符串，并用空格分开
    words = " ".join(meaningful_words)
    return words

clean_train_reviews = []
for i in range(0, len(train_data)):
    clean_train_reviews.append(review_to_words(train_data['review'][i]))
clean_train_reviews[:5]

['jo bhi ap se tou behtar hoon',
 'ya allah meri sister affia ki madad farma',
 'yeh khud chahta a is umar main shadi krna had ogi',
 'tc apky mun xe exe alfax achy nae lgty',
 'good']

In [110]:
tfv = TfidfVectorizer()
train_data_features = tfv.fit_transform(clean_train_reviews)
train_data_features = train_data_features.toarray()
cntWords = sorted(tfv.vocabulary_, key=lambda x:x[0])
cntWords[-10:] # 十个出现频率最高的词语

['اے', 'بیٹھی', 'جیڑی', 'سی', 'ناں', 'کردی', 'کر', 'ھاں', '賭easar', '鄭h']

----

## Model Select

### Multinomial Bayes

In [111]:
model_NB = MNB()
model_NB.fit(train_data_features,train_data["label"])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [112]:
score = np.mean(cross_val_score(model_NB,train_data_features, train_data["label"], cv=20, scoring='roc_auc'))
print("多项式贝叶斯分类器20折交叉验证得分: ",score) # 0.8614076771593228

多项式贝叶斯分类器20折交叉验证得分:  0.8614076771593228


### Logistic Regression

In [113]:
model_LR = LR(random_state = 0)
model_LR.fit(train_data_features,train_data["label"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [114]:
score = np.mean(cross_val_score(model_LR,train_data_features, train_data["label"], cv=20, scoring='roc_auc'))
print("Logistic Regression20折交叉验证得分: ", score) # 0.844763864535499

Logistic Regression20折交叉验证得分:  0.844763864535499


----

## Predict

In [115]:
clean_test_reviews = []
for i in range(0, len(test_data["review"])):
    clean_test_reviews.append(review_to_words(test_data["review"][i]))
clean_test_reviews[:5]

['phr tissuw se saaf',
 'jail road per firing se shakhs janbahaq',
 'mehfil loot li aunty ne',
 'rehnay do butt sahb nay galiya boht deni hain',
 'zabardast']

In [116]:
test_data_features = tfv.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

In [117]:
result = model_NB.predict_proba(test_data_features)
#print(result[:,1])
output = pd.DataFrame(data={"ID":test_data["ID"], "Pred":result[:,1]})
output.to_csv("out_model.csv", index=False, quoting=3)