In [2]:
import numpy as np
import pandas as pd
import collections
import xlwings as xw
from scipy.sparse import vstack
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

def build_dataset(labels, FV):
    result = collections.Counter(labels)
    print(result)
    ratings = list(result.keys())
    counts = list(result.values())
    X_train = np.ones((1, FV.shape[1]))
    X_test = np.ones((1, FV.shape[1]))
    Y_train = np.ones((1,1))
    Y_test = np.ones((1,1))
    labels = labels.reshape(-1,1)
    arr_index = -1

    for i in range(0, len(counts)):
        num_train = int(counts[i]*2/3)
        num_test = counts[i] - num_train
        X_train = vstack([X_train, FV[arr_index+1:arr_index+num_train+1][:]]).toarray()
        Y_train = np.concatenate((Y_train, labels[arr_index+1:arr_index+num_train+1]), axis=0)
        arr_index += num_train
        X_test = vstack([X_test, FV[arr_index+1:arr_index+num_test+1][:]]).toarray()
        Y_test = np.concatenate((Y_test, labels[arr_index+1:arr_index+num_test+1]), axis=0)
        arr_index += num_test

    X_train = X_train[1:][:]
    X_test = X_test[1:][:]
    Y_train = Y_train[1:][:]
    Y_test = Y_test[1:][:]
    
    return X_train, X_test, Y_train, Y_test

corpus = list(pd.read_excel('train_set_2.xlsx')['Segement'].values)
# labels_1 = pd.read_excel('train_set_2.xlsx')['Label_1'].values # array type
labels_2 = pd.read_excel('train_set_2.xlsx')['Label_2'].values # array type

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
word = vectorizer.get_feature_names()
transformer = TfidfTransformer()

# get tfidf feature matrix
tfidf = transformer.fit_transform(X)

# get LIWC feature matrix and normalize
wb = xw.Book('train_set_2.xlsx')
sht = wb.sheets['dataset']
rng = sht.range('E2').expand('table')
arr_feature = np.asarray(rng.value)
scaler = MinMaxScaler()
scaler.fit(arr_feature)
LIWC = scaler.transform(arr_feature)

# build tfidf + 3 class dataset 
X_train1, X_test1, Y_train1, Y_test1 = build_dataset(labels_2, tfidf)

# build tfidf + regression dataset
# X_train2, X_test2, Y_train2, Y_test2 = build_dataset(labels_1, tfidf)

# build LIWC + 3 classes dataset 
X_train3, X_test3, Y_train3, Y_test3 = build_dataset(labels_2, LIWC)

# build LIWC + regression dataset 
# X_train4, X_test4, Y_train4, Y_test4 = build_dataset(labels_1, LIWC)

print(X_train1.shape)

Counter({1: 930, 0: 930, -1: 930})
Counter({1: 930, 0: 930, -1: 930})
(1860, 11154)


In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression

# train model_1 -- tfidf + NB
clf1 = MultinomialNB()
clf1.fit(X_train1, Y_train1.flatten())

# train model_2 -- regression 
# reg1 = LinearRegression().fit(X_train2, Y_train2.flatten())

# train model_3 -- LIWC + NB
clf2 = MultinomialNB()
clf2.fit(X_train3, Y_train3.flatten())

# train model_4 -- LIWC + regression
# reg2 = LinearRegression().fit(X_train4, Y_train4.flatten())


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

def cal_conf(y_true, y_pred):
    return confusion_matrix(y_true, y_pred, labels=[-1, 0, 1])

MA1 = clf1.score(X_test1, Y_test1.flatten())
predicts1 = clf1.predict(X_test1)
conf1 = cal_conf(Y_test1, predicts1)
print('model_1 MA:', MA1)
print('model_1 conf:\n', conf1)

# MSE1 = mean_squared_error(Y_test2, reg1.predict(X_test2))
# print('\nmodel_2 MSE:', MSE1)

MA2 = clf2.score(X_test3, Y_test3.flatten())
predicts2 = clf2.predict(X_test3)
conf2 = cal_conf(Y_test3, predicts2)
print('\nmodel_3 MA:', MA2)
print('model_3 conf:\n', conf2)

# MSE2 = mean_squared_error(Y_test4, reg2.predict(X_test4))
# print('\nmodel_4 MSE:', MSE2)


model_1 MA: 0.6505376344086021
model_1 conf:
 [[206  60  44]
 [ 81 183  46]
 [ 58  36 216]]

model_3 MA: 0.6
model_3 conf:
 [[225  45  40]
 [107 178  25]
 [ 92  63 155]]


In [3]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

C_range = np.logspace(-2, 5, 8)
gamma_range = np.logspace(-5, 2, 8)
tuned_parameters = [{'kernel': ['rbf'], 'gamma': gamma_range,
                     'C': C_range},
                    {'kernel': ['linear'], 'C': C_range}]
score = 'accuracy'
datasets = []
# X1 = np.concatenate((X_train1, X_test1), axis=0)
# Y1 = np.concatenate((Y_train1, Y_test1), axis=0)
X3 = np.concatenate((X_train3, X_test3), axis=0)
Y3 = np.concatenate((Y_train3, Y_test3), axis=0)
datasets.append([X1, Y1.flatten()])
# datasets.append([X3, Y3.flatten()])

# train for dataset 1 and 3
for dataset in datasets:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    grid = GridSearchCV(SVC(), tuned_parameters, scoring=score, n_jobs=-1, verbose=1)
    grid.fit(dataset[0], dataset[2])

    print("Best parameters set found on training set:")
    print()
    print(grid.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = grid.cv_results_['mean_test_score']
    stds = grid.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, grid.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The scores are computed on the full test set.")
    print()
    y_true, y_pred = dataset[3], grid.predict(dataset[1])
    print(classification_report(y_true, y_pred))
    print()


TypeError: only integer scalar arrays can be converted to a scalar index