In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
#from textpreprocess import denoise_text, normalize, replace_contractions, remove_non_ascii, to_lowercase, remove_punctuation, replace_numbers, remove_stopwords
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import nltk
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import random
from xgboost import XGBClassifier as XGBoostClassifier  
from sklearn.pipeline import Pipeline
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\XCHEN038\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
labels = pd.DataFrame(pd.read_csv('movieslabel.csv'))
process_data = pd.DataFrame(pd.read_csv('review_new.csv'))

In [3]:
data = pd.merge(labels,process_data,how='left',on=['movie','title','author'])
#column 'text' is data after preprocessing


In [4]:
def tokenize_row(row):
    row['tokenized_text'] = []+(nltk.word_tokenize(str(row['text'])))
    return row
data = data.apply(tokenize_row,axis=1)

In [5]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\XCHEN038\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\XCHEN038\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# build word list
# build by simple count of occurences of every unique word in the dataset.
from collections import Counter

whitelist = ["n't","not"]
wordlist = []
stopwords=nltk.corpus.stopwords.words("english")
words = Counter()
for index,row in data.iterrows():
    words.update(row['tokenized_text'])

# remove stop words
for idx, stop_word in enumerate(stopwords):
    if stop_word not in whitelist:
        del words[stop_word]

# change the minimum number to decrease dimensions of bag-of-words matrix
min_occurrences=1
max_occurences=5000
word_df = pd.DataFrame(data={"word": [k for k, v in words.most_common() if min_occurrences < v < max_occurences],
                                     "occurrences": [v for k, v in words.most_common() if min_occurrences < v < max_occurences]},
                               columns=["word", "occurrences"])
wordlist = [k for k, v in words.most_common() if min_occurrences < v < max_occurences]
word_df.head(5)

Unnamed: 0,word,occurrences
0,movie,3608
1,film,3090
2,one,1643
3,frank,1567
4,dicaprio,1466


## Model Generating

In [7]:
import pandas as pd

def build_data_model(datainput, wordlist):
    label_column=["label"]
    columns=label_column+list(map(lambda w: w,wordlist))
    labels=[]
    rows=[]
    index=datainput.shape[0]
    for i in range(index):
        #print(i,type(i))
        current_row=[] 
        # add label
        current_label = datainput.at[i, "label"]
        labels.append(current_label)
        current_row.append(current_label)
        # add bag-of-words; use term presence feature (0 or 1)
        tokens=datainput.at[i, 'tokenized_text']
        for word in wordlist:
            current_row.append(1 if word in tokens else 0)
        rows.append(current_row)
    print("The lengh of columns is:",len(columns))
    print("The lengh of rows is:",len(rows))
    data_model = pd.DataFrame(rows, columns=columns)
    data_labels = pd.Series(labels)
    return data_model, data_labels
data_model_bow,data_model_labels=build_data_model(data,wordlist)
train_model_bow, test_model_row = model_selection.train_test_split(data_model_bow, random_state = 666)


# train_model_bow,train_model_labels=build_data_model(train_data,wordlist)
# test_model_bow,test_model_labels=build_data_model(train_data,wordlist)
# train_model_bow.head(5)

The lengh of columns is: 9636
The lengh of rows is: 1830


In [8]:
train_model_bow=data_model_bow.sample(frac=0.8, random_state=666)
test_model_bow = data_model_bow[~data_model_bow.index.isin(train_model_bow.index)]
train_model_bow.head(5)

Unnamed: 0,label,movie,film,one,frank,dicaprio,like,story,good,would,...,designers,1000,cinematograph,chicanery,smallscale,solves,occasion,bonds,barrister,excellency
1388,neg,1,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1418,pos,0,1,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1155,pos,1,1,1,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1774,pos,0,1,1,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
599,pos,1,1,1,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
train_model_bow['label'].value_counts()

pos    1015
neg     449
Name: label, dtype: int64

In [10]:
test_model_bow['label'].value_counts()

pos    249
neg    117
Name: label, dtype: int64

In [11]:
from time import time
def test_classifier(X_train, y_train, X_test, y_test, classifier):
    log("")
    log("===============================================")
    classifier_name = str(type(classifier).__name__)
    log("Testing " + classifier_name)
    now = time()
    list_of_labels = sorted(list(set(y_train)))
    model = classifier.fit(X_train, y_train)
    log("Learing time {0}s".format(time() - now))
    now = time()
    predictions = model.predict(X_test)
    log("Predicting time {0}s".format(time() - now))

    precision = precision_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels)
    recall = recall_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels)
    log("=================== Results ===================")
    log("           Negative   Positive")
    log("F1       " + str(f1))
    log("Precision" + str(precision))
    log("Recall   " + str(recall))
    log("Accuracy " + str(accuracy))
    log("===============================================")

    return precision, recall, accuracy, f1

def log(x):
    #can be used to write to log file
    print(x)

In [12]:
from sklearn.model_selection import StratifiedKFold

def cv(classifier, X_train, y_train):
    log("===============================================")
    classifier_name = str(type(classifier).__name__)
    now = time()
    log("Crossvalidating " + classifier_name + "...")
    skf = StratifiedKFold(n_splits=10)
    accuracy = [cross_val_score(classifier, X_train, y_train, cv=skf, n_jobs=-1)]
    log("Crosvalidation completed in {0}s".format(time() - now))
    log("Accuracy: " + str(accuracy[0]))
    log("Average accuracy: " + str(np.array(accuracy[0]).mean()))
    log("===============================================")
    return accuracy

## Model building and testing

In [13]:
X_train = train_model_bow.iloc[:, 1:]
X_test = test_model_bow.iloc[:, 1:]
y_train = train_model_bow.iloc[:, 0]
y_test = test_model_bow.iloc[:, 0]

In [14]:
def linkup(x):
    outputlist = []
    for i in range(x.shape[0]):
        a = x.iloc[i,:]
        link = ""
        item = 0
        for j in a:
            if j != 0:
                link += (X_train.columns[item]) + ' '
                item += 1
            elif j == 0:
                item += 1
        outputlist.append(link)
    return outputlist

X_train_link = pd.DataFrame(data = linkup(X_train))
X_test_link = pd.DataFrame(data = linkup(X_test))

In [15]:
X_train_link = X_train_link.iloc[:, 0]
X_test_link = X_test_link.iloc[:, 0]

In [16]:
#count vectors
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
Xtrain_count =  count_vect.fit_transform(X_train_link)
Xtest_count =  count_vect.transform(X_test_link)

In [17]:
#TF-IDF
#wordlevel
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
Xtrain_tfidf =  tfidf_vect.fit_transform(X_train)
Xtest_tfidf =  tfidf_vect.transform(X_test)
# characters level tf-idf
# Regular expression denoting what constitutes a “token”, only used if analyzer == 'word'. 
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
Xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.fit(X_train) 
Xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_test) 

### NaiveBayes (Bernoulli NB)

In [18]:
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, BernoulliNB())


Testing BernoulliNB
Learing time 1.382483959197998s
Predicting time 0.2240149974822998s
           Negative   Positive
F1       [0.62385321 0.84046693]
Precision[0.67326733 0.81509434]
Recall   [0.58119658 0.86746988]
Accuracy 0.7759562841530054


In [19]:
nb_BNB_acc = cv(BernoulliNB(), train_model_bow.iloc[:,1:], train_model_bow.iloc[:,0]) # use all the data for cv

Crossvalidating BernoulliNB...
Crosvalidation completed in 22.331790685653687s
Accuracy: [0.79591837 0.70748299 0.78231293 0.79591837 0.74829932 0.82876712
 0.74657534 0.7739726  0.82876712 0.75862069]
Average accuracy: 0.776663485422511


### NaiveBayes (Multinomial NB)

In [20]:
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, MultinomialNB())


Testing MultinomialNB
Learing time 0.3277566432952881s
Predicting time 0.04687309265136719s
           Negative   Positive
F1       [0.66949153 0.84274194]
Precision[0.66386555 0.84615385]
Recall   [0.67521368 0.83935743]
Accuracy 0.7868852459016393


In [21]:
nb_MNB_acc = cv(MultinomialNB(), train_model_bow.iloc[:,1:], train_model_bow.iloc[:,0])

Crossvalidating MultinomialNB...
Crosvalidation completed in 17.940839767456055s
Accuracy: [0.81632653 0.72789116 0.7755102  0.82312925 0.75510204 0.81506849
 0.80821918 0.82191781 0.81506849 0.73793103]
Average accuracy: 0.7896164190758967


### Random Forest

In [22]:
seed = 667
random.seed(seed)

In [23]:
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, RandomForestClassifier(random_state=seed,n_estimators=403,n_jobs=-1))


Testing RandomForestClassifier
Learing time 2.2452750205993652s
Predicting time 0.28252744674682617s
           Negative   Positive
F1       [0.51219512 0.85915493]
Precision[0.89361702 0.76489028]
Recall   [0.35897436 0.97991968]
Accuracy 0.7814207650273224


In [24]:
rf_acc = cv(RandomForestClassifier(n_estimators=403,n_jobs=-1, random_state=seed),train_model_bow.iloc[:, 1:], train_model_bow.iloc[:, 0])

Crossvalidating RandomForestClassifier...
Crosvalidation completed in 14.530511856079102s
Accuracy: [0.78231293 0.72789116 0.78231293 0.80272109 0.74829932 0.78082192
 0.80821918 0.76027397 0.7739726  0.77241379]
Average accuracy: 0.7739238879302311


### SVM

In [25]:
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, svm.SVC(kernel='linear'))


Testing SVC
Learing time 46.291637897491455s
Predicting time 6.406602382659912s
           Negative   Positive
F1       [0.61611374 0.84452975]
Precision[0.69148936 0.80882353]
Recall   [0.55555556 0.88353414]
Accuracy 0.7786885245901639


In [26]:
svm_acc = cv(svm.SVC(kernel='linear'), train_model_bow.iloc[:,1:], train_model_bow.iloc[:,0])

Crossvalidating SVC...
Crosvalidation completed in 41.44939708709717s
Accuracy: [0.7755102  0.76190476 0.7755102  0.84353741 0.7755102  0.76712329
 0.78082192 0.80821918 0.78767123 0.80689655]
Average accuracy: 0.7882704957278139


### XGBoost

In [27]:
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, XGBoostClassifier(seed=seed))


Testing XGBClassifier
Learing time 124.9507966041565s
Predicting time 0.5690076351165771s
           Negative   Positive
F1       [0.55681818 0.85971223]
Precision[0.83050847 0.77850163]
Recall   [0.41880342 0.95983936]
Accuracy 0.7868852459016393


In [28]:
xgb_acc = cv(XGBoostClassifier(seed=seed),train_model_bow.iloc[:, 1:], train_model_bow.iloc[:, 0])

Crossvalidating XGBClassifier...
Crosvalidation completed in 96.87708687782288s
Accuracy: [0.76190476 0.76870748 0.7755102  0.81632653 0.76870748 0.7739726
 0.78082192 0.76712329 0.79452055 0.83448276]
Average accuracy: 0.7842077577370106


### Grid Search

In [29]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=20000)),
    ('clf', svm.SVC())
])
parameters = {
#     'vect__max_df': (0.1, 0.5, 1.0),
#     'vect__stop_words': ('english', None),
#     'vect__lowercase': (True, False),
#     'vect__binary': (True, False),
#     'vect__max_features': (15000, 17500),
#     'vect__ngram_range': ((1, 1), (1, 2),),
#     'vect__use_idf': (True, False),
    'vect__norm': ('l1', 'l2'),
    'clf__C': (0.01, 1, 10),
#     'clf__gamma: (0.5, 1, 2, 3, 4),
    'clf__kernel': ('rbf', 'linear')
}

if __name__ == "__main__":
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=10)
    print (len(X_train),len(y_train))
    grid_search.fit(X_train_link, y_train)    
    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))
    
    # Refit an estimator using the best found parameters on the whole dataset.
    # The refitted estimator is made available at the best_estimator_attribute and 
    # permits using predict diretly on this GridSearchCV instance.
   
    predictions = grid_search.predict(X_test_link)
    print("prediction is done")
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Precision:', precision_score(y_test, predictions, pos_label="neg"))
    print('Recall:', recall_score(y_test, predictions, pos_label="neg"))
    print('F1_score:', f1_score(y_test, predictions, pos_label="neg"))

1464 1464
Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   41.4s finished


Best score: 0.822
Best parameters set:
	clf__C: 1
	clf__kernel: 'linear'
	vect__norm: 'l2'
prediction is done
Accuracy: 0.8114754098360656
Precision: 0.8076923076923077
Recall: 0.5384615384615384
F1_score: 0.6461538461538461


### Grid Search for Deep Learning

In [30]:
#!pip install tensorflow --user

In [31]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import one_hot

def tokenize_row(row):
    row['text'] = str(row['text'])
    tokenizer = Tokenizer(num_words=1000)
    tokenizer.fit_on_texts(row['text'])
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(row['text']) # turns strings into lists of integer indices
    row['sequences'] = sequences
    one_hot_results = tokenizer.texts_to_matrix(row['text'], mode='binary') # one-hot binary representation
    row['one_hot'] = one_hot_results
    return row

data = data.apply(tokenize_row,axis=1)
data.head(5)

Using TensorFlow backend.


Unnamed: 0,movie,title,rate_x,author,date_x,content_x,label,rate_y,date_y,content_y,mid,text,tokenized_text,sequences,one_hot
0,theBeach,Do not miss this movie just because of its low...,10/10,icysky44,5/7/2005,"Recently a friend of mine watched ""The Beach"" ...",pos,10/10,5/7/2005,"Recently a friend of mine watched ""The Beach"" ...",friend mine watched beach told favorite quote ...,friend mine watched beach told favorite quote ...,"[friend, mine, watched, beach, told, favorite,...","[[20], [8], [3], [1], [6], [13], [], [10], [3]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,theBeach,The Beach WILL NOT DIE TODAY,8/10,ecwjedi,14/6/2005,The Beach is interesting in that some people f...,pos,8/10,14/6/2005,The Beach is interesting in that some people f...,beach interesting people feel terrible others ...,beach interesting people feel terrible others ...,"[beach, interesting, people, feel, terrible, o...","[[14], [1], [3], [12], [10], [], [2], [9], [7]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,theBeach,Unfairly Panned; A Good Movie,9/10,ccthemovieman-1,25/10/2006,Here is another of those films that got panned...,pos,9/10,25/10/2006,Here is another of those films that got panned...,another films got panned critics place liked m...,another films got panned critics place liked m...,"[another, films, got, panned, critics, place, ...","[[4], [5], [7], [3], [13], [1], [6], [], [15],...","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,..."
3,theBeach,"Pretty good, despite all the bad reviews I've ...",7/10,stamper,4/6/2000,"I went to this one, knowing that it could be c...",pos,7/10,4/6/2000,"I went to this one, knowing that it could be c...",went one knowing could crap hey care others th...,went one knowing could crap hey care others th...,"[went, one, knowing, could, crap, hey, care, o...","[[16], [1], [4], [8], [], [2], [4], [1], [], [...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,theBeach,"If you travel, you'll understand",8/10,emilyzongqi37,12/2/2011,"When you travel, you are surrounded by people....",pos,8/10,12/2/2011,"When you travel, you are surrounded by people....",travel surrounded people amidst foreign intrig...,travel surrounded people amidst foreign intrig...,"[travel, surrounded, people, amidst, foreign, ...","[[2], [3], [5], [19], [1], [8], [], [7], [10],...","[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [32]:
data['label'].replace('pos',1,inplace=True)
data['label'].replace('neg',0,inplace=True)

train_data = data.sample(frac=1,random_state=666)
x_train_ml = train_data.iloc[:,-2]
y_train_ml = train_data.iloc[:,6]

In [33]:
import numpy as np
test_data = data[~data.index.isin(train_data.index)]
x_test_ml = test_data.iloc[:,-2]
y_test_ml = test_data.iloc[:,6]
def ToList(row):
    one_list = []
    for m in row:
        for n in m:
            one_list.append(n)
    return one_list
x_test_ml=x_test_ml.apply(ToList)
x_train_ml=x_train_ml.apply(ToList)

In [34]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import SGD
from keras.constraints import maxnorm
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from keras import optimizers

In [35]:
maxlen=1830
x_train_ml = sequence.pad_sequences(x_train_ml, maxlen=maxlen)
x_test_ml = sequence.pad_sequences(x_test_ml, maxlen=maxlen)
print('input_train shape:', x_train_ml.shape)
print('input_test shape:', x_test_ml.shape)
input_dim = x_train_ml.shape[1]

input_train shape: (1830, 1830)
input_test shape: (0, 1830)


In [36]:
def create_model(init='glorot_uniform',lr=0.01,momemntum=0):
    activation ='relu'
    dropout_rate = 0.0
    init_mode = 'uniform'
    weight_constraint = 0
    optimizer = 'adam'
    lr = 0.01
    momemntum = 0
    model = Sequential()
    model.add(Dense(8,
                    input_dim=input_dim, kernel_initializer=init_mode, 
                    activation=activation,
                    kernel_constraint=maxnorm(weight_constraint)))
    sgd = optimizers.SGD(lr=lr)
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, kernel_initializer=init_mode,activation='sigmoid'))

    model.compile(optimizer=sgd, loss='binary_crossentropy', metrics=['acc'])

    return model

In [37]:
model = KerasClassifier(build_fn=create_model,batch_size=128,epochs=15)

In [38]:
activation = ['relu','sigmoid','linear']
momentum = [0,0.2,0.4,0.8]
lr = [0.001, 0.01, 0.05,0.1, 0.2, 0.3]
dropout_rate = [0.0, 0.2, 0.4, 0.6, 0.8]
weight_constraint=[2, 3]
neurons = [5, 10, 15]
init = ['uniform',  'normal', 'zero','xavier']
optimizer = [ 'SGD', 'RMSprop','Adam']

In [39]:
# grid search epochs, batch size
epochs = [10, 15,20,30] # add 50, 100, 150 etc
batch_size = [15,20,40] # add 5, 10, 20, 40, 60, 80, 100 etc
param_grid = dict(epochs=epochs, batch_size=batch_size,init=init,lr=lr,momemntum=momentum)

In [40]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(x_train_ml, y_train_ml, validation_split=0.2) 

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



Train on 1464 samples, validate on 366 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Best: 0.690710 using {'batch_size': 15, 'epochs': 10, 'init': 'uniform', 'lr': 0.001, 'momemntum': 0}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 10, 'init': 'uniform', 'lr': 0.001, 'momemntum': 0}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 10, 'init': 'uniform', 'lr': 0.001, 'momemntum': 0.2}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 10, 'init': 'uniform', 'lr': 0.001, 'momemntum': 0.4}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 10, 'init': 'uniform', 'lr': 0.001, 'momemntum': 0.8}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 10, 'init': 'uniform', 'lr': 0.01, 'momemntum': 0}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 10, 'init': 'uniform', 'lr': 0.01, 'momemntum': 0.2}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 10, 'init': 'uniform', 'lr': 0.01, 'm

0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 20, 'init': 'zero', 'lr': 0.1, 'momemntum': 0.2}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 20, 'init': 'zero', 'lr': 0.1, 'momemntum': 0.4}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 20, 'init': 'zero', 'lr': 0.1, 'momemntum': 0.8}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 20, 'init': 'zero', 'lr': 0.2, 'momemntum': 0}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 20, 'init': 'zero', 'lr': 0.2, 'momemntum': 0.2}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 20, 'init': 'zero', 'lr': 0.2, 'momemntum': 0.4}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 20, 'init': 'zero', 'lr': 0.2, 'momemntum': 0.8}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 20, 'init': 'zero', 'lr': 0.3, 'momemntum': 0}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 20, 'init': 'zero', 'lr': 0.3, 'momemntum': 0.2}
0.690710 (0.015514) with: {'batch_size': 15, 'epochs': 20, 'init': '

## Final Prediction

In [41]:
TBPredict = pd.DataFrame(pd.read_excel('review_new.xlsx'))
clf = svm.SVC(C=1.0, kernel = 'linear')
vect = TfidfVectorizer(token_pattern = r'\w{1,}', max_features = 9502)
TBPredictVect = TBPredict.loc[:, 'text']
predictdata = vect.fit_transform(TBPredictVect.values.astype('U'))
X_train_link_vect = vect.fit_transform(X_train_link.values.astype('U'))

In [42]:
clf.fit(X_train_link_vect, y_train)
print('Predicting......')
predictions = clf.predict(predictdata)
print ("prediction complete, file saved as result.csv")
predictions = pd.DataFrame(predictions)
predictions.to_csv('result.csv')

Predicting......
prediction complete, file saved as result.csv
