In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib

from gensim import summarization
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from math import floor,ceil

from sklearn.svm import LinearSVC

from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding

In [None]:
def rating_to_polarity(rating):
    """Collapse a rating into a binary polarity label.

    Returns 1 when ``rating`` is strictly greater than 3, otherwise 0.
    """
    return 1 if rating > 3 else 0

def score_to_polarity(score):
    """Collapse a signed score into a binary polarity label.

    Returns 0 when ``score`` is strictly negative, otherwise 1
    (a score of exactly 0 counts as 1).
    """
    return 0 if score < 0 else 1

def score_to_rating(score):
    """Bucket a score into a rating from 1 to 5.

    The cut points are -0.7, -0.2, 0.2 and 0.7; each is an exclusive upper
    bound, so a score equal to a cut point falls into the next higher rating.
    Anything not below 0.7 maps to 5.
    """
    upper_bounds = (-.7, -.2, .2, .7)
    for stars, bound in enumerate(upper_bounds, 1):
        if score < bound:
            return stars
    return 5
#     rating = score*2+3
#     return int(round(rating))

def get_weight(rating):
    """Return the training sample weight for a one-hot encoded rating.

    Parameters
    ----------
    rating : sequence of int
        One-hot rating vector such as a row produced by ``categorize``
        (slot ``k`` set to 1 means rating ``k + 1``).

    Returns
    -------
    int
        2 when the last (highest-rating) slot is set, 1 for every other rating.

    Notes
    -----
    The previous version tested ``rating[5]``, which is out of range for the
    5-element vectors built by ``categorize`` and raised ``IndexError`` on
    every call. It also had a separate ``rating[1]`` branch that returned the
    same weight as the default, so it had no effect and is folded into the
    default here.
    """
    # Index from the end so the check always targets the top rating slot.
    if rating[-1] == 1:
        return 2
    return 1

def get_keywords(text):
    """Return the keywords extracted from ``text`` as one space-separated string.

    Extraction is delegated to ``summarization.keywords`` with ``ratio=1.0``
    and ``split=True``. This is best-effort: if the extractor raises any
    ``Exception``, an empty string is returned instead.
    """
    try:
        extracted = summarization.keywords(text, ratio=1.0, split=True)
    except Exception:
        # Same result as joining an empty keyword list.
        return ''
    return ' '.join(extracted)

def categorize(ratings):
    """One-hot encode an iterable of integer ratings.

    Each rating ``r`` becomes a 5-element row of zeros with a 1 at position
    ``r - 1``. The rows are stacked, in input order, into a NumPy array.
    """
    def _one_hot(rating):
        # Plain list indexing: same index semantics as writing into a literal list.
        row = [0] * 5
        row[rating - 1] = 1
        return row

    return np.array([_one_hot(rating) for rating in ratings])

def generate_random_rating():
    """Draw one random rating from 1 to 5 (inclusive) as a ``np.int32`` scalar.

    Uses NumPy's global random state, so the result is repeatable after
    ``np.random.seed``.
    """
    draw = np.random.randint(low=1, high=6, size=1)
    # Unwrap the single drawn value and hand it back as an int32 scalar.
    return np.int32(draw[0])


In [None]:
# Load the reviews CSV (first column is the index), keep a seeded random
# sample of 5000 rows, then drop every row that has a missing value.
data = (
    pd.read_csv('dataset/Reviews_uniform_25000.csv', header=0, index_col=0, encoding='utf-8')
    .sample(n=5000, random_state=1)
    # optional filter, currently disabled: keep only rows where Score != 3
    #   data = data[data.Score != 3]
    .dropna(how='any')
)

In [None]:
# Model input text and target ratings.
# Alternative inputs that were tried (swap in for the active line below):
#   summaries = data.Summary
#   summaries = data.Text.map(get_keywords)
summaries = data['Text']
ratings = data['Score']

In [None]:
# Fit the TF-IDF vocabulary on the selected text; max_df=0.8 is passed to the
# vectorizer as its document-frequency cap.
vectorizer = TfidfVectorizer(max_df=.8).fit(summaries)
vectorizer

In [None]:
# Features: TF-IDF vectors of the text, converted to a dense array.
tfidf_matrix = vectorizer.transform(summaries)
X = tfidf_matrix.toarray()

# Targets: exactly one of the following encodings is active.
#   polarity score:           y = ((ratings-3)/2.0).values
#   polarity classification:  y = ratings.map(rating_to_polarity).values
#   rating classification (active):
y = categorize(ratings.values)

In [None]:
# Hold out 20% of the rows for testing. The split is seeded so repeated runs
# train and evaluate on the same rows; the row sampling of `data` is already
# seeded with the same value, and an unseeded split made results drift
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state=1)
# Optional per-sample weights for model.fit (disabled):
# w_train = np.array(map(get_weight,y_train))

In [None]:
n_features = X_train.shape[1]

model = Sequential()
# Hidden layer of 128 units over the input features.
# NOTE(review): no activation argument is passed here - confirm that a
# hidden layer without an explicit non-linearity is intended.
model.add(Dense(128, input_dim=n_features))

# Output head and loss: exactly one of the three variants below is active.
#
# polarity classification:
#   model.add(Dense(1,activation='sigmoid'))
#   model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
#
# polarity score:
#   model.add(Dense(1,activation='tanh'))
#   model.compile(loss='mean_squared_error',optimizer='rmsprop',metrics=['mean_squared_error'])
#
# rating classification (active):
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

model.fit(X_train, y_train, nb_epoch=10, batch_size=32, verbose=1)
# Weighted variant (needs w_train from the split cell):
# model.fit(X_train,y_train,sample_weight=w_train,nb_epoch=30,batch_size=32,verbose=1)

# Show the second entry of evaluate()'s result - presumably the metric
# requested in compile(); verify against the Keras version in use.
test_metrics = model.evaluate(X_test, y_test)
test_metrics[1]

In [None]:
preds = model.predict(X_test)

# One [predicted, actual] pair per test row. The index of the largest entry,
# plus one, is used as the rating on both sides.
# Alternative pair builders for the other model variants (p = prediction row,
# t = target row):
#   polarity score:           [score_to_rating(p[0]), int(t*2+3)]
#   polarity classification:  [int(round(p[0])), t]
out = [[p.argmax()+1, t.argmax()+1] for p, t in zip(preds, y_test)]    # for rating classification

out = pd.DataFrame(out, columns=['PredictedRating','ActualRating'])

# Baseline column: one independently drawn random rating per row.
random_ratings = [generate_random_rating() for _ in range(len(out))]
out['RandomRating'] = pd.Series(random_ratings)

# Absolute distance between the actual rating and each candidate.
out['DiffActPred'] = (out.ActualRating - out.PredictedRating).abs()
out['DiffActRand'] = (out.ActualRating - out.RandomRating).abs()

In [None]:
# Histograms of the rating columns, then of the absolute-difference columns.
rating_columns = ['ActualRating','PredictedRating','RandomRating']
error_columns = ['DiffActPred','DiffActRand']
out[rating_columns].hist()
out[error_columns].hist()

In [None]:
# Share of test rows whose rating is off by at most one.
# The differences are non-negative integers, so `<= 1` selects exactly the
# rows with difference 0 or 1. Looking those up via value_counts()[0] and
# value_counts()[1] raised KeyError whenever no row had that exact difference.
within_one_pred = (out.DiffActPred <= 1).mean()
within_one_rand = (out.DiffActRand <= 1).mean()

# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print("Dataset size : {:d}".format(len(data)))
print("Training set size : {:d}".format(len(X_train)))
print("Testing set size : {:d}".format(len(X_test)))
print("Accuracy between predicted and actual : {:f}".format(accuracy_score(out.PredictedRating,out.ActualRating)))
print("Accuracy between random and actual : {:f}".format(accuracy_score(out.RandomRating,out.ActualRating)))
print("Accuracy with +-1 difference between predicted and actual : {:f}".format(within_one_pred))
print("Accuracy with +-1 difference between random and actual : {:f}".format(within_one_rand))

In [None]:
# Save the predicted / actual / random rating table as CSV.
output_path = 'output/rating_gen_1.csv'
out.to_csv(output_path)