In [None]:
# Data set source: http://ai.stanford.edu/~amaas/data/sentiment/
'''
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}
'''

In [None]:
import pandas as pd
import os

## Reading the files and preparing the data sets

In [None]:
def readallcomments(path, flag):
    """Read every file in a directory into a labelled DataFrame.

    Parameters
    ----------
    path : str
        Directory whose entries are each read as one comment. A trailing
        separator is optional.
    flag : str
        Label stored in the 'Rating' column for every file read.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns 'Rating' and 'Comment'. An empty
        directory yields an empty frame that still has both columns.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore positional lookups) stable between runs.
    fl = sorted(os.listdir(path))
    print(len(fl))

    ll = []
    for f in fl:
        # os.path.join works whether or not `path` ends with a separator.
        ff = os.path.join(path, f)
        # Explicit encoding avoids platform-dependent decoding; the context
        # manager closes each handle instead of leaking one per file.
        with open(ff, 'r', encoding='utf-8') as fh:
            ll.append([flag, fh.read()])

    # Passing columns here (rather than assigning afterwards) also works
    # when no files were found.
    return pd.DataFrame(ll, columns=['Rating', 'Comment'])

In [None]:
# Directories of the training split: (positive reviews, negative reviews).
train_pos, train_neg = (
    './aclImdb_v1/aclImdb/train/pos/',
    './aclImdb_v1/aclImdb/train/neg/',
)

# Directories of the test split, in the same order.
test_pos, test_neg = (
    './aclImdb_v1/aclImdb/test/pos/',
    './aclImdb_v1/aclImdb/test/neg/',
)

In [None]:
# Positive training reviews, labelled '1'.
trainposdf = readallcomments(path=train_pos, flag='1')
print(trainposdf.shape[0])
print(trainposdf.head())

In [None]:
# Negative training reviews, labelled '0'.
trainnegdf = readallcomments(path=train_neg, flag='0')
print(trainnegdf.shape[0])
print(trainnegdf.head())

In [None]:
# Stack positive and negative reviews into one training frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. ignore_index=True gives the result a clean 0..n-1 index
# instead of repeating the row labels of both inputs.
traindf = pd.concat([trainposdf, trainnegdf], ignore_index=True)
print(len(traindf))
# Persist the combined set; the index carries no information, so drop it.
traindf.to_csv('TrainingData.csv', index=False)

In [None]:
# Positive test reviews, labelled '1'.
testposdf = readallcomments(path=test_pos, flag='1')
print(testposdf.shape[0])
print(testposdf.head())

In [None]:
# Negative test reviews, labelled '0'.
testnegdf = readallcomments(path=test_neg, flag='0')
print(testnegdf.shape[0])
print(testnegdf.head())

In [None]:
# Stack positive and negative reviews into one test frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. ignore_index=True gives the result a clean 0..n-1 index
# instead of repeating the row labels of both inputs.
testdf = pd.concat([testposdf, testnegdf], ignore_index=True)
print(len(testdf))
# Persist the combined set; the index carries no information, so drop it.
testdf.to_csv('TestingData.csv', index=False)

## Checking the datasets

In [None]:
# Spot-check one training row by position: label first, then the text.
i = 100
print(traindf.iloc[i].Rating, traindf.iloc[i].Comment, sep='\n')

In [None]:
# Spot-check one test row by position: label first, then the text.
i = 24999
print(testdf.iloc[i].Rating, testdf.iloc[i].Comment, sep='\n')

## Text preprocessing

In [None]:
def textclean(text):
    """Normalise a raw comment before tokenisation.

    Lower-cases the text, replaces HTML line-break tags with a space,
    removes a fixed set of punctuation characters and collapses every
    run of whitespace to a single space.

    Parameters
    ----------
    text : str
        Raw comment.

    Returns
    -------
    str
        The cleaned comment, without leading or trailing whitespace.
    """
    text = text.lower()

    # Replace whole line-break tags with a space. Deleting only '<' and '>'
    # would leave "br /" behind and fuse "br" onto the neighbouring words.
    for tag in ('<br />', '<br/>', '<br>'):
        text = text.replace(tag, ' ')

    # Delete the remaining unwanted symbols in a single pass.
    symblst = ['<', '*', '?', '>', "\\", "\'", "\"", ',']
    text = text.translate(str.maketrans('', '', ''.join(symblst)))

    # split() with no argument drops all whitespace runs, so the join
    # yields single-spaced, trimmed text.
    return ' '.join(text.split())

In [None]:
# Quick check of textclean on a string containing several of the stripped symbols.
a = 'This is a > sample comment quote < \\ hello , the movie is worst ? *'
textclean(text=a)

## First model

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words encoder: each text becomes a vector of word counts.
# textclean is used as the preprocessor and English stop words are dropped.
vectorizer = CountVectorizer(preprocessor=textclean, stop_words="english")

### Testing the vectorizer

In [None]:
# Fit the vectorizer on two toy sentences to inspect the vocabulary it learns.
trainsents = [
    'A cat is walking in the rain',
    'Dog running on a sunny day',
]
vectorizer.fit(trainsents)

In [None]:
# vocabulary_ maps term -> column index; invert it so that the terms can be
# listed in column order.
inv_vocab = dict(zip(vectorizer.vocabulary_.values(), vectorizer.vocabulary_.keys()))
vocabulary = list(map(inv_vocab.__getitem__, range(len(inv_vocab))))
print(vocabulary)

In [None]:
# Encode an unseen sentence with the toy vocabulary and show the counts
# as a one-row table whose columns are the learned terms.
testsents = ['rain on a sunny day, and a dog and an elephant are walking']
pd.DataFrame(
    vectorizer.transform(testsents).toarray(),
    columns=vocabulary,
    index=["test sentence"],
)

### Fitting the vectorizer on the IMDB reviews

In [None]:
# Refit the vectorizer on the training comments (this replaces the toy
# vocabulary), then encode the test comments with that same vocabulary.
training_features = vectorizer.fit_transform(raw_documents=traindf["Comment"])
test_features = vectorizer.transform(raw_documents=testdf["Comment"])

In [None]:
# Train a linear SVM on the count features (fit returns the estimator
# itself), then predict labels for the test set.
model0 = LinearSVC().fit(training_features, traindf["Rating"])
y_pred = model0.predict(test_features)

In [None]:
# Share of test reviews whose predicted label matches the true one,
# reported as a percentage.
acc = accuracy_score(y_true=testdf["Rating"], y_pred=y_pred)
message = "Accuracy on the IMDB dataset: {:.2f}"
print(message.format(acc*100))

In [None]:
def get_rating(samplerating, vectorizer, model):
    """Predict the label of a single comment.

    Parameters
    ----------
    samplerating : str
        The raw comment text to classify.
    vectorizer : object
        Object whose ``transform`` accepts a list of texts.
    model : object
        Object whose ``predict`` accepts the vectorizer's output.

    Returns
    -------
    The first (and only) element of the model's prediction.
    """
    # transform expects a collection of documents, so wrap the single text.
    features = vectorizer.transform([samplerating])
    return model.predict(features)[0]

## Testing the results

In [None]:
# get_rating takes (text, vectorizer, model). Pass the CountVectorizer that
# produced model0's training features; omitting it raises TypeError.
c0 = 'This is the worst movie I have ever seen. Wonder why I statyed till the end'
get_rating(c0, vectorizer, model0)

In [None]:
# get_rating takes (text, vectorizer, model). Pass the CountVectorizer that
# produced model0's training features; omitting it raises TypeError.
c1 = 'This is a very nicely made movie. I liked the editiing'
get_rating(c1, vectorizer, model0)

In [None]:
# get_rating takes (text, vectorizer, model). Pass the CountVectorizer that
# produced model0's training features; omitting it raises TypeError.
c3 = 'this is a not so good movie'
get_rating(c3, vectorizer, model0)

## Improving the model - Another approach - Tf-Idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF encoder over unigrams and bigrams, with the same preprocessor and
# stop-word setting as the count-based vectorizer.
vectorizer1 = TfidfVectorizer(
    preprocessor=textclean,
    stop_words="english",
    ngram_range=(1, 2),
)

# NOTE: these rebind training_features / test_features, replacing the
# count-based matrices created for the first model.
training_features = vectorizer1.fit_transform(raw_documents=traindf["Comment"])
test_features = vectorizer1.transform(raw_documents=testdf["Comment"])

In [None]:
# Train a second linear SVM, this time on the TF-IDF features
# (fit returns the estimator itself), and predict the test labels.
model1 = LinearSVC().fit(training_features, traindf["Rating"])
y_pred = model1.predict(test_features)

# Share of test reviews classified correctly, reported as a percentage.
acc = accuracy_score(y_true=testdf["Rating"], y_pred=y_pred)
message = "Accuracy on the IMDB dataset: {:.2f}"

print(message.format(acc*100))

In [None]:
# Ambiguous review scored by the count-based model and its own vectorizer.
c3 = 'this movie is so so'
get_rating(samplerating=c3, vectorizer=vectorizer, model=model0)

In [None]:
# The same ambiguous review scored by the TF-IDF model and its own vectorizer.
c3 = 'this movie is so so'
get_rating(samplerating=c3, vectorizer=vectorizer1, model=model1)