In [1]:
import pandas as pd
# Read the train and test CSV splits (in that order) from the hosted dataset.
trainData, testData = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv",
        "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/test.csv",
    )
)

In [2]:
# Shuffle every row of the training frame, then preview the first five rows.
shuffled_train = trainData.sample(frac=1)
shuffled_train.head(5)

Unnamed: 0,Content,Label
865,"mary norton's children's book series , the bor...",pos
1142,"although i had not been a viewer of the "" rugr...",neg
915,the yet-to-be-released krippendorf's tribe is ...,neg
1206,what hath kevin williamson wrought ? \nwhile t...,neg
928,my first press screening of 1998 and already i...,neg


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF configuration, collected in one place so it is easy to tune.
tfidf_options = dict(
    min_df=5,           # lower document-frequency cut-off
    max_df=0.8,         # upper document-frequency cut-off
    sublinear_tf=True,  # sublinear term-frequency scaling
    use_idf=True,       # weight terms by inverse document frequency
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training text only; the test text is projected with the
# already-fitted vectorizer.
train_vectors = vectorizer.fit_transform(trainData['Content'])
test_vectors = vectorizer.transform(testData['Content'])

In [21]:
# Vocabulary terms, positioned so that feature_names[i] is the term for
# column i of the TF-IDF matrices.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

In [22]:
# Row of `test_vectors` (i.e. which test document) to inspect.
doc = 0
# Slice the sparse row once instead of indexing the full matrix per term.
doc_row = test_vectors[doc, :]
# Column indices of the non-zero entries in this document's row.
feature_index = doc_row.nonzero()[1]
# Build (feature_index, score) pairs as a list: a bare zip() is a one-shot
# iterator, so the cell that prints these pairs would show nothing when re-run.
tfidf_scores = [(x, doc_row[0, x]) for x in feature_index]

In [23]:
# Translate each feature index back to its vocabulary term and print the
# term next to its TF-IDF score.
for feature_id, score in tfidf_scores:
    print(feature_names[feature_id], score)

17 0.05326430977742549
abandoned 0.055290195951019645
above 0.04198441657836493
abused 0.07318928217321687
acclaimed 0.059593726550911635
across 0.038575883586607004
adults 0.05003838852785237
after 0.018678515472502925
against 0.03337191740607857
agrees 0.05215686025418783
air 0.0441028883047004
along 0.02904444156249122
also 0.03198980839440568
am 0.041384017507043035
america 0.0416375423003474
american 0.03195218996289467
amusing 0.04464342415555746
andrea 0.07610941142230473
angry 0.16955844104572973
another 0.03795834299873356
appeal 0.04855303820511528
appearing 0.05740866767735512
appropriate 0.04709871440386009
armed 0.05585710962340024
attention 0.037391109728822425
attraction 0.057081338274241986
audience 0.04223796476767701
away 0.027377838752961885
baby 0.046434566478779804
band 0.13732630886887753
base 0.055290195951019645
become 0.050439682221207795
been 0.018749800514437715
beer 0.06334416790050708
begins 0.03279901279497748
big 0.024685300737589688
bigger 0.051741836658

In [4]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
# Train a linear-kernel SVM on the TF-IDF training vectors, predict labels
# for the test vectors, and report per-class metrics.
classifier_linear = svm.SVC(kernel='linear')

# Use perf_counter() for the stopwatch: it is monotonic and high-resolution,
# whereas time.time() follows the wall clock and can jump if the system
# clock is adjusted mid-run.
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, trainData['Label'])
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1 - t0
time_linear_predict = t2 - t1

# Results: timings first, then the metrics dict for each label.
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
# output_dict=True returns the report as a dict, indexed below by class label.
report = classification_report(testData['Label'], prediction_linear, output_dict=True)
print('positive: ', report['pos'])
print('negative: ', report['neg'])

Training time: 9.127353s; Prediction time: 0.892843s
positive:  {'precision': 0.9191919191919192, 'recall': 0.91, 'f1-score': 0.9145728643216081, 'support': 100}
negative:  {'precision': 0.9108910891089109, 'recall': 0.92, 'f1-score': 0.9154228855721394, 'support': 100}


In [5]:
# A short, clearly positive review used as a smoke test for the classifier.
review = "SUPERB, I AM IN LOVE IN THIS PHONE"
# Vectorize with the already-fitted vectorizer; transform expects an iterable of texts.
review_vector = vectorizer.transform([review])
predicted_labels = classifier_linear.predict(review_vector)
print(predicted_labels)

['pos']


In [6]:
# Vectorize whatever `review` currently holds and print the predicted label.
# NOTE(review): `review` is not reassigned in this cell, so this repeats the
# prediction on the existing text.
review_vector = vectorizer.transform([review])
predicted_labels = classifier_linear.predict(review_vector)
print(predicted_labels)

['pos']


In [7]:
# Vectorize the current `review` text and print the classifier's label for it.
# NOTE(review): no new review text is set here, so the result matches the
# previous run on the same `review`.
review_vector = vectorizer.transform([review])
current_prediction = classifier_linear.predict(review_vector)
print(current_prediction)

['pos']


In [8]:
# A long, multi-paragraph phone review used as an example input for the classifier.
# The text is kept verbatim, including its original typos.
review = """It's not even 5 days since i purchased this product.
I would say this a specially blended worst Phone in all formats.
ISSUE 1:
Have you ever heard of phone which gets drained even in standby mode during night?
Kindly please see the screenshot if you want to believe my statement.
My phone was in full charge at night 10:07 PM . I took this screenshot and went to sleep.
Then I woke up at morning and 6:35 AM and battery got drained by 56% in just standby condition.
If this is the case consider how many hours it will work, during day time.
It's not even 5 hours the battery is able to withstand.
ISSUE 2:
Apart from the battery, the next issue is the heating issue .I purchased a iron box recently from Bajaj in this sale.
But I realized this phone acts a very good Iron box than the Bajaj Iron box. I am using only my headphones to get connected in the call. I am not sure when this phone is will get busted due to this heating issue. It is definitely a challenge to hold this phone for even 1 minute. The heat that the phone is causing will definitely burn your hands and for man if you keep this phone in your pant pocket easily this will lead to infertility for you. Kindly please be aware about that.
Issue 3:
Even some unknown brands has a better touch sensitivity. The touch sensitivity is pathetic, if perform some operation it will easily take 1-2 minutes for the phone to response.
For your kind information my system has 73% of Memory free and the RAM is also 56% free.
Kindly please make this Review famous and lets make everyone aware of this issue with this phone.
Let's save people from buying this phone. There are people who don't even know what to do if this issue happens after 10 days from the date of purchase. So I feel at least this review will help people from purchasing this product in mere future."""
# Vectorize with the fitted vectorizer, then print the predicted label.
review_vector = vectorizer.transform([review])
predicted_labels = classifier_linear.predict(review_vector)
print(predicted_labels)

['neg']
