In [1]:
import sys,nltk,re
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



def review_to_wordlist( review, remove_stopwords=False ):
    """Convert a raw review string into a list of lower-case words.

    Processing steps, in order:
      1. Strip URLs (tokens starting with ``http`` or ``www``).
      2. Replace every non-letter character with a space.
      3. Lower-case the text and split it on whitespace.
      4. Optionally drop English stop words.

    Parameters
    ----------
    review : str
        The raw review text.
    remove_stopwords : bool, optional
        When True, words found in NLTK's English stop-word list are
        removed from the result. Defaults to False.

    Returns
    -------
    list of str
        The cleaned, lower-cased words in their original order.
    """
    # Remove URLs. The leading \b makes sure only tokens that *start* with
    # "http"/"www" are removed; without it, ordinary words that merely
    # contain those letters (e.g. "awwww") lose their tail.
    review = re.sub(r"\b(?:http|www)\S+", "", review)

    # 1. Replace anything that is not an ASCII letter with a space
    review_text = re.sub("[^a-zA-Z]", " ", review)

    # 2. Convert words to lower case and split them
    words = review_text.lower().split()

    # 3. Optionally remove stop words (false by default)
    if remove_stopwords:
        # A set gives O(1) membership tests
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # 4. Return a list of words
    return words
    
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load 'sentireviews.pkl' if it comes from a trusted source.
review_data = pd.read_pickle('sentireviews.pkl')

# Split the row *positions* (0 .. n-1) into 80% train and the remainder test,
# with a fixed seed so the split is reproducible.
# NOTE(review): train_test_split is imported from sklearn.cross_validation at
# the top of the file; that module no longer exists in scikit-learn >= 0.20
# (use sklearn.model_selection there) -- confirm the target environment.
train_i, test_i = train_test_split( np.arange( len( review_data )), train_size = 0.8, random_state = 44 )

# train_i / test_i hold positions, not labels, so select rows with .iloc.
# (.ix is label-based on an integer index and was removed in pandas 1.0.)
train = review_data.iloc[train_i]
test = review_data.iloc[test_i]


# Pre-process the training reviews: tokenise each one, then glue the tokens
# back together into a single space-separated string.
clean_train_reviews = [
    " ".join( review_to_wordlist( raw_text ) ) for raw_text in train['Review']
]

# Pre-process the testing reviews in exactly the same way.
clean_test_reviews = [
    " ".join( review_to_wordlist( raw_text ) ) for raw_text in test['Review']
]
    
# Turn the cleaned review strings into TF-IDF feature matrices.
vectorizer = TfidfVectorizer(
    max_features = 40000,      # cap on the vocabulary size
    ngram_range = ( 1, 3 ),    # unigrams, bigrams and trigrams
    sublinear_tf = True,       # sublinear (log-scaled) term frequency
)

# The vocabulary and IDF weights are fitted on the training reviews only;
# the test reviews are merely projected onto that fitted vocabulary.
train_data_features = vectorizer.fit_transform( clean_train_reviews )
test_data_features = vectorizer.transform( clean_test_reviews )


# Show the held-out rows.
test



Unnamed: 0,Review,Sentiment
153,I dont think my reviews really matter here. Th...,Positive
469,I got these headphones on 11 sep 2014 in a spa...,Critical
296,i would have given 5 stars if it had a MIC.. o...,Positive
123,Bought for Rs 700 during a promotion. I bought...,Positive
254,I never expected such a amazing headset. I am ...,Positive
712,I bought them in May 2015 and kept them pretty...,Critical
365,This headphone is a bang-for-the-buck product....,Positive
83,"First of all, hats off to Amazon India for pro...",Positive
418,I bought this product purely based on the revi...,Critical
231,I AM WRITING THIS FEED BACK GENUINELY BECAUSE ...,Positive


In [2]:
from sklearn.svm import SVC


# In old scikit-learn releases gamma=0.0 was a sentinel meaning
# "use 1 / n_features". That sentinel was deprecated in 0.17 and later
# releases take 0.0 literally, which makes the RBF kernel constant.
# Computing the value explicitly gives the same kernel on every version.
rbf_gamma = 1.0 / train_data_features.shape[1]

svm = SVC(C=1000000.0, gamma=rbf_gamma, kernel='rbf')
svm.fit(train_data_features, train["Sentiment"])
pred = svm.predict(test_data_features)

# Mean accuracy on the held-out reviews.
print(svm.score(test_data_features, test["Sentiment"].values))

# Put each test review next to its true label and the model's prediction.
p={'Review':test['Review'].values,
   'Sentiment':test['Sentiment'].values,
   'Prediction':pred}
predicted_data=pd.DataFrame(p,columns=['Review','Sentiment','Prediction'])
predicted_data

0.833333333333


Unnamed: 0,Review,Sentiment,Prediction
0,I dont think my reviews really matter here. Th...,Positive,Positive
1,I got these headphones on 11 sep 2014 in a spa...,Critical,Positive
2,i would have given 5 stars if it had a MIC.. o...,Positive,Positive
3,Bought for Rs 700 during a promotion. I bought...,Positive,Positive
4,I never expected such a amazing headset. I am ...,Positive,Positive
5,I bought them in May 2015 and kept them pretty...,Critical,Critical
6,This headphone is a bang-for-the-buck product....,Positive,Positive
7,"First of all, hats off to Amazon India for pro...",Positive,Positive
8,I bought this product purely based on the revi...,Critical,Critical
9,I AM WRITING THIS FEED BACK GENUINELY BECAUSE ...,Positive,Positive
