In [1]:
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings("ignore")
import spacy
import re
import string
import pickle

In [2]:
# Ten hand-written sample reviews used to exercise the pipeline end to end.
# The first entry deliberately keeps its leading space.
review_texts = [
    " I can't believe I wasted my money on this keyboard and mouse",
    "I really love this device",
    "It's quite good, but not so great. They can do better",
    "I hate this device, terrible!!!",
    "Where the hell is my refund for this damn product (mouse)",
    "I believe Ebay is so much better than the crap that is HERE!!!",
    "worst keyboard ever",
    "distasteful to the fullest!!!",
    "spoilt on the first day",
    "never shopping here again, waste of money",
]
# Single-column mapping, ready to be handed to pd.DataFrame.
reviews = {"reviews": review_texts}

In [3]:
# https://stackoverflow.com/questions/12851791/removing-numbers-from-string
def preprocess(sent):
    '''Normalise a raw review/tweet string before tokenisation.

    Steps, in order:
      1. lowercase the text;
      2. drop retweet prefixes of the form "rt @user: ";
      3. replace @mentions, URLs and every character outside
         [0-9A-Za-z, space, tab] with a space;
      4. delete digits;
      5. drop single-letter words that have whitespace on both sides;
      6. collapse every whitespace run to one space.

    Stop words are NOT removed here, and the result is not stripped: a
    leading/trailing space, or a single letter at the very start or end of
    the string, can survive.

    Parameters
    ----------
    sent : str
        Raw text.

    Returns
    -------
    str
        The cleaned, lowercased text.
    '''
    # Lowercase first; every later pattern therefore only sees lowercase.
    sentence = sent.lower()

    # Remove retweet prefixes. The text is already lowercased, so the marker
    # to match is "rt", not "RT" (an uppercase pattern could never match).
    sentence = re.sub(r"\brt @\w+: ", " ", sentence)

    # Replace @mentions, URLs and any remaining special character by a space.
    sentence = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", sentence)

    # Remove digits.
    sentence = sentence.translate(str.maketrans('', '', string.digits))

    # Remove single letters surrounded by whitespace, e.g. the "s" left over
    # once the apostrophe in "Mark's" has become a space. Lookarounds do not
    # consume the neighbouring whitespace, so one pass handles any number of
    # consecutive single letters.
    sentence = re.sub(r"(?<=\s)[a-zA-Z](?=\s)", " ", sentence)

    # The substitutions above leave runs of spaces behind; collapse them.
    sentence = re.sub(r"\s+", " ", sentence)

    return sentence


In [4]:
# Build a DataFrame from the dict of sample texts.
# Note: this rebinds `reviews` from a dict to a DataFrame.
reviews = pd.DataFrame(reviews)

In [5]:
reviews

Unnamed: 0,reviews
0,I can't believe I wasted my money on this key...
1,I really love this device
2,"It's quite good, but not so great. They can do..."
3,"I hate this device, terrible!!!"
4,Where the hell is my refund for this damn prod...
5,I believe Ebay is so much better than the crap...
6,worst keyboard ever
7,distasteful to the fullest!!!
8,spoilt on the first day
9,"never shopping here again, waste of money"


In [6]:
# Stop-word vocabulary used to filter tokens (nothing is removed in this cell;
# it only defines the sets). Negations such as "not" and "never" are not in
# the set, so they are kept by any filter based on it.
# NOTE(review): this looks like spaCy's English stop-word list with some
# entries taken out — confirm the provenance.
all_stopwords = {"'d","'ll","'m","'re","'s","'ve",'a','about',
'above','across','after','afterwards','again','all','almost','alone','along',
'already','also','although','always','am','among','amongst','amount','an','and',
'another','any','anyhow','anyone','anything','anyway','anywhere','are','around',
'as','at','back','be','became','because','become','becomes','becoming','been','before',
'beforehand','behind','being','below','beside','besides','between','both','bottom',
'but','by','ca','call','can','could','did','do','does','doing','done','down','due','during','each',
'eight','either','eleven','else','elsewhere','empty','even','everyone','everything',
'everywhere','except','few','fifteen','fifty','first','five','for','former','formerly','forty','four','from','front',
'full','further','go','had','has','have','he','hence','her','here','hereafter','hereby','herein','hereupon','hers',
'herself','him','himself','his','how','however','hundred','i','if','in','indeed','into','is','it','its','itself','just','keep','last',
'latter','latterly','made','make','many','may','me','meanwhile','might','mine','more','moreover','move','much',
'must','my','myself','name','namely','neither','nevertheless','next','nine','nobody','noone','nothing','now','nowhere','of','often',
'on','once','one','only','onto','or','other','others','otherwise','our','ours','ourselves','out','own','part','per','perhaps','please','put',
'rather','re','regarding','same','say','see','several','she','should','show','side',
'since','six','sixty','so','some','somehow','someone','something','sometime','sometimes','somewhere','still','such','take','ten','than','that','the','their',
'them','themselves','then','thence','there','thereafter','thereby','therefore','therein','thereupon','these','they','third','this','those','though','three',
'through','throughout','thru','thus','to','together','top','toward','towards','twelve','twenty','two','under','unless','until','up','upon','us','used','using',
'various','via','was','we','well','were','what','whatever','when','whence','whenever','where','whereafter','whereas','whereby','wherein','whereupon',
'wherever','whether','which','while','whither','who','whoever','whole','whom','whose','why','will','with','within','would','yet','you','your','yours','yourself',
'yourselves','‘d','‘ll','‘m','‘re','‘s','‘ve','’d','’ll','’m','’re','’s','’ve'}
# Independent copy of the set above; edits to one do not affect the other.
my_stop_words = set(all_stopwords) # My own stop words

In [7]:
# Add a "cleaned" column by running preprocess on every raw review string.
reviews["cleaned"] = reviews["reviews"].apply(preprocess)

In [8]:
reviews

Unnamed: 0,reviews,cleaned
0,I can't believe I wasted my money on this key...,can believe wasted my money on this keyboard ...
1,I really love this device,i really love this device
2,"It's quite good, but not so great. They can do...",it quite good but not so great they can do better
3,"I hate this device, terrible!!!",i hate this device terrible
4,Where the hell is my refund for this damn prod...,where the hell is my refund for this damn prod...
5,I believe Ebay is so much better than the crap...,i believe ebay is so much better than the crap...
6,worst keyboard ever,worst keyboard ever
7,distasteful to the fullest!!!,distasteful to the fullest
8,spoilt on the first day,spoilt on the first day
9,"never shopping here again, waste of money",never shopping here again waste of money


In [9]:
# Load the spaCy pipeline once at module level; the tokeniser below reuses it.
nlp = spacy.load("en_core_web_sm")
def spacy_tokeniser(sent):
    """Return the lemmas of `sent`, skipping tokens listed in my_stop_words.

    The text is stripped and lowercased before being parsed. Filtering is
    done on each token's surface text, while the value kept is its lemma,
    so a returned lemma may itself be a stop word.
    """
    normalised = sent.strip().lower()
    lemmas = []
    for tok in nlp(normalised):
        if tok.text in my_stop_words:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [10]:
# Add a "tokens" column: the result of spacy_tokeniser for each cleaned review.
reviews['tokens']= reviews['cleaned'].apply(spacy_tokeniser)

In [11]:
reviews

Unnamed: 0,reviews,cleaned,tokens
0,I can't believe I wasted my money on this key...,can believe wasted my money on this keyboard ...,"[believe, waste, money, keyboard, mouse]"
1,I really love this device,i really love this device,"[really, love, device]"
2,"It's quite good, but not so great. They can do...",it quite good but not so great they can do better,"[quite, good, not, great, well]"
3,"I hate this device, terrible!!!",i hate this device terrible,"[hate, device, terrible]"
4,Where the hell is my refund for this damn prod...,where the hell is my refund for this damn prod...,"[hell, refund, damn, product, mouse]"
5,I believe Ebay is so much better than the crap...,i believe ebay is so much better than the crap...,"[believe, ebay, well, crap]"
6,worst keyboard ever,worst keyboard ever,"[bad, keyboard, ever]"
7,distasteful to the fullest!!!,distasteful to the fullest,"[distasteful, full]"
8,spoilt on the first day,spoilt on the first day,"[spoilt, day]"
9,"never shopping here again, waste of money",never shopping here again waste of money,"[never, shop, waste, money]"


In [12]:
# Load the two pre-trained artefacts from the working directory. Judging by
# the file names these are presumably a PV-DBOW Doc2Vec model and an XGBoost
# classifier — confirm against the training notebook.
# Context managers close the file handles as soon as unpickling finishes.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# that come from a trusted source.
with open('pv_dbow_200_model.pkl', "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
with open('doc2vec_xgb_200.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [13]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# https://github.com/RaRe-Technologies/movie-plots-by-genre/blob/master/ipynb_with_output/Document%20classification%20with%20word%20embeddings%20tutorial%20-%20with%20output.ipynb
def vec_for_learning(model, tagged_docs):
    """Infer one vector per document using `model.infer_vector`.

    Parameters
    ----------
    model : object
        Anything exposing ``infer_vector(tokens)``, e.g. a trained gensim
        Doc2Vec model.
    tagged_docs : iterable
        The documents to embed. Each item is passed to ``infer_vector``
        unchanged, so items are expected to be token lists (despite the
        parameter name, not objects with a ``.words`` attribute).

    Returns
    -------
    list
        One inferred vector per input document, in input order.
    """
    # Use the documents passed in rather than a notebook-level global, so the
    # function works for any input and on a fresh kernel.
    return [model.infer_vector(doc) for doc in tagged_docs]

In [14]:
# One inferred vector per row of reviews['tokens'], using the unpickled vectorizer.
X_vectors = vec_for_learning(vectorizer, reviews['tokens'])

In [15]:
# Predict one label per inferred vector with the unpickled classifier.
y_pred = model.predict(X_vectors)

In [16]:
# Store the predictions as a plain Python list in a new "predictions" column.
# NOTE(review): in the output below the labels look like 1 = positive and
# 0 = negative — confirm against the training labels.
reviews["predictions"] = y_pred.tolist()
# Bare last expression so the notebook renders the frame.
reviews

Unnamed: 0,reviews,cleaned,tokens,predictions
0,I can't believe I wasted my money on this key...,can believe wasted my money on this keyboard ...,"[believe, waste, money, keyboard, mouse]",0
1,I really love this device,i really love this device,"[really, love, device]",1
2,"It's quite good, but not so great. They can do...",it quite good but not so great they can do better,"[quite, good, not, great, well]",1
3,"I hate this device, terrible!!!",i hate this device terrible,"[hate, device, terrible]",0
4,Where the hell is my refund for this damn prod...,where the hell is my refund for this damn prod...,"[hell, refund, damn, product, mouse]",0
5,I believe Ebay is so much better than the crap...,i believe ebay is so much better than the crap...,"[believe, ebay, well, crap]",0
6,worst keyboard ever,worst keyboard ever,"[bad, keyboard, ever]",0
7,distasteful to the fullest!!!,distasteful to the fullest,"[distasteful, full]",0
8,spoilt on the first day,spoilt on the first day,"[spoilt, day]",0
9,"never shopping here again, waste of money",never shopping here again waste of money,"[never, shop, waste, money]",0
