# Data Exploration

In [4]:
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from string import punctuation

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import pickle

## Test and Train

In [93]:
#Loading Data

train = pd.read_csv('products_sentiment_train.tsv', delimiter='\t')
test = pd.read_csv('products_sentiment_test.tsv', delimiter='\t')

print 'Train shape : ', train.shape
print 'Test shape : ', test.shape

Train shape :  (1999, 2)
Test shape :  (500, 2)


In [94]:
#Naming the columns

train.columns = ['text', 'target']
test.drop('Id', axis=1, inplace=True)

In [95]:
#Check

train.head()

Unnamed: 0,text,target
0,i downloaded a trial version of computer assoc...,1
1,the wrt54g plus the hga7t is a perfect solutio...,1
2,i dont especially like how music files are uns...,0
3,i was using the cheapie pail ... and it worked...,1
4,"you can manage your profile , change the contr...",1


In [96]:
#Check

test.head()

Unnamed: 0,text
0,"so , why the small digital elph , rather than ..."
1,3/4 way through the first disk we played on it...
2,better for the zen micro is outlook compatibil...
3,6 . play gameboy color games on it with goboy .
4,"likewise , i 've heard norton 2004 professiona..."


##  Overview by the Target

In [6]:
print 'Number of texts with the target == 1 : ', train[train['target']==1].shape[0]
print ('The percentage : %0.1f' % (100.*train[train['target']==1].shape[0]/train.shape[0]))
print '\nRandom 5 samples:'
for t in train.loc[train.target==1, ['text']].sample(5).values:
    print t

Number of texts with the target == 1 :  1273
The percentage : 63.7

Random 5 samples:
["while this phone obviously doesn 't have the same quality construction as motorola does , the nokia 6600 is one of the better phones i 've used ."]
["i 've only had it a week , but so far , everything about this camera is making me happy . "]
['it holds plenty of songs .']
['freebies you get : a cradle with detachable stand and belt clip , some thing very handy and useful .']
['- the replacable battery is great since once it eventually wears out ( as all lithium batteries do ) , you will be able to buy another easily . ']


In [7]:
print 'Number of texts with the target == 0 : ', train[train['target']==0].shape[0]
print ('The percentage : %0.1f' % (100.*train[train['target']==0].shape[0]/train.shape[0]))
print '\nRandom 5 samples:'
for t in train.loc[train.target==0, ['text']].sample(5).values:
    print t

Number of texts with the target == 0 :  726
The percentage : 36.3

Random 5 samples:
['and supply those stupid white headphones .']
['the cut-outs for the controls is not thought out as there is too much material in the way to adequately access the controls , especially the scroll wheel . ']
['it would not hang up on calls .']
['because steve jobs is a twisted individual and he made sure that he ruined this device by giving it an unreplacable 18 month battery .']
['controls are a bit awkward . ']


# Preprocessing

In [12]:
from nltk.corpus import wordnet
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocess(text):
    specials = ["’", "‘", "´", "`"]
    contraction_mapping = {"n't": "not", "'t": "not", "'d": "would", "'ll": "will", "'s": "is", 
                       "'ve": "have", "'m": "am", "'re": "are", "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": 
                       "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": 
                       "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would",
                       "he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": 
                       "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                       "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": 
                       "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                       "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", 
                       "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", 
                       "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": 
                       "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                       "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": 
                       "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", 
                       "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": 
                       "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", 
                       "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                       "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", 
                       "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": 
                       "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                       "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": 
                       "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": 
                       "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": 
                       "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  
                       "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", 
                       "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", 
                       "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": 
                       "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": 
                       "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", 
                       "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": 
                       "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": 
                       "you will have", "you're": "you are", "you've": "you have" }

    mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 
                'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 
                'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 
                'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 
                'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 
                'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 
                'exboyfriend': 'ex boyfriend', "whst": 'what', 'watsapp': 'whatsapp'}
    punct = [char for char in punctuation if char not in ['?', '!']]

    #Lowering, Removing Digits, Special Characters
    text = re.sub("[\d+-/_\"\(\)]", "", text.lower())

    #Replacing Contractions
    for s in specials:
        text = text.replace(s.decode('utf-8'), "'")
    text = ' '.join([contraction_mapping[t] if t in contraction_mapping 
                                            else t for t in text.split(" ")])
    #Replacing Mispells
    for word in mispell_dict.keys():
        text = text.replace(word, mispell_dict[word])

    #Removing Punctuation and Lemmatizing
    words = text.split()     
    tagged = nltk.pos_tag(words) 
    meaningful_words=[]
    for word, tag in tagged:
        if word in punct:
            continue
        wntag = get_wordnet_pos(tag)
        if wntag is None:
            meaningful_words.append(WordNetLemmatizer().lemmatize(word))
        else:
            meaningful_words.append(WordNetLemmatizer().lemmatize(word, pos=wntag))
    return(" ".join( meaningful_words ))  

In [99]:
train['treated_text'] = train['text'].apply(lambda x: preprocess(x))
test['treated_text'] = test['text'].apply(lambda x: preprocess(x))

## Lowering 

In [8]:
train['lowered_text'] = train['text'].apply(lambda x: x.lower())
test['lowered_text'] = test['text'].apply(lambda x: x.lower())

## Removing Digits and Special Characters

In [9]:
def remove_digits(text):
    return re.sub("[\d+-/_\"]", "", text.lower())

In [10]:
train['treated_text'] = train['lowered_text'].apply(lambda x: remove_digits(x))
test['treated_text'] = test['lowered_text'].apply(lambda x: remove_digits(x))

## Replacing Contractions

In [11]:
contraction_mapping = {"n't": "not", "'t": "not", "'d": "would", "'ll": "will", "'s": "is", 
                       "'ve": "have", "'m": "am", "'re": "are"}

def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

In [12]:
train['treated_text'] = train['treated_text'].apply(lambda x: clean_contractions(x, contraction_mapping))
test['treated_text'] = test['treated_text'].apply(lambda x: clean_contractions(x, contraction_mapping))

## Replacing Mispells

In [13]:
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling'}

def correct_spelling(x, dic):
    for word in dic.keys():
        x = x.replace(word, dic[word])
    return x

In [14]:
train['treated_text'] = train['treated_text'].apply(lambda x: correct_spelling(x, mispell_dict))
test['treated_text'] = test['treated_text'].apply(lambda x: correct_spelling(x, mispell_dict))

## Removing Punctuation and Lemmatizing

In [15]:
punctuation = [char for char in punctuation if char not in ['?', '!']]

def final_prep(text):
    words = text.split()               
    meaningful_words = [WordNetLemmatizer().lemmatize(w) for w in words if w not in punctuation]   
    return(" ".join( meaningful_words ))   

In [16]:
train['treated_text'] = train['treated_text'].apply(lambda x: final_prep(x))
test['treated_text'] = test['treated_text'].apply(lambda x: final_prep(x))

## Checking

In [17]:
for q in train['treated_text'].sample(5):
    print q, '\n'

i find the lack of entertaining game on this phone quite disturbing 

i have ice age and it keep telling me no disc 

the ear bud that come with it look cheap but the sound quality is amazing 

all the good review for the sd are true 

the machine itself seems fine but the software that came with it is awful 



In [18]:
for q in test['treated_text'].sample(5):
    print q, '\n'

i love this camera 

i did have to put a little work into renaming some duplicate file name to get all my music on my zen xtra but it wa not a big problem 

pro large hard drive for the gb and gb are both affordable 

compared to musicmatch the software ha a better filing system and easier to use 

there seem to be fewer collisons and dropped packet a i read from the router log than with my old dlink router 



# Classifier Selection

## CountVec + Logistic Regression

In [19]:
count_lr_pipe = Pipeline([
    ("vectorizer", CountVectorizer(analyzer='word', ngram_range=(1,3))),
    ("classifier", LogisticRegression(class_weight='balanced'))])

In [20]:
count_lr_cv = cross_val_score(count_lr_pipe, train['treated_text'], train['target'], scoring='accuracy', cv=5)
print count_lr_cv.mean(), count_lr_cv.std()



0.7878966931043319 0.006490023507407363


## Tf-Idf + Logistic Regression

In [21]:
tfidf_lr_pipe = Pipeline([
    ("vectorizer", TfidfVectorizer(analyzer='word', ngram_range=(1,3))),
    ("classifier", LogisticRegression(class_weight='balanced'))])

In [22]:
tfidf_lr_cv = cross_val_score(tfidf_lr_pipe, train['treated_text'], train['target'], scoring='accuracy', cv=5)
print tfidf_lr_cv.mean(), tfidf_lr_cv.std()

0.7863891868074175 0.005966513656341954


## Tf-Idf + Logistic Regression + StopWords

In [23]:
stops = set(stopwords.words("english"))  

tfidf_lr_s_pipe = Pipeline([
    ("vectorizer", TfidfVectorizer(analyzer='word', ngram_range=(1,3), stop_words=stops)),
    ("classifier", LogisticRegression(class_weight='balanced'))])

In [24]:
tfidf_lr_s_cv = cross_val_score(tfidf_lr_s_pipe, train['treated_text'], train['target'], scoring='accuracy', cv=5)
print tfidf_lr_s_cv.mean(), tfidf_lr_s_cv.std()

0.7513515365721036 0.016547232790117102


## Tf-Idf + LinearSVC

In [25]:
tfidf_svc_pipe = Pipeline([
    ("vectorizer", TfidfVectorizer(analyzer='word', ngram_range=(1,3))),
    ("classifier", LinearSVC(C=2.3))])

In [26]:
tfidf_svc_cv = cross_val_score(tfidf_svc_pipe, train['treated_text'], train['target'], scoring='accuracy', cv=5)
print tfidf_svc_cv.mean(), tfidf_svc_cv.std()

0.7928917305733161 0.007446575777154651


##  Tf-Idf + Random Forest

In [27]:
tfidf_rf_pipe = Pipeline([
    ("vectorizer", TfidfVectorizer(analyzer='word', ngram_range=(1,3))),
    ("classifier", RandomForestClassifier(n_estimators = 50, class_weight='balanced'))])

In [28]:
tfidf_rf_cv = cross_val_score(tfidf_rf_pipe, train['treated_text'], train['target'], scoring='accuracy', cv=5)
print tfidf_rf_cv.mean(), tfidf_rf_cv.std()

0.7413602647516548 0.011694874588635312


# Predictions

In [100]:
ans = pd.read_csv('products_sentiment_sample_submission.csv')

In [101]:
model = tfidf_svc_pipe.fit(train['treated_text'], train['target'])
ans['y'] = model.predict(test['treated_text'])

In [102]:
ans.to_csv('answer.csv', index=False)

## Saving a model

In [34]:
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,3))
vectorizer = vectorizer.fit(train['treated_text'], train['target'])

In [35]:
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))

In [37]:
model = LinearSVC(C=2.3)
model = model.fit(vectorizer.transform(train['treated_text']), train['target'])

In [38]:
pickle.dump(model, open('model.pkl', 'wb'))

# More data

In [13]:
from nltk.corpus import movie_reviews
 
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negfeats = [" ".join(movie_reviews.words(fileids=[f])) for f in negids]
posfeats = [" ".join(movie_reviews.words(fileids=[f])) for f in posids]

texts = negfeats + posfeats
labels = [0] * len(negfeats) + [1] * len(posfeats)

In [14]:
mv_data = pd.DataFrame(
    {'text': texts,
     'target': labels
    })

In [15]:
mv_data['treated_text'] = mv_data['text'].apply(lambda x: preprocess(x))

In [16]:
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,3))
vectorizer = vectorizer.fit(mv_data['treated_text'], mv_data['target'])

In [19]:
model = LogisticRegression(C=2.3)
model = model.fit(vectorizer.transform(mv_data['treated_text']), mv_data['target'])



In [20]:
pickle.dump(vectorizer, open('vectorizer_mv.pkl', 'wb'))
pickle.dump(model, open('model_mv.pkl', 'wb'))

In [21]:
res = cross_val_score(model,vectorizer.transform(mv_data['treated_text']), mv_data['target'], scoring='accuracy', cv=5)

array([0.7975, 0.7925, 0.8175, 0.8275, 0.8275])

In [25]:
print 'Number of texts with the target == 1 : ', mv_data[mv_data['target']==0].shape[0]
print ('The percentage : %0.1f' % (100.*mv_data[mv_data['target']==0].shape[0]/mv_data.shape[0]))
print '\nRandom 5 samples:'
for t in mv_data.loc[mv_data.target==1, ['text']].sample(5).values:
    print t

Number of texts with the target == 1 :  1000
The percentage : 50.0

Random 5 samples:
[u'assume nothing . the phrase is perhaps one of the most used of the 1990 \' s , as first impressions and rumors are hardly ever what they seem to be . the phrase especially goes for oscar novak , an architect who is the main focus of three to tango , a delightful , funny romantic comedy about assumptions and being yourself . novak ( matthew perry ) , a shy , clumsy , chicago based architect , along with openly gay partner , peter steinberg ( oliver platt ) , fights for projects day in and day out . one of these is the job of restoring a popular building for charles newman ( dylan mcdermott ) , a rich , well - known businessman . charles immediately takes a liking to oscar , as he enjoys his personality and sense of humor . seeing oscar as someone he could trust , charles asks him to watch his girlfriend , an unpredictable , adventurous girl named amy post ( neve campbell ) , who makes a living by bl