<a href="https://colab.research.google.com/github/duybluemind1988/Data-science/blob/master/NLP/Kaggle_movie_sentiment/Movies_review_sentiment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 31 20:05:23 2017
@author: DIP
@Copyright: Dipanjan Sarkar
"""

from sklearn import metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.preprocessing import label_binarize
from scipy import interp
from sklearn.metrics import roc_curve, auc 


def get_metrics(true_labels, predicted_labels):
    """Print accuracy plus weighted precision, recall and F1, rounded to 4 places.

    Args:
        true_labels: ground-truth labels.
        predicted_labels: labels predicted by a model, aligned with ``true_labels``.

    Returns:
        None. The four scores are written to stdout.
    """
    accuracy = metrics.accuracy_score(true_labels, predicted_labels)
    print('Accuracy:', np.round(accuracy, 4))

    # The remaining scores all use the same support-weighted class average.
    weighted_scorers = (
        ('Precision:', metrics.precision_score),
        ('Recall:', metrics.recall_score),
        ('F1 Score:', metrics.f1_score),
    )
    for caption, scorer in weighted_scorers:
        score = scorer(true_labels, predicted_labels, average='weighted')
        print(caption, np.round(score, 4))
                        

def train_predict_model(classifier,
                        train_features, train_labels,
                        test_features, test_labels):
    """Fit ``classifier`` on the training data and return its test predictions.

    Args:
        classifier: object exposing ``fit(X, y)`` and ``predict(X)``.
        train_features, train_labels: data passed to ``fit``.
        test_features: data passed to ``predict``.
        test_labels: accepted for signature symmetry; not used here.

    Returns:
        Whatever ``classifier.predict(test_features)`` returns.
    """
    classifier.fit(train_features, train_labels)
    return classifier.predict(test_features)


def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]):
    """Print the confusion matrix as a DataFrame with 'Actual:'/'Predicted:' headers.

    Args:
        true_labels: ground-truth labels.
        predicted_labels: predicted labels aligned with ``true_labels``.
        classes: class labels, in the row/column order to display.

    Returns:
        None. The labelled matrix is printed.
    """
    class_order = list(classes)
    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels,
                                  labels=class_order)
    # from_product builds the same two-level header as the old
    # MultiIndex(levels=..., labels=...) call, whose `labels` keyword was
    # removed from pandas (renamed to `codes`).
    cm_frame = pd.DataFrame(data=cm,
                            columns=pd.MultiIndex.from_product([['Predicted:'], class_order]),
                            index=pd.MultiIndex.from_product([['Actual:'], class_order]))
    print(cm_frame)


def display_confusion_matrix_pretty(true_labels, predicted_labels, classes=[1,0]):
    """Return the confusion matrix as a DataFrame with 'Actual:'/'Predicted:' headers.

    Args:
        true_labels: ground-truth labels.
        predicted_labels: predicted labels aligned with ``true_labels``.
        classes: class labels, in the row/column order to display.

    Returns:
        pandas.DataFrame whose rows are actual classes and columns predicted classes.
    """
    class_order = list(classes)
    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels,
                                  labels=class_order)
    # from_product replaces MultiIndex(levels=..., labels=...): the `labels`
    # keyword was removed from pandas (renamed to `codes`).
    cm_frame = pd.DataFrame(data=cm,
                            columns=pd.MultiIndex.from_product([['Predicted:'], class_order]),
                            index=pd.MultiIndex.from_product([['Actual:'], class_order]))
    return cm_frame
    
def display_classification_report(true_labels, predicted_labels, classes=[1,0]):
    """Print the classification report restricted to ``classes``.

    Args:
        true_labels: ground-truth labels.
        predicted_labels: predicted labels aligned with ``true_labels``.
        classes: labels to include in the report, in display order.
    """
    print(metrics.classification_report(y_true=true_labels,
                                        y_pred=predicted_labels,
                                        labels=classes))
    
    
    
def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]):
    """Print summary metrics, the classification report and the confusion matrix.

    Args:
        true_labels: ground-truth labels.
        predicted_labels: predicted labels aligned with ``true_labels``.
        classes: class labels forwarded to the report and confusion-matrix helpers.
    """
    rule = '-'*30

    print('Model Performance metrics:')
    print(rule)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)

    print('\nModel Classification report:')
    print(rule)
    display_classification_report(true_labels=true_labels,
                                  predicted_labels=predicted_labels,
                                  classes=classes)

    print('\nPrediction Confusion Matrix:')
    print(rule)
    display_confusion_matrix(true_labels=true_labels,
                             predicted_labels=predicted_labels,
                             classes=classes)


def plot_model_decision_surface(clf, train_features, train_labels,
                                plot_step=0.02, cmap=plt.cm.RdYlBu,
                                markers=None, alphas=None, colors=None):
    """Fit a clone of ``clf`` on two features and plot its decision surface.

    Args:
        clf: estimator accepted by ``sklearn.base.clone``; the clone is fitted,
            not ``clf`` itself.
        train_features: 2-D array with exactly two columns, indexed NumPy-style.
        train_labels: labels aligned with the rows of ``train_features``.
        plot_step: spacing of the evaluation grid; also used as padding around
            the observed feature range.
        cmap: colormap for the filled contour.
        markers, alphas: optional per-class sequences, indexed by encoded class.
        colors: optional per-class colours. May be a string of single-letter
            codes ('rb') or a sequence of colour specs (['red', '#0000ff']).

    Raises:
        ValueError: if ``train_features`` does not have exactly two columns.
    """
    if train_features.shape[1] != 2:
        raise ValueError("X_train should have exactly 2 columnns!")

    # Evaluation grid, padded by one step beyond the data on each axis.
    x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step
    y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    clf_est = clone(clf)
    clf_est.fit(train_features, train_labels)
    if hasattr(clf_est, 'predict_proba'):
        # Shade by the probability in column 1 of predict_proba.
        Z = clf_est.predict_proba(grid_points)[:, 1]
    else:
        Z = clf_est.predict(grid_points)
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=cmap)

    le = LabelEncoder()
    y_enc = le.fit_transform(train_labels)
    n_classes = len(le.classes_)
    # list() keeps each colour spec whole. The previous ''.join(colors) broke
    # multi-character specs such as 'red' into single letters.
    plot_colors = list(colors) if colors else [None] * n_classes
    label_names = le.classes_
    markers = markers if markers else [None] * n_classes
    alphas = alphas if alphas else [None] * n_classes
    for i, color in zip(range(n_classes), plot_colors):
        class_mask = y_enc == i
        plt.scatter(train_features[class_mask, 0], train_features[class_mask, 1], c=color,
                    label=label_names[i], cmap=cmap, edgecolors='black',
                    marker=markers[i], alpha=alphas[i])
    plt.legend()
    plt.show()


def plot_model_roc_curve(clf, features, true_labels, label_encoder=None, class_names=None):
    """Plot ROC curve(s) for a fitted binary or multi-class classifier.

    Class labels are taken from ``clf.classes_`` when present, otherwise from
    ``label_encoder.classes_``, otherwise from ``class_names``.

    Args:
        clf: fitted estimator exposing ``predict_proba`` or ``decision_function``.
        features: feature matrix to score.
        true_labels: ground-truth labels aligned with ``features``.
        label_encoder: optional object with a ``classes_`` attribute.
        class_names: optional explicit sequence of class labels.

    Raises:
        ValueError: if no class labels can be derived, or fewer than 2 classes.
        AttributeError: if ``clf`` has neither scoring method.
    """
    # Per-class curves are keyed by class position; "micro"/"macro" hold averages.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    if hasattr(clf, 'classes_'):
        class_labels = clf.classes_
    elif label_encoder:
        class_labels = label_encoder.classes_
    elif class_names:
        class_labels = class_names
    else:
        raise ValueError('Unable to derive prediction classes, please specify class_names!')
    n_classes = len(class_labels)
    y_test = label_binarize(true_labels, classes=class_labels)
    if n_classes == 2:
        if hasattr(clf, 'predict_proba'):
            # Last column of predict_proba is used as the positive-class score.
            y_score = np.asarray(clf.predict_proba(features))[:, -1]
        elif hasattr(clf, 'decision_function'):
            scores = np.asarray(clf.decision_function(features))
            # A binary decision_function may return a 1-D array; indexing it
            # as 2-D (as the previous code did) raises IndexError.
            y_score = scores if scores.ndim == 1 else scores[:, -1]
        else:
            raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")

        fpr, tpr, _ = roc_curve(y_test, y_score)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label='ROC curve (area = {0:0.2f})'
                                 ''.format(roc_auc),
                 linewidth=2.5)

    elif n_classes > 2:
        if hasattr(clf, 'predict_proba'):
            y_score = clf.predict_proba(features)
        elif hasattr(clf, 'decision_function'):
            y_score = clf.decision_function(features)
        else:
            raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")

        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        ## Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        ## Compute macro-average ROC curve and ROC area
        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        # Then interpolate every per-class curve at those points. np.interp is
        # the function the deprecated scipy.interp alias pointed to.
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
        # Finally average it and compute AUC
        mean_tpr /= n_classes
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        ## Plot ROC curves
        plt.figure(figsize=(6, 4))
        plt.plot(fpr["micro"], tpr["micro"],
                 label='micro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["micro"]), linewidth=3)

        plt.plot(fpr["macro"], tpr["macro"],
                 label='macro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["macro"]), linewidth=3)

        for i, label in enumerate(class_labels):
            plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                           ''.format(label, roc_auc[i]),
                     linewidth=2, linestyle=':')
    else:
        raise ValueError('Number of classes should be atleast 2 or more')

    # Chance diagonal and shared axis decoration.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

In [7]:
# Locations of the Kaggle train/test TSV files.
# NOTE(review): these are absolute Google Drive paths as mounted in Colab; the
# notebook will not find the data elsewhere without editing them.
train_path='/content/drive/My Drive/Data/NLP/Kaggle_movie_review_sentiment/train.tsv'
test_path='/content/drive/My Drive/Data/NLP/Kaggle_movie_review_sentiment/test.tsv'

In [8]:
import pandas as pd
import numpy as np

In [9]:
# Load the tab-separated training file, then show its shape and first rows.
train_df=pd.read_csv(train_path,sep='\t')
print(train_df.shape)
train_df.head()

(156060, 4)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [10]:
# Count rows per Sentiment value to see how balanced the classes are.
train_df['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [11]:
# Features are the raw phrases; the target is the Sentiment column.
X, y = train_df['Phrase'], train_df['Sentiment']

In [12]:
# Preview the phrase Series.
X

0         A series of escapades demonstrating the adage ...
1         A series of escapades demonstrating the adage ...
2                                                  A series
3                                                         A
4                                                    series
                                ...                        
156055                                            Hearst 's
156056                            forced avuncular chortles
156057                                   avuncular chortles
156058                                            avuncular
156059                                             chortles
Name: Phrase, Length: 156060, dtype: object

In [13]:
# Preview the sentiment Series.
y

0         1
1         2
2         2
3         2
4         2
         ..
156055    2
156056    1
156057    3
156058    2
156059    2
Name: Sentiment, Length: 156060, dtype: int64

In [14]:
# Load the tab-separated test file, then show its shape and first rows.
test_df=pd.read_csv(test_path,sep='\t')
print(test_df.shape)
test_df.head()

(66292, 3)


Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


The test file has no labels, so we split the training set into train and test subsets.

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 30% of the labelled data. stratify=y asks for the same class
# proportions in both parts; random_state fixes the shuffle.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,
                                               stratify=y,random_state=42)

In [17]:
# Shapes of the four split parts, in the order X_train, y_train, X_test, y_test.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

(109242,)
(109242,)
(46818,)
(46818,)


In [18]:
# Class proportions in the training part of the split.
y_train.value_counts(normalize=True)

2    0.509950
3    0.210990
1    0.174759
4    0.058988
0    0.045312
Name: Sentiment, dtype: float64

In [19]:
# Class proportions in the held-out part, for comparison with the training part.
y_test.value_counts(normalize=True)

2    0.509932
3    0.210987
1    0.174762
4    0.058994
0    0.045324
Name: Sentiment, dtype: float64

# Text Classification - I

In [None]:
#!pip install text_normalizer

In [1]:
!pip install demoji

Collecting demoji
  Downloading https://files.pythonhosted.org/packages/da/0b/d008f26ebbfd86d21117267e627f2f7359c76e5ecbeba08d8f631f4092c4/demoji-0.2.1-py2.py3-none-any.whl
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/c9/dc/45cdef1b4d119eb96316b3117e6d5708a08029992b2fee2c143c7a0a5cc5/colorama-0.4.3-py2.py3-none-any.whl
Installing collected packages: colorama, demoji
Successfully installed colorama-0.4.3 demoji-0.2.1


In [2]:
'''
Contains helper functions for preprocessing of twitter documents before getting their corresponding vectors
from word2vec_twitter model
Author: Anuj Gupta
'''

'''
functionality:
Discard non-english tweets
Discard Replies
Discard RTs
For now this is handle while fetch tweet text from mongo documents
process constantbrands mentions
process any other mentions
process constant brand name if any 
process hashtags
process URLs
process websites
process process_EmailIds
process 
process alphanums
'''

import re
import string
import demoji
demoji.download_codes()
from nltk.tokenize import TweetTokenizer

# Module-level regex fragments and constants.
# PunctChars: character class matching one punctuation character.
PunctChars = r'''[`'“".?!,:;]'''
# Punct: one or more punctuation characters (not referenced in the visible code).
Punct = '%s+' % PunctChars
# Entity: the HTML entities &amp; &lt; &gt; &quot;
Entity = '&(amp|lt|gt|quot);'
# Set of the characters in string.printable (not referenced in the visible code).
printable = set(string.printable)

# helper functions
def regex_or(*items):
    '''
    Join the given regex fragments into one capturing alternation group: (a|b|c)
    '''
    return '(' + '|'.join(items) + ')'

def pos_lookahead(r):
    '''
    Wrap regex fragment r in a positive lookahead: (?=r)
    '''
    return ''.join(('(?=', r, ')'))

def neg_lookahead(r):
    '''
    Wrap regex fragment r in a negative lookahead: (?!r)
    '''
    return ''.join(('(?!', r, ')'))

def optional(r):
    '''
    Wrap regex fragment r in an optional capturing group: (r)?
    '''
    template = '(%s)?'
    return template % r

def trim(transient_tweet_text):
    '''
    Return the text without leading and trailing whitespace.
    '''
    trimmed = transient_tweet_text.strip()
    return trimmed

def strip_whiteSpaces(transient_tweet_text):
    '''
    Collapse every run of whitespace characters into a single space.
    Leading/trailing whitespace is reduced to one space, not removed.
    '''
    return re.sub(r'[\s]+', ' ', transient_tweet_text)

def to_LowerCase(transient_tweet_text):
    '''
    Return the text converted to lower case.
    '''
    return transient_tweet_text.lower()

def prune_multple_consecutive_same_char(transient_tweet_text):
    '''
    Squeeze any run of the same character down to exactly two of it:
    yesssssssss -> yess, ssssssssssh -> ssh
    '''
    squeezed = re.sub(r'(.)\1+', r'\1\1', transient_tweet_text)
    return squeezed

def remove_spl_words(transient_tweet_text):
    '''
    Replace the HTML entity '&amp;' with the word ' and ' (padded with spaces).
    '''
    return transient_tweet_text.replace('&amp;',' and ')

def strip_unicode(transient_tweet_text):
    '''
    Drop every character whose code point is 128 or above, keeping ASCII only.
    '''
    ascii_chars = filter(lambda ch: ord(ch) < 128, transient_tweet_text)
    return ''.join(ascii_chars)

def process_URLs(transient_tweet_text):
	'''
	Replace every URL in the text with the placeholder ' constanturl '.

	A URL is recognised either by an explicit start (http://, https://, www.,
	bit.ly/) or by a host name ending in one of the listed TLDs. A second pass
	replaces leftovers that begin with 'htt'.
	Matching is case-insensitive for the first pass only.
	'''
	# NOTE(review): the '.' in r'bit.ly/' is unescaped, so it matches any character.
	UrlStart1 = regex_or('https?://', r'www\.',r'bit.ly/')
	# NOTE(review): 'info' is listed twice; harmless but redundant.
	CommonTLDs = regex_or('com','co\\.uk','org','net','info','ca','biz','info','edu','in','au')
	# NOTE(review): inside a character class \b means backspace, not a word boundary.
	UrlStart2 = r'[a-z0-9\.-]+?' + r'\.' + CommonTLDs + pos_lookahead(r'[/ \W\b]')
	UrlBody = r'[^ \t\r\n<>]*?'  # * not + for case of:  "go to bla.com." -- don't want period
	# Trailing punctuation / HTML entities that should not be swallowed into the URL.
	UrlExtraCrapBeforeEnd = '%s+?' % regex_or(PunctChars, Entity)
	UrlEnd = regex_or( r'\.\.+', r'[<>]', r'\s', '$')
	Url = 	(optional(r'\b') + 
    		regex_or(UrlStart1, UrlStart2) + 
    		UrlBody + 
    pos_lookahead( optional(UrlExtraCrapBeforeEnd) + UrlEnd))

	Url_RE = re.compile("(%s)" % Url, re.U|re.I)
	transient_tweet_text = re.sub(Url_RE, " constanturl ", transient_tweet_text)

	# fix to handle unicodes in URL
	# Matches 'htt' at a word boundary, then any of p : / and runs that start
	# with a backslash, 'x' or 'u' followed by lowercase letters/digits.
	URL_regex2 = r'\b(htt)[p\:\/]*([\\x\\u][a-z0-9]*)*'
	transient_tweet_text = re.sub(URL_regex2, " constanturl ", transient_tweet_text)
	return transient_tweet_text

def process_Websites(transient_tweet_text):
    '''
    Replace bare website names (host ending in a listed TLD) with ' constantwebsite '.
    Matching is case-insensitive.

    NOTE(review): the '?' after the (?<!#) lookbehind makes that lookbehind
    optional, so it does not appear to exclude hashtags -- confirm intent.
    '''
    tld_group = regex_or('com','co\\.uk','org','net','info','ca','biz','info','edu','in','au')
    host_pattern = r'(?<!#)?(\b)?[a-zA-Z0-9.]+' + r'[.]' + tld_group
    compiled = re.compile("(%s)" % host_pattern, re.U|re.I)
    return re.sub(compiled, ' constantwebsite ', transient_tweet_text)

def process_EmailIds(transient_tweet_text):
    '''
    Replace every e-mail address with the placeholder ' constantemailid '.
    '''
    pattern = r'(\b)?[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(\b)?'
    return re.sub(pattern, ' constantemailid ', transient_tweet_text)

def process_Mentions(transient_tweet_text):
    '''
    Replace every @mention with the placeholder ' constantnonbrandmention '.
    '''
    mention_pattern = r"@(\w+)"
    return re.sub(mention_pattern, " constantnonbrandmention ", transient_tweet_text)
def process_HashTags(transient_tweet_text):
    '''
    Replace every #hashtag with the placeholder ' constanthashtag '.
    '''
    hashtag_pattern = r"#(\w+)\b"
    return re.sub(hashtag_pattern, ' constanthashtag ', transient_tweet_text)

def process_Dates(transient_tweet_text):
	'''
	Replace date-like substrings with the placeholder ' constantdate '.

	Several patterns are applied in sequence (numeric dates, ordinal + month,
	month + day, bare years); a final pass merges adjacent placeholders.
	The month names are lowercase, so the input is expected to be lowercased.
	'''
	#transient_tweet_text = re.sub(r'(\d+/\d+/\d+)', " constantdate " , transient_tweet_text)
	#transient_tweet_text = re.sub(r'constantnum[\s]?(/|-)[\s]?constantnum[\s]?(/|-)[\s]?constantnum', " constantdate " , transient_tweet_text)
	#date_regex = r'(constantnum)[\s]*(st|nd|rd|th)[\s]*(january|jan|february|feb|march|mar|april|may|june|jun|july|august|aug|september|sep|october|oct|november|nov|december|dec)'
	# 1-2 digit number, a '-', '.' or '/' separator, another number, optional 19xx/20xx year.
	# NOTE(review): the year is optional, so ratios and decimals such as "9/10"
	# or "3.5" also match this pattern.
	date_regex1 = r'\b((0|1|2|3)?[0-9][\s]*)[-./]([\s]*([012]?[0-9])[\s]*)([-./]([\s]*(19|20)[0-9][0-9]))?\b'
	transient_tweet_text = re.sub(date_regex1, ' constantdate ' , transient_tweet_text)
	# Same idea with an optional leading 19xx/20xx year.
	date_regex2 = r'\b((19|20)[0-9][0-9][\s]*[-./]?)?[\s]*([012]?[0-9])[\s]*[-./][\s]*(0|1|2|3)?[0-9]\b'
	transient_tweet_text = re.sub(date_regex2, ' constantdate ' , transient_tweet_text)

	Months = regex_or('january','jan','february','feb','march','mar','april','may','june','jun','july','jul','august','aug','september','sep','october','oct','november','nov','december','dec')
	# Ordinal day followed by a month name, e.g. "3rd march".
	date_regex3 = r'\d+[\s]*(st|nd|rd|th)[\s]*' + Months 
	transient_tweet_text = re.sub(date_regex3, ' constantdate ', transient_tweet_text)
	# Month name followed by a day number with an optional ordinal suffix.
	date_regex4 = Months + r'[\s]*\d+[\s]*(st|nd|rd|th)*\b'
	transient_tweet_text = re.sub(date_regex4, ' constantdate ' , transient_tweet_text)
	# A bare four-digit year starting with 19 or 20.
	date_regex5 = r'[\s]?\b(19|20)[0-9][0-9]\b[\s]?'
	transient_tweet_text = re.sub(date_regex5, ' constantdate ' , transient_tweet_text)

	# Merge runs of consecutive placeholders into a single one.
	date_regex6 = r'([\s]*(constantdate))+'
	transient_tweet_text = re.sub(date_regex6, ' constantdate ' , transient_tweet_text)

	return transient_tweet_text

def process_Times(transient_tweet_text):
    '''
    Replace clock times (h:mm or hh:mm, optional am/pm and time-zone suffix)
    with the placeholder ' constanttime '.
    '''
    clock_pattern = r'([0-9]|0[0-9]|1[0-9]|2[0-3]):[0-5][0-9][\s]*(am|pm)?[\s]*([iescm](st)|gmt|utc|[pmce](dt))?'
    return re.sub(clock_pattern, ' constanttime ' , transient_tweet_text)

def process_BrandMentions(transient_tweet_text):
    '''
    Replace @-mentions of the listed brand handles with ' constantbrandmention '.

    The handles are lowercase and matched case-sensitively, so the text is
    expected to be lowercased first.
    '''
    # Alternatives are tried left to right, so a longer handle must come before
    # any handle that is a prefix of it. 'makemytripcare' now precedes
    # 'makemytrip'; before, '@makemytripcare' left a stray 'care' behind.
    constant_brands = regex_or('paytmcare', 'paytm','snapdeal_help','snapdeal','bluestone_com','shopohelp','shopo','mobikwikswat',
        'mobikwik','taxiforsure','tfscares','zoomcarindia','freshmenuindia','freshmenucares','grofers','jetairways',
        'cleartrip','olacabs','support_ola','makemytripcare','makemytrip','chai_point')

    BrandMentionRegex = r'@(\b)*' + constant_brands
    return re.sub(BrandMentionRegex, ' constantbrandmention ', transient_tweet_text)

def process_NonBrandMentions(transient_tweet_text):
    '''
    Replace every remaining @mention with ' constantnonbrandmention '.
    '''
    mention_pattern = r"@(\w+)"
    return re.sub(mention_pattern, ' constantnonbrandmention ', transient_tweet_text)

def process_BrandName(transient_tweet_text):
    '''
    Replace occurrences of the listed brand names with ' constantbrandname '.

    NOTE(review): the pattern has no word boundaries, so short names such as
    'ola' or 'jet' also match inside longer words -- confirm this is intended.
    '''
    # Alternatives are tried left to right, so a longer name must come before
    # any name that is a prefix of it. 'makemytripcare' now precedes
    # 'makemytrip'; before, it was replaced only partially.
    constant_brands = regex_or('paytmcare', 'paytm','snapdeal_help','snapdeal','bluestone_com', 'bluestone','shopohelp','shopo',
        'mobikwikswat','mobikwik','taxiforsure','tfscares', 'tfs','zoomcarindia', 'zoomcar','freshmenuindia','freshmenucares',
        'freshmenu','grofers','jetairways', 'jet','cleartrip','olacabs','support_ola', 'olacab', 'ola' ,'makemytripcare',
        'makemytrip', 'chai_point')

    BrandNameRegex = constant_brands
    return re.sub(BrandNameRegex, ' constantbrandname ', transient_tweet_text)

def identify_Savings(transient_tweet_text):
    '''
    Replace sale/save/discount wording, together with any trailing amount or
    percentage made of 'constantnum' placeholders, with ' constantdiscount '.
    Words directly preceded by '#' are left alone.
    '''
    savings_pattern = r'(?<!#)\b(discount|discounts|sale|save|saver|super[\s]*saver|super[\s]*save)\b[\s]*((rs|\$)*[\s]*(constantnum))*[\s]*[%]?[\s]*(-|~|or)?[\s]*((rs|\$)*[\s]*(constantnum))*[\s]*[%]?'
    return re.sub(savings_pattern, " constantdiscount ", transient_tweet_text)

def indentify_Offers(transient_tweet_text):
    '''
    Replace offer wording with the placeholder ' constantoffer '.

    Covers substrings such as "constantnum % off", "up to constantnum % cashback"
    or "rs constantnum off" (numbers are expected to have been replaced by
    'constantnum' already), and the standalone words cash / cashback / cash back.
    Words directly preceded by '#' are left alone.
    '''
    # The keyword group ends with \b so that only whole words are replaced;
    # without it "offensive" became " constantoffer ensive".
    # (?:-|~|or) is a real alternation; the former [-|~|or] was a character
    # class matching a single '-', '|', '~', 'o' or 'r'.
    # 'cash back' is listed before 'cash' so the two-word form can match.
    offer_pattern = (r'(?<!#)\b(?:(up[\s]?to)?((rs|\$)*[\s]*(constantnum))[\s]*[%]?)?'
                     r'[\s]*(?:-|~|or)?[\.]?[\s]*((rs|\$)*[\s]*(constantnum))*[\s]*[%]?[\s]*'
                     r'(offer|off|cashback|cash back|cash)\b')
    transient_tweet_text = re.sub(offer_pattern, " constantoffer ", transient_tweet_text)
    transient_tweet_text = re.sub(r'(?<!#)\b(?:cashback|cash back|cash)\b', " constantoffer ", transient_tweet_text)
    return transient_tweet_text

def indentify_Promos(transient_tweet_text):
    '''
    Replace promo/coupon wording with the placeholder ' constantpromo '.

    Assumption: a promo code immediately follows the promo/code keyword. The
    keyword is replaced together with a following 'constantalphanum', word or
    number; a final pass replaces any bare keyword that is left.
    Words directly preceded by '#' are left alone.
    '''
    # Order matters: the keyword+code forms are tried before the bare keyword.
    promo_patterns = (
        r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code)\b[\s]*(constantalphanum)\b',
        r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code)\b[\s]*[a-z]+\b',
        r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code)\b[\s]*[0-9]+\b',
        r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code|coupon)[s]?\b',
    )
    for promo_pattern in promo_patterns:
        transient_tweet_text = re.sub(promo_pattern, " constantpromo ", transient_tweet_text)
    return transient_tweet_text

def indentify_Money(transient_tweet_text):
    '''
    Replace money amounts that remain after offer handling with ' constantmoney '.

    Works on text where numbers are already 'constantnum' / 'constantalphanum':
    a 'rs' or '$' prefix before a number, a number with a cent/c/$ suffix, or
    a 'rs'/'$' prefix before an alphanumeric placeholder.
    '''
    money_patterns = (
        r'\b(rs|\$)[\s]*(constantnum)?[\.]?[\s]*constantnum\b',
        r'[\s]*[\.]?[\s]*constantnum(cent[s]?|\$|c)\b',
        r'(\$|rs)[\s]*constantalphanum',
    )
    for money_pattern in money_patterns:
        transient_tweet_text = re.sub(money_pattern, " constantmoney ", transient_tweet_text)
    return transient_tweet_text

def indentify_freebies(transient_tweet_text):
    '''
    Replace 'free <word>' (free shipping, free trial, ...) with ' constantfreebies '.
    A second pass also replaces 'free' followed by whitespace and no letters.
    Words directly preceded by '#' are left alone.
    '''
    for freebie_pattern in (r'(?<!#)\b(?:free)[\s]+[a-z]+\b',
                            r'(?<!#)\b(?:free)[\s]+[a-z]*\b'):
        transient_tweet_text = re.sub(freebie_pattern, " constantfreebies ", transient_tweet_text)
    return transient_tweet_text

def replace_numbers(transient_tweet_text):
    '''
    Replace every number with the placeholder ' constantnum '.
    Numbers directly preceded by '#' (i.e. inside a hashtag) are left alone.
    '''
    number_pattern = r'(?<!#)\b(?:[-+]?[\d,]*[\.]?[\d,]*[\d]+|\d+)\b'
    return re.sub(number_pattern, " constantnum " , transient_tweet_text)

def identify_AlphaNumerics(transient_tweet_text):
    '''
    Replace tokens mixing lowercase letters and digits (product codes, promo
    codes, order ids) with the placeholder ' constantalphanum '.
    Tokens directly preceded by '#' are left alone.
    '''
    alnum_pattern = r'(?<!#)\b(?:([a-z]+[0-9]+[a-z]*|[a-z]*[0-9]+[a-z]+)[a-z,0-9]*)\b'
    return re.sub(alnum_pattern, " constantalphanum ", transient_tweet_text)

def strip_whiteSpaces(transient_tweet_text):
    '''
    Collapse every run of whitespace characters into a single space.

    NOTE(review): this repeats the strip_whiteSpaces definition that appears
    earlier in this cell; being later, this is the one that stays in effect.
    '''
    return re.sub(r'[\s]+', ' ', transient_tweet_text)

def prune_multple_consecutive_same_char(transient_tweet_text):
    '''
    Squeeze any run of the same character down to exactly two of it:
    yesssssssss -> yess, ssssssssssh -> ssh

    NOTE(review): this repeats the definition of the same name that appears
    earlier in this cell; being later, this is the one that stays in effect.
    '''
    squeezed = re.sub(r'(.)\1+', r'\1\1', transient_tweet_text)
    return squeezed

def remove_spl_words(transient_tweet_text):
    '''
    Replace the HTML entity '&amp;' with the word ' and ' (padded with spaces).

    NOTE(review): this repeats the remove_spl_words definition that appears
    earlier in this cell; being later, this is the one that stays in effect.
    '''
    return transient_tweet_text.replace('&amp;',' and ')

def remove_emoji(transient_tweet_text):
    '''
    Tokenise the text and swap each token that demoji reports as an emoji for
    its description. Tokens are re-joined with a space before each one, so a
    non-empty result starts with a space.
    '''
    tokens = TweetTokenizer().tokenize(transient_tweet_text)
    emoji_descriptions = demoji.findall(transient_tweet_text)
    pieces = [emoji_descriptions[token] if token in emoji_descriptions else token
              for token in tokens]
    return ''.join(' ' + piece for piece in pieces)

def deEmojify(transient_tweet_text):
    '''
    Drop every non-ASCII character by round-tripping through an ASCII encoding.
    '''
    ascii_bytes = transient_tweet_text.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

# def print_test():
    
#     test_tweet = "Nice @varun paytm @paytm saver abc@gmail.com sizes for the wolf on 20/10/2010 at 10:00PM  grey/deep royal-volt Nike Air Skylon II retro are 40% OFF for a limited time at $59.99 + FREE shipping.BUY HERE -> https://bit.ly/2L2n7rB (promotion - use code MEMDAYSV at checkout)"
#     #General Proprocessing
#     test_tweet = test_tweet.lower()
#     test_tweet = strip_unicode(test_tweet)
    
#     #function tests
#     print("Process URLS:\n",process_URLs(test_tweet))
#     print("Remove websites:\n",process_Websites(test_tweet))
#     print("Remove mentions:\n",process_Mentions(test_tweet))
#     print('Remove Emailid:\n',process_EmailIds(test_tweet))
#     print("Remove Hashtags:\n",process_HashTags(test_tweet))
#     print("Remove Dates:\n",process_Dates(test_tweet))
#     print("Process Time:\n",process_Times(test_tweet))
#     print("Process Brand Mention:\n",process_BrandMentions(test_tweet))
#     print("Process non Brand Mention:\n",process_NonBrandMentions(test_tweet))
#     print("Process Brand Name:\n",process_BrandName(test_tweet))
#     print("Process Savings:\n",identify_Savings(test_tweet))
#     print("Process Offers:\n",indentify_Offers(test_tweet))
#     print("Identiy Promos:\n",indentify_Promos(test_tweet))
# ############
# print_test()




def process_TweetText(tweet_text):
    '''
    Run one piece of text through the full preprocessing pipeline and return it.

    Order of preprocessing:
      1. lowercase, trim, collapse whitespace, expand '&amp;'
      2. replace emoji with their demoji descriptions, then drop other non-ASCII
      3. URLs, hashtags, e-mail addresses and @mentions -> constant placeholders
      4. times and dates
      5. alphanumeric codes and plain numbers
      6. promos, savings, offers, money and freebies
      7. trim, collapse whitespace, squeeze repeated characters to two

    process_Websites, process_BrandMentions, process_NonBrandMentions and
    process_BrandName are not part of this pipeline.

    For pure-ASCII input the result is the same as before this change; only
    text containing emoji or other non-ASCII characters is affected.
    '''
    # Non-ASCII characters are deliberately NOT stripped up front: doing so
    # deleted every emoji before remove_emoji could translate it.
    transient_tweet_text = to_LowerCase(tweet_text)
    transient_tweet_text = trim(transient_tweet_text)
    transient_tweet_text = strip_whiteSpaces(transient_tweet_text)
    transient_tweet_text = remove_spl_words(transient_tweet_text)

    # emoji -> description, then discard whatever non-ASCII is left
    transient_tweet_text = remove_emoji(transient_tweet_text)
    transient_tweet_text = deEmojify(transient_tweet_text)

    # URLs go first so that a '#' inside a URL is not treated as a hashtag
    transient_tweet_text = process_URLs(transient_tweet_text)
    transient_tweet_text = process_HashTags(transient_tweet_text)
    # e-mail addresses before mentions, otherwise '@domain' is taken as a mention
    transient_tweet_text = process_EmailIds(transient_tweet_text)
    transient_tweet_text = process_Mentions(transient_tweet_text)

    # remove any unicodes
    transient_tweet_text = strip_unicode(transient_tweet_text)

    # identify Date / Time if any
    transient_tweet_text = process_Times(transient_tweet_text)
    transient_tweet_text = process_Dates(transient_tweet_text)

    # alphanumerics before numbers, so mixed tokens are not split up
    transient_tweet_text = identify_AlphaNumerics(transient_tweet_text)
    transient_tweet_text = replace_numbers(transient_tweet_text)

    # these rely on the 'constantnum' / 'constantalphanum' placeholders above
    transient_tweet_text = indentify_Promos(transient_tweet_text)
    transient_tweet_text = identify_Savings(transient_tweet_text)
    transient_tweet_text = indentify_Offers(transient_tweet_text)
    transient_tweet_text = indentify_Money(transient_tweet_text)
    transient_tweet_text = indentify_freebies(transient_tweet_text)

    transient_tweet_text = trim(transient_tweet_text)
    transient_tweet_text = strip_whiteSpaces(transient_tweet_text)

    transient_tweet_text = prune_multple_consecutive_same_char(transient_tweet_text)

    return transient_tweet_text

# if __name__ == "__main__":
# print(process_TweetText("Nice @varun paytm @paytm saver abc@gmail.com sizes for the wolf on 20/10/2010 at 10:00PM  grey/deep royal-volt Nike Air Skylon II retro are 40% OFF for a limited time at $59.99 + FREE shipping.BUY HERE -> https://bit.ly/2L2n7rB (promotion - use code MEMDAYSV at checkout)"))

Downloading emoji data ...
... OK (Got response in 0.37 seconds)
Writing emoji data to /root/.demoji/codes.json ...
... OK


In [3]:
#Making the necessary imports
import os
import sys

#preprocessing_path = "/home/etherealenvy/Downloads/practical-nlp/Ch8/O5_smtd_preprocessing.py"
#sys.path.append(os.path.abspath(preprocessing_path))

#import O5_smtd_preprocessing

from nltk.corpus import stopwords
from string import punctuation
import nltk
nltk.download('stopwords')

import pandas as pd
from gensim.models import Word2Vec
import warnings
# NOTE(review): this silences every warning for the rest of the session, so any
# deprecation or convergence warning raised by later cells will not be shown.
warnings.filterwarnings('ignore')

from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()


#imports related to modeling
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [27]:
# Wrap the training phrases in a DataFrame so derived columns can be added to it.
X_train_df=pd.DataFrame(X_train)
X_train_df

Unnamed: 0,Phrase
150238,make you wish you were at home watching that m...
133360,", the tale has turned from sweet to bitterswee..."
49191,can say that about most of the flicks moving i...
137709,"Does n't deliver a great story ,"
73297,unrecoverable life
...,...
142974,"Laced with liberal doses of dark humor , gorge..."
78231,Offers absolutely nothing I had n't already se...
7375,Ivan is a prince of a fellow
115288,the stomach-knotting suspense


In [28]:
# Peek at the raw phrase strings as a NumPy array.
X_train_df['Phrase'].values

array(['make you wish you were at home watching that movie instead of in the theater watching this one',
       ', the tale has turned from sweet to bittersweet , and when the tears come during that final , beautiful scene , they finally feel absolutely earned .',
       'can say that about most of the flicks moving in and out of the multiplex',
       ..., 'Ivan is a prince of a fellow',
       'the stomach-knotting suspense', 'Quirky'], dtype=object)

In [23]:
# Training labels, to compare against the phrase rows shown above.
y_train

150238    1
133360    4
49191     2
137709    1
73297     1
         ..
142974    4
78231     1
7375      3
115288    3
2467      2
Name: Sentiment, Length: 109242, dtype: int64

In [29]:
# Run every phrase through the preprocessing pipeline; the function is passed
# directly, no wrapper lambda needed.
X_train_df['Phrase_process'] = X_train_df['Phrase'].apply(process_TweetText)

In [30]:
# Compare the raw and processed phrase columns side by side.
X_train_df.head()

Unnamed: 0,Phrase,Phrase_process
150238,make you wish you were at home watching that m...,make you wish you were at home watching that m...
133360,", the tale has turned from sweet to bitterswee...",", the tale has turned from sweet to bitterswee..."
49191,can say that about most of the flicks moving i...,can say that about most of the flicks moving i...
137709,"Does n't deliver a great story ,","does n't deliver a great story ,"
73297,unrecoverable life,unrecoverable life


In [31]:
# Tokenise each processed phrase.
X_train_df['phrase_tokens'] = X_train_df['Phrase_process'].apply(tweet_tokenizer.tokenize)

# Load the stop-word list once. Calling stopwords.words('english') inside the
# comprehension reloaded the whole list for every single token; a set also
# makes each membership test constant-time.
english_stopwords = set(stopwords.words('english'))
X_train_df['phrase_no_stopwords'] = X_train_df['phrase_tokens'].apply(
    lambda tokens: [word for word in tokens if word not in english_stopwords])

In [32]:
# Inspect the token lists before and after stop-word removal.
X_train_df.head()

Unnamed: 0,Phrase,Phrase_process,phrase_tokens,phrase_no_stopwords
150238,make you wish you were at home watching that m...,make you wish you were at home watching that m...,"[make, you, wish, you, were, at, home, watchin...","[make, wish, home, watching, movie, instead, t..."
133360,", the tale has turned from sweet to bitterswee...",", the tale has turned from sweet to bitterswee...","[,, the, tale, has, turned, from, sweet, to, b...","[,, tale, turned, sweet, bittersweet, ,, tears..."
49191,can say that about most of the flicks moving i...,can say that about most of the flicks moving i...,"[can, say, that, about, most, of, the, flicks,...","[say, flicks, moving, multiplex]"
137709,"Does n't deliver a great story ,","does n't deliver a great story ,","[does, n't, deliver, a, great, story, ,]","[n't, deliver, great, story, ,]"
73297,unrecoverable life,unrecoverable life,"[unrecoverable, life]","[unrecoverable, life]"


## Train your own Embedding

In [33]:
# Array of processed phrases. Each element is one whole string, not a token list.
phrase_processed=X_train_df['Phrase_process'].values

In [35]:
#CBOW
import time

# Word2Vec expects an iterable of token lists. Given raw strings it iterates
# each phrase character by character and learns a vocabulary of single
# characters, so the tokenised column is used as the training corpus.
tokenized_phrases = X_train_df['phrase_tokens'].tolist()

start = time.time()
w2v_model = Word2Vec(tokenized_phrases, min_count=5, sg=0)
end = time.time()
print("CBOW Model Training Complete.\nTime taken for training is:{:.5f} sec ".format((end-start)))

CBOW Model Training Complete.
Time taken for training is:7.45960 sec 


In [37]:
#Create document vectors by averaging word vectors.
def embedding_feats(list_of_lists, model=None):
    """Average the word vectors of each token list into one document vector.

    Args:
        list_of_lists: iterable of token lists, one per document.
        model: trained Word2Vec model (or its KeyedVectors). Defaults to the
            notebook-level ``w2v_model``.

    Returns:
        list of 1-D NumPy arrays, one per document. A document with no
        in-vocabulary token gets an all-zero vector.
    """
    if model is None:
        model = w2v_model
    # Word vectors live on the model's `wv` attribute; looking tokens up on the
    # model object itself is deprecated in gensim 3.x and removed in 4.0.
    vectors = getattr(model, 'wv', model)
    # Take the dimensionality from the vectors rather than hard-coding 100.
    dimension = vectors.vector_size

    feats = []
    for tokens in list_of_lists:
        vector_sum = np.zeros(dimension)
        known_tokens = 0
        for token in tokens:
            if token in vectors:
                vector_sum += vectors[token]
                known_tokens += 1
        feats.append(vector_sum / known_tokens if known_tokens > 0 else vector_sum)
    return feats

# One averaged vector per training phrase, built from the stop-word-free tokens.
train_vectors = embedding_feats(X_train_df['phrase_no_stopwords'].values)
print(len(train_vectors))

109242


In [40]:
#Take any classifier (LogisticRegression here)
# Fit on the averaged word-vector features; random_state is fixed for repeatability.
# NOTE(review): max_iter is left at its default (100 in the repr shown below) and
# warnings are suppressed earlier in the notebook, so a convergence warning
# would go unnoticed -- worth checking.
classifier = LogisticRegression(random_state=2020)
classifier.fit(train_vectors, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=2020, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)