In [1]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_colwidth', -1)
from time import time
import re
import string
import os
import emoji
from pprint import pprint
import collections
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
sns.set(font_scale=1.3)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
import gensim
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')
np.random.seed(37)

In [4]:
#df = pd.read_csv('Tweets.csv')
#df = df.reindex(np.random.permutation(df.index))
#df = df[['text', 'airline_sentiment']]

In [2]:
twitterdata = pd.read_csv("dataPandas.csv")
twitterdata = twitterdata.reindex(np.random.permutation(twitterdata.index))




In [3]:
class TextCounts(BaseEstimator, TransformerMixin):
    
    def count_regex(self, pattern, tweet):
        return len(re.findall(pattern, tweet))
    
    def fit(self, X, y=None, **fit_params):
        # fit method is used when specific operations need to be done on the train data, but not on the test data
        return self
    
    def transform(self, X, **transform_params):
        count_words = X.apply(lambda x: self.count_regex(r'\w+', x)) 
        count_mentions = X.apply(lambda x: self.count_regex(r'@\w+', x))
        count_hashtags = X.apply(lambda x: self.count_regex(r'#\w+', x))
        count_capital_words = X.apply(lambda x: self.count_regex(r'\b[A-Z]{2,}\b', x))
        count_excl_quest_marks = X.apply(lambda x: self.count_regex(r'!|\?', x))
        count_urls = X.apply(lambda x: self.count_regex(r'http.?://[^\s]+[\s]?', x))
        # We will replace the emoji symbols with a description, which makes using a regex for counting easier
        # Moreover, it will result in having more words in the tweet
        count_emojis = X.apply(lambda x: emoji.demojize(x)).apply(lambda x: self.count_regex(r':[a-z_&]+:', x))
        
        df = pd.DataFrame({'count_words': count_words
                           , 'count_mentions': count_mentions
                           , 'count_hashtags': count_hashtags
                           , 'count_capital_words': count_capital_words
                           , 'count_excl_quest_marks': count_excl_quest_marks
                           , 'count_urls': count_urls
                           , 'count_emojis': count_emojis
                          })
        
        return df
    
tc = TextCounts()
#df_eda = tc.fit_transform(df.text)
#df_eda['airline_sentiment'] = df.airline_sentiment

#twitterdata_eda = pd.read_csv("dataPandas.csv")
twitterdata_eda = tc.fit_transform(twitterdata.Tweets)
twitterdata_eda['Labels'] = twitterdata.Labels

In [4]:
class CleanText(BaseEstimator, TransformerMixin):
    def remove_mentions(self, input_text):
        return re.sub(r'@\w+', '', input_text)
    
    def remove_urls(self, input_text):
        return re.sub(r'http.?://[^\s]+[\s]?', '', input_text)
    
    def emoji_oneword(self, input_text):
        # By compressing the underscore, the emoji is kept as one word
        return input_text.replace('_','')
    
    def remove_punctuation(self, input_text):
        # Make translation table
        punct = string.punctuation
        trantab = str.maketrans(punct, len(punct)*' ')  # Every punctuation symbol will be replaced by a space
        return input_text.translate(trantab)    
   
    def remove_digits(self, input_text):
        return re.sub('\d+', '', input_text)
    
    def to_lower(self, input_text):
        return input_text.lower()
    
    def remove_stopwords(self, input_text):
        stopwords_list = stopwords.words('english')
        # Some words which might indicate a certain sentiment are kept via a whitelist
        whitelist = ["n't", "not", "no"]
        words = input_text.split() 
        clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
        return " ".join(clean_words) 
    
    def stemming(self, input_text):
        porter = PorterStemmer()
        words = input_text.split() 
        stemmed_words = [porter.stem(word) for word in words]
        return " ".join(stemmed_words)
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **transform_params):
        clean_X = X.apply(self.remove_mentions).apply(self.remove_urls).apply(self.emoji_oneword).apply(self.remove_punctuation).apply(self.remove_digits).apply(self.to_lower).apply(self.remove_stopwords).apply(self.stemming)
        return clean_X

In [5]:
ct = CleanText()
#sr_clean = ct.fit_transform(df.text)
#sr_clean.sample(5)
sr_clean_td = ct.fit_transform(twitterdata.Tweets)


In [6]:
empty_clean = sr_clean_td == ''
print('{} records have no words left after text cleaning'.format(sr_clean_td[empty_clean].count()))
#sr_clean.loc[empty_clean] = '[no_text]'
sr_clean_td.loc[empty_clean] = '[no_text]'

0 records have no words left after text cleaning


In [10]:
cv = CountVectorizer()
bow = cv.fit_transform(sr_clean)

cv_ = CountVectorizer()
bow = cv_.fit_transform(sr_clean_td)
word_freq = dict(zip(cv.get_feature_names(), np.asarray(bow.sum(axis=0)).ravel()))
word_counter = collections.Counter(word_freq)
word_counter_df = pd.DataFrame(word_counter.most_common(20), columns = ['word', 'freq'])

In [7]:
#df_model = df_eda
#df_model['clean_text'] = sr_clean
#print(df_model.columns.tolist())
df_model_td = twitterdata_eda
df_model_td['clean_text'] = sr_clean_td
print(df_model_td.columns.tolist())

['count_words', 'count_mentions', 'count_hashtags', 'count_capital_words', 'count_excl_quest_marks', 'count_urls', 'count_emojis', 'Labels', 'clean_text']


In [8]:
class ColumnExtractor(TransformerMixin, BaseEstimator):
    def __init__(self, cols):
        self.cols = cols   
    def transform(self, X, **transform_params):
        return X[self.cols]    
    def fit(self, X, y=None, **fit_params):
        return self
    
#X_train, X_test, y_train, y_test = train_test_split(df_model.drop('airline_sentiment', axis=1), df_model.airline_sentiment, test_size=0.1, random_state=37)
X_train1, X_test1, y_train1, y_test1 = train_test_split(df_model_td.drop('Labels', axis=1), df_model_td.Labels, test_size=0.1, random_state=37)

In [28]:
# Based on http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html
def grid_vect(clf, parameters_clf, X_train, X_test, parameters_text=None, vect=None, is_w2v=False):
    
    textcountscols = ['count_capital_words','count_emojis','count_excl_quest_marks','count_hashtags'
                      ,'count_mentions','count_urls','count_words']
    
    if is_w2v:
        w2vcols = []
        for i in range(SIZE):
            w2vcols.append(i)
        features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
                                 , ('w2v', ColumnExtractor(cols=w2vcols))]
                                , n_jobs=-1)
    else:
        features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
                                 , ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text')), ('vect', vect)]))]
                                , n_jobs=-1)    
    pipeline = Pipeline([
        ('features', features)
        , ('clf', clf)
    ])
    
    # Join the parameters dictionaries together
    parameters = dict()
    if parameters_text:
        parameters.update(parameters_text)
    parameters.update(parameters_clf)    # Make sure you have scikit-learn version 0.19 or higher to use multiple scoring metrics
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)
    
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)    
    t0 = time()
    print(X_train.columns.tolist())
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()    
    print("Best CV score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    print("Test score with best_estimator_: %0.3f" % grid_search.best_estimator_.score(X_test, y_test))
    print("\n")
    print("Classification Report Test Data")
    print(classification_report(y_test, grid_search.best_estimator_.predict(X_test)))
                        
    return grid_search

In [9]:
# Parameter grid settings for the vectorizers (Count and TFIDF)
#parameters_vect = {
#    'features__pipe__vect__max_df':  (0.25,0.75),
#    'features__pipe__vect__ngram_range': ((1, 2)),
#    'features__pipe__vect__min_df': (1,2)
#}

#parameters_vect = {
#    'features__pipe__vect__max_df': (0.25, 0.5, 0.75, 0.9),
#    'features__pipe__vect__ngram_range': ((1, 1), (1, 2) , (2,2)),
#    'features__pipe__vect__min_df': (1,2),
#    'features__pipe__vect__sublinear_tf' : (True , False),
#    'features__pipe__vect__max_features' : (None , 500 , 1000 , 5000)
#}

parameters_vect = {
    'features__pipe__vect__max_df': ( 0.25, 0.5, 0.75, 0.9),
    'features__pipe__vect__ngram_range': ((1, 1), (1, 2) ,(2,2)),
    'features__pipe__vect__min_df': (1,2),
    'features__pipe__vect__sublinear_tf' : (True , False)
}

# Parameter grid settings for MultinomialNB
parameters_mnb = {
    'clf__alpha': (0.25, 0.5, 0.75)
}
# Parameter grid settings for LogisticRegression
#parameters_logreg = {
#    'clf__C': (0.25,0.5,1.0,2.0,5.0,10.0,100.0,1000.0,10000.0),
 #   'clf__C': (0.25, 0.5, 1.0),

#    'clf__penalty': ('l1' , 'l2')
    #'clf__max_iter': (100,500,1000,5000)
#}
parameters_logreg = {
    'clf__C': (0.25,0.5,1.0,2.0,3.0,4.0,5.0,6.0,7.0),
    'clf__penalty': ('l1', 'l2')
}

In [10]:
mnb = MultinomialNB()
logreg = LogisticRegression()

In [31]:
X_train.head()

Unnamed: 0,count_words,count_mentions,count_hashtags,count_capital_words,count_excl_quest_marks,count_urls,count_emojis,clean_text
1712,18,1,2,0,0,0,0,synonym rude cheap realli know valu custom
11975,23,1,0,0,2,0,0,flight jacksonvil fl dalla show cancel flightl go rebook anoth flight
6919,22,1,0,0,1,0,0,chging flight isit possibl pay fare differ dollar ticket bought complet via point redempt
14030,29,1,0,1,0,0,0,add get go work tomorrow like wait late flightr cs suck
1907,25,1,0,0,0,0,0,also inappropri lie passeng induc accept voucher tell avail late flightr


In [32]:
countvect = CountVectorizer()# MultinomialNB
best_mnb_countvect = grid_vect(mnb, parameters_mnb, X_train, X_test, parameters_text=parameters_vect, vect=countvect)
#joblib.dump(best_mnb_countvect, '../output/best_mnb_countvect.pkl')# LogisticRegression
best_logreg_countvect = grid_vect(logreg, parameters_logreg, X_train, X_test, parameters_text=parameters_vect, vect=countvect)
#joblib.dump(best_logreg_countvect, '../output/best_logreg_countvect.pkl')

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__alpha': (0.25, 0.5, 0.75),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
['count_words', 'count_mentions', 'count_hashtags', 'count_capital_words', 'count_excl_quest_marks', 'count_urls', 'count_emojis', 'clean_text']
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   29.8s finished


done in 30.990s

Best CV score: 0.776
Best parameters set:
	clf__alpha: 0.5
	features__pipe__vect__max_df: 0.5
	features__pipe__vect__min_df: 2
	features__pipe__vect__ngram_range: (1, 2)
Test score with best_estimator_: 0.784


Classification Report Test Data
              precision    recall  f1-score   support

    negative       0.82      0.91      0.86       911
     neutral       0.66      0.48      0.56       320
    positive       0.77      0.72      0.74       233

   micro avg       0.78      0.78      0.78      1464
   macro avg       0.75      0.70      0.72      1464
weighted avg       0.77      0.78      0.77      1464

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (0.25, 0.5, 1.0),
 'clf__penalty': ('l1', 'l2'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
['count_words', 'count_mentions', 'count_hashtags', 'count_capital_words', 'count_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   37.3s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  1.4min finished


done in 84.555s

Best CV score: 0.795
Best parameters set:
	clf__C: 1.0
	clf__penalty: 'l2'
	features__pipe__vect__max_df: 0.5
	features__pipe__vect__min_df: 1
	features__pipe__vect__ngram_range: (1, 2)
Test score with best_estimator_: 0.811


Classification Report Test Data
              precision    recall  f1-score   support

    negative       0.83      0.93      0.88       911
     neutral       0.73      0.53      0.62       320
    positive       0.80      0.74      0.77       233

   micro avg       0.81      0.81      0.81      1464
   macro avg       0.79      0.73      0.75      1464
weighted avg       0.81      0.81      0.80      1464



In [11]:
def grid_vect1(clf, parameters_clf, X_train, X_test, parameters_text=None, vect=None, is_w2v=False):
    
    textcountscols = ['count_capital_words','count_emojis','count_excl_quest_marks','count_hashtags'
                      ,'count_mentions','count_urls','count_words']
    #textcountscols = ['count_emojis','count_words']
    
    if is_w2v:
        w2vcols = []
        for i in range(SIZE):
            w2vcols.append(i)
        features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
                                 , ('w2v', ColumnExtractor(cols=w2vcols))]
                                , n_jobs=-1)
    else:
        features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
                                 , ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text')), ('vect', vect)]))]
                                , n_jobs=-1)    
    pipeline = Pipeline([
        ('features', features)
        , ('clf', clf)
    ])
    
    # Join the parameters dictionaries together
    parameters = dict()
    if parameters_text:
        parameters.update(parameters_text)
    parameters.update(parameters_clf)    # Make sure you have scikit-learn version 0.19 or higher to use multiple scoring metrics
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)
    
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)    
    t0 = time()
    print(X_train.columns.tolist())
    grid_search.fit(X_train, y_train1)
    print("done in %0.3fs" % (time() - t0))
    print()    
    print("Best CV score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    print("Test score with best_estimator_: %0.3f" % grid_search.best_estimator_.score(X_test, y_test1))
    print("\n")
    print("Classification Report Test Data")
    print(classification_report(y_test1, grid_search.best_estimator_.predict(X_test)))
                        
    return grid_search

In [93]:
countvect = CountVectorizer()# MultinomialNB
#best_mnb_countvect = grid_vect1(mnb, parameters_mnb, X_train1, X_test1, parameters_text=parameters_vect, vect=countvect)
#joblib.dump(best_mnb_countvect, '../output/best_mnb_countvect.pkl')# LogisticRegression
#best_logreg_countvect = grid_vect1(logreg, parameters_logreg, X_train1, X_test1, parameters_text=parameters_vect, vect=countvect)
#joblib.dump(best_logreg_countvect, '../output/best_logreg_countvect.pkl')

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (0.25, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0, 1000.0, 10000.0),
 'clf__penalty': ('l1', 'l2'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75, 0.9),
 'features__pipe__vect__max_features': (None, 500, 1000),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2), (2, 2)),
 'features__pipe__vect__sublinear_tf': (True, False)}
['count_words', 'count_mentions', 'count_hashtags', 'count_capital_words', 'count_excl_quest_marks', 'count_urls', 'count_emojis', 'clean_text', 'clean_text_wordlist']
Fitting 5 folds for each of 2592 candidates, totalling 12960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


ValueError: Invalid parameter sublinear_tf for estimator CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None). Check the list of available parameters with `estimator.get_params().keys()`.

In [12]:
tfidfvect = TfidfVectorizer()# MultinomialNB
#best_mnb_tfidf = grid_vect1(mnb, parameters_mnb, X_train1, X_test1, parameters_text=parameters_vect, vect=tfidfvect)
#joblib.dump(best_mnb_tfidf, '../output/best_mnb_tfidf.pkl')# LogisticRegression
best_logreg_tfidf = grid_vect1(logreg, parameters_logreg, X_train1, X_test1, parameters_text=parameters_vect, vect=tfidfvect)
#joblib.dump(best_logreg_tfidf, '../output/best_logreg_tfidf.pkl')

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0),
 'clf__penalty': ('l1', 'l2'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75, 0.9),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2), (2, 2)),
 'features__pipe__vect__sublinear_tf': (True, False)}
['count_words', 'count_mentions', 'count_hashtags', 'count_capital_words', 'count_excl_quest_marks', 'count_urls', 'count_emojis', 'clean_text']
Fitting 5 folds for each of 864 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 28.6min
[Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed: 30.6min finished


done in 1845.458s

Best CV score: 0.775
Best parameters set:
	clf__C: 3.0
	clf__penalty: 'l2'
	features__pipe__vect__max_df: 0.5
	features__pipe__vect__min_df: 1
	features__pipe__vect__ngram_range: (1, 2)
	features__pipe__vect__sublinear_tf: True
Test score with best_estimator_: 0.776


Classification Report Test Data
              precision    recall  f1-score   support

 NOT_SARCASM       0.80      0.73      0.76       249
     SARCASM       0.75      0.82      0.79       251

   micro avg       0.78      0.78      0.78       500
   macro avg       0.78      0.78      0.78       500
weighted avg       0.78      0.78      0.78       500



In [31]:
SIZE = 50
X_train1['clean_text_wordlist'] = X_train1.clean_text.apply(lambda x : word_tokenize(x))
X_test1['clean_text_wordlist'] = X_test1.clean_text.apply(lambda x : word_tokenize(x))
model = gensim.models.Word2Vec(X_train1.clean_text_wordlist
, min_count=1
, size=SIZE
, window=5
, workers=4)

model.most_similar('trump', topn=3)

[('vote', 0.999025285243988),
 ('presid', 0.9915638566017151),
 ('elect', 0.9825637340545654)]

In [32]:
def compute_avg_w2v_vector(w2v_dict, tweet):
    list_of_word_vectors = [w2v_dict[w] for w in tweet if w in w2v_dict.vocab.keys()]
    
    if len(list_of_word_vectors) == 0:
        result = [0.0]*SIZE
    else:
        result = np.sum(list_of_word_vectors, axis=0) / len(list_of_word_vectors)
        
    return result

X_train1_w2v = X_train1['clean_text_wordlist'].apply(lambda x: compute_avg_w2v_vector(model.wv, x))
X_test1_w2v = X_test1['clean_text_wordlist'].apply(lambda x: compute_avg_w2v_vector(model.wv, x))

In [33]:
X_train1_w2v = pd.DataFrame(X_train1_w2v.values.tolist(), index= X_train1.index)
X_test1_w2v = pd.DataFrame(X_test1_w2v.values.tolist(), index= X_test1.index)# Concatenate with the TextCounts variables
X_train1_w2v = pd.concat([X_train1_w2v, X_train1.drop(['clean_text', 'clean_text_wordlist'], axis=1)], axis=1)
X_test1_w2v = pd.concat([X_test1_w2v, X_test1.drop(['clean_text', 'clean_text_wordlist'], axis=1)], axis=1)

In [35]:
best_logreg_w2v = grid_vect1(logreg, parameters_logreg, X_train1_w2v, X_test1_w2v, is_w2v=True)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (0.25, 0.5, 1.0), 'clf__penalty': ('l1', 'l2')}
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 'count_words', 'count_mentions', 'count_hashtags', 'count_capital_words', 'count_excl_quest_marks', 'count_urls', 'count_emojis']
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


done in 9.239s

Best CV score: 0.714
Best parameters set:
	clf__C: 1.0
	clf__penalty: 'l2'
Test score with best_estimator_: 0.716


Classification Report Test Data
              precision    recall  f1-score   support

 NOT_SARCASM       0.76      0.66      0.71       262
     SARCASM       0.68      0.77      0.72       238

   micro avg       0.72      0.72      0.72       500
   macro avg       0.72      0.72      0.72       500
weighted avg       0.72      0.72      0.72       500



[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    9.0s finished


In [13]:
textcountscols = ['count_capital_words','count_emojis','count_excl_quest_marks','count_hashtags'
,'count_mentions','count_urls','count_words']

features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
, ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text'))
, ('vect', TfidfVectorizer(max_df=0.5, min_df=1, ngram_range=(1,2) , sublinear_tf= True))]))]
, n_jobs=-1)

pipeline = Pipeline([('features', features)
, ('clf', LogisticRegression(C=3, penalty='l2'))
])

#features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
#, ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text'))
#, ('vect', TfidfVectorizer(max_df=0.5, min_df=1, ngram_range=(1,2) , sublinear_tf= False))]))]
#, n_jobs=-1)


#pipeline = Pipeline([('features', features)
#, ('clf', LogisticRegression(C=7, penalty='l2'))
#])

best_model = pipeline.fit(df_model_td.drop('Labels', axis=1), df_model_td.Labels)
#(df_model_td.drop('Labels', axis=1), df_model_td.Labels, test_size=0.1, random_state=37)

In [14]:
twittertestdata = pd.read_csv("dftestdata.csv")
#twittertestdata.Tweets = twittertestdata['Tweets'].apply(preprocess_tweet_text)

#tf_vector_test = get_feature_vector(np.array(twittertestdata.iloc[:, 1]).ravel())
#X_testing = tf_vector.transform(np.array(twittertestdata.iloc[:, 1]).ravel())

df_counts_pos = tc.transform(twittertestdata['Tweets'])
df_clean_pos = ct.transform(twittertestdata['Tweets'])
df_model_pos = df_counts_pos
df_model_pos['clean_text'] = df_clean_pos

In [15]:
best_prediction = best_model.predict(df_model_pos)

In [16]:
twittertestdata['PredictLG'] = best_prediction


In [17]:
twittertestdata.to_csv('answerCVLG.txt', columns = ["ID" , "PredictLG"] , header = False , index = False)
#twittertestdata.head()

In [61]:
textcountscols = ['count_capital_words','count_emojis','count_excl_quest_marks','count_hashtags'
,'count_mentions','count_urls','count_words']

features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
, ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text'))
, ('vect', CountVectorizer(max_df=0.5, min_df=1, ngram_range=(1,2)))]))]
, n_jobs=-1)

pipeline = Pipeline([('features', features)
, ('clf', MultinomialNB(alpha=0.5))
])

best_modelNB = pipeline.fit(df_model_td.drop('Labels', axis=1), df_model_td.Labels)
#(df_model_td.drop('Labels', axis=1), df_model_td.Labels, test_size=0.1, random_state=37)

In [62]:
best_predictionNB = best_modelNB.predict(df_model_pos)

In [63]:
twittertestdata['PredictNB'] = best_predictionNB

In [64]:
twittertestdata.to_csv('answerNB.txt', columns = ["ID" , "PredictNB"] , header = False , index = False)
