### Import Packages

In [1]:
import pandas                        as     pd
import numpy                         as     np
import seaborn                       as     sns
import os 
import matplotlib.pyplot             as     plt
from   utils_text_clf                import utils_text_clf as utils
from   sklearn.feature_selection     import f_classif, \
                                            VarianceThreshold, \
                                            SelectKBest
from   sklearn.model_selection       import StratifiedKFold, \
                                            RepeatedStratifiedKFold, \
                                            cross_validate, \
                                            cross_val_predict, \
                                            GridSearchCV, \
                                            train_test_split
from   sklearn.pipeline              import Pipeline, \
                                            make_pipeline
from   sklearn.preprocessing         import StandardScaler, \
                                            RobustScaler, \
                                            MinMaxScaler
from   sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from   sklearn.tree                  import DecisionTreeClassifier
from   sklearn.linear_model          import LogisticRegression, \
                                            SGDClassifier
from   sklearn.svm                   import LinearSVC, SVC
from   sklearn.neighbors             import KNeighborsClassifier
from   sklearn.naive_bayes           import GaussianNB
from   sklearn.ensemble              import RandomForestClassifier, \
                                            AdaBoostClassifier, \
                                            GradientBoostingClassifier, \
                                            StackingClassifier
from   mlxtend.classifier            import StackingClassifier as mlx_stack_clf
from   sklearn.manifold              import TSNE
from   sklearn.decomposition         import PCA
import xgboost                       as     xgb
from   sklearn.metrics               import roc_curve
from   scipy                         import interp
from   pathlib                       import Path
from   pickle                        import dump
import joblib

# Turn interactive plotting off
plt.ion()  
import warnings
warnings.filterwarnings("ignore")

### Enter mutable info

In [2]:
#%% Enter mutable info

data_dir    = os.path.join(os.getcwd(), 'data')
results_dir = os.path.join(os.getcwd(), 'results')

# training data 
#file_train = 'train.jsonl'

# training data
file_train  = 'train_feature_engineering.csv';
file_test   = 'test_feature_engineering.csv'

#file_train = os.path.join(data_dir, file_train) 
file_train  = os.path.join(data_dir, file_train) 
file_test   = os.path.join(data_dir, file_test)

### Load in data 

In [3]:
#%% load in data 

#df_train = utils.parse_json(file_train)
df_train  = pd.read_csv(file_train)

# feats
x_train  = df_train.iloc[:, 1:]

# labels 
y_train  = df_train.label

# convert labels to binary (1 - sarcasm)
y_train  = [1 if i == 'SARCASM' else 0 for i in y_train]

In [4]:
#%% check label proportions 

# print count
print('The count of sarcastic tweets is:', y_train.count(1))
print('The count of non-sarcastic tweets is:', y_train.count(0))

The count of sarcastic tweets is: 2500
The count of non-sarcastic tweets is: 2500


In [5]:
# check the size of the training data 
x_train.shape

(5000, 29)

In [6]:
x_train.describe()

Unnamed: 0,users_tagged,num_hashtags,num_capital,tweet_length_words,tweet_length_char,average_token_length,contains_laughter,contains_ellipses,strong_negations,strong_affirmatives,...,context_tweet_length_char,context_average_token_length,context_contains_laughter,context_contains_ellipses,context_strong_negations,context_strong_affirmatives,context_interjections,context_intensifiers,context_punctuation,context_emojis
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,1.9472,0.7338,5.856,25.3578,100.3864,0.255635,0.032,0.1658,0.2098,0.1786,...,561.159,0.782949,0.0,0.0,0.0,0.0,0.0,0.0,1.9988,1.212
std,0.933266,1.059227,8.217638,13.401869,51.590365,0.042288,0.184886,0.479538,0.491765,0.438566,...,537.603877,0.658256,0.0,0.0,0.0,0.0,0.0,0.0,3.286359,4.211865
min,0.0,0.0,0.0,8.0,24.0,0.117117,0.0,0.0,0.0,0.0,...,86.0,0.281854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,2.0,16.0,64.0,0.227273,0.0,0.0,0.0,0.0,...,250.0,0.404089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,0.0,4.0,21.0,86.0,0.253012,0.0,0.0,0.0,0.0,...,392.0,0.547531,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3.0,1.0,7.0,31.0,122.25,0.27957,0.0,0.0,0.0,0.0,...,630.0,0.854243,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0
max,4.0,3.0,165.0,77.0,238.0,0.522727,2.0,5.0,4.0,4.0,...,5788.0,5.404175,0.0,0.0,0.0,0.0,0.0,0.0,70.0,80.0


Remove features with 0 variance 

In [7]:
selector_var = VarianceThreshold()

# select feats with var > 0
selector_var.fit(x_train)

# filter 
x_train = x_train[x_train.columns[selector_var.get_support(indices = True)]]

In [8]:
x_train.head()

Unnamed: 0,users_tagged,num_hashtags,num_capital,tweet_length_words,tweet_length_char,average_token_length,contains_laughter,contains_ellipses,strong_negations,strong_affirmatives,...,punctuation,emojis,ngram_feature,context_users_tagged,context_num_hashtags,context_tweet_length_words,context_tweet_length_char,context_average_token_length,context_punctuation,context_emojis
0,3,0,1,25,99,0.252525,0,3,0,0,...,3,0,31.0,1,1,55,281,0.390073,0,1
1,2,0,4,21,88,0.238636,0,0,0,0,...,1,0,27.0,4,0,27,126,0.429294,1,0
2,3,1,13,14,73,0.191781,0,0,0,0,...,1,0,14.0,1,0,44,196,0.453922,0,0
3,2,0,6,21,108,0.194444,0,0,0,1,...,0,0,25.0,1,0,56,304,0.361325,0,0
4,2,3,22,30,143,0.20979,0,0,0,0,...,0,0,34.0,1,0,63,316,0.430424,0,0


In [9]:
# split into training and validation set 
x_train_sub, x_val, y_train_sub, y_val = train_test_split(x_train, 
                                                          y_train, 
                                                          test_size    = 0.3, 
                                                          random_state = 42, 
                                                          stratify     = y_train)

In [10]:
# common params 

# define scalers to try
scalers     = [StandardScaler(), 
               RobustScaler(), 
               MinMaxScaler()]

# define cross-val method
cv          = StratifiedKFold(n_splits     = 10, 
                              shuffle      = True, 
                              random_state = 42)

# define scoring metric
metric      = 'f1'

### Logistic regression classifier

In [11]:
#%% log_reg clf

# base clf
logreg_clf    = LogisticRegression(n_jobs       = -1, 
                                   class_weight = 'balanced', 
                                   random_state = 42)

# create model pipeline 
pipe_logreg   = Pipeline([('scaler',     StandardScaler()),
                          ('classifier', logreg_clf)])

# define param grid
params_logreg = {'scaler'                   : scalers,
                 'classifier'               : [logreg_clf],
                 'classifier__penalty'      : ['l2'],
                 'classifier__C'            : np.logspace(-3, 3, 12),
                 'classifier__max_iter'     : [20000], 
                 'classifier__class_weight' : ['balanced']}

grid_logreg   = GridSearchCV(pipe_logreg, 
                             cv                 = cv, 
                             param_grid         = params_logreg, 
                             scoring            = metric,
                             refit              = True, 
                             return_train_score = False, 
                             n_jobs             = -1, 
                             verbose            = 1)

# perform tuning and extract best model
best_logreg = grid_logreg.fit(x_val, y_val).best_estimator_

print('tuning log_reg clf complete')
print('Best parameters: %s' % grid_logreg.best_params_)
print('Mean cross-validated F1: %.2f' % grid_logreg.best_score_)


Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    5.9s


tuning log_reg clf complete
Best parameters: {'classifier': LogisticRegression(C=0.1519911082952933, class_weight='balanced',
                   max_iter=20000, n_jobs=-1, random_state=42), 'classifier__C': 0.1519911082952933, 'classifier__class_weight': 'balanced', 'classifier__max_iter': 20000, 'classifier__penalty': 'l2', 'scaler': RobustScaler()}
Mean cross-validated F1: 0.75


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    8.7s finished


### Linear SVC

In [12]:
#%% svc_lin

# base clf
svc_lin_clf    = LinearSVC(max_iter     = 20000, 
                           class_weight = 'balanced', 
                           random_state = 42)

# create model pipeline 
pipe_svc_lin   = Pipeline([('scaler',     StandardScaler()),
                           ('classifier', svc_lin_clf)])

# define param grid
params_svc_lin = {'scaler'                   : scalers,
                  'classifier'               : [svc_lin_clf],
                  'classifier__penalty'      : ['l1', 'l2'],
                  'classifier__loss'         : ['hinge', 'squared_hinge'],
                  'classifier__C'            : np.logspace(-3, 3, 12),
                  'classifier__max_iter'     : [20000], 
                  'classifier__class_weight' : ['balanced']}

grid_svc_lin   = GridSearchCV(pipe_svc_lin, 
                              cv                 = cv, 
                              param_grid         = params_svc_lin, 
                              scoring            = metric,
                              refit              = True, 
                              return_train_score = False, 
                              n_jobs             = -1, 
                              verbose            = 1)

# perform tuning and extract best model
best_svc_lin   = grid_svc_lin.fit(x_val, y_val).best_estimator_

print('tuning svc_lin clf complete')
print('Best parameters: %s' % grid_svc_lin.best_params_)
print('Mean cross-validated F1: %.2f' % grid_svc_lin.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 144 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 1013 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 1338 tasks      | elapsed:  1.1min


tuning svc_lin clf complete
Best parameters: {'classifier': LinearSVC(C=23.10129700083158, class_weight='balanced', loss='hinge',
          max_iter=20000, random_state=42), 'classifier__C': 23.10129700083158, 'classifier__class_weight': 'balanced', 'classifier__loss': 'hinge', 'classifier__max_iter': 20000, 'classifier__penalty': 'l2', 'scaler': MinMaxScaler()}
Mean cross-validated F1: 0.76


[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  1.5min finished


### SGD

In [13]:
#%% sgd clf

# base clf
sgd_clf      = SGDClassifier(max_iter       = 20000,
                             tol            = 1e-4, 
                             class_weight   = 'balanced', 
                             early_stopping = True)

# create model pipeline 
pipe_sgd     = Pipeline([('scaler',     StandardScaler()),
                         ('classifier', sgd_clf)])

# define param grid
params_sgd   = {'scaler'                   : scalers,
                'classifier'               : [sgd_clf],
                'classifier__penalty'      : ['l1', 'l2', 'elasticnet'],
                'classifier__loss'         : ['hinge', 'squared_hinge', 'log', 'perceptron']}

grid_sgd     = GridSearchCV(pipe_sgd, 
                            cv                 = cv, 
                            param_grid         = params_sgd, 
                            scoring            = metric,
                            refit              = True, 
                            return_train_score = False, 
                            n_jobs             = -1, 
                            verbose            = 1)

# perform tuning and extract best model
best_sgd     = grid_sgd.fit(x_val, y_val).best_estimator_

print('tuning sgd clf complete')
print('Best parameters: %s' % grid_sgd.best_params_)
print('Mean cross-validated F1: %.2f' % grid_sgd.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.5s


tuning sgd clf complete
Best parameters: {'classifier': SGDClassifier(class_weight='balanced', early_stopping=True, loss='log',
              max_iter=20000, penalty='l1', tol=0.0001), 'classifier__loss': 'log', 'classifier__penalty': 'l1', 'scaler': MinMaxScaler()}
Mean cross-validated F1: 0.73


[Parallel(n_jobs=-1)]: Done 344 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    2.5s finished


### LDA

In [14]:
#%% lda

# base clf
lda_clf      = LinearDiscriminantAnalysis()

# create model pipeline 
pipe_lda     = Pipeline([('scaler',     StandardScaler()),
                         ('classifier', lda_clf)])

# define param grid
params_lda   = {'scaler'                   : scalers,
                'classifier'               : [lda_clf],
                'classifier__solver'       : ['svd', 'lsqr', 'eigen'],
                'classifier__shrinkage'    : np.arange(0, 1, 0.01)}

grid_lda     = GridSearchCV(pipe_lda, 
                            cv                 = cv, 
                            param_grid         = params_lda, 
                            scoring            = metric,
                            refit              = True, 
                            return_train_score = False, 
                            n_jobs             = -1, 
                            verbose            = 1)

# perform tuning and extract best model
best_lda     = grid_lda.fit(x_val, y_val).best_estimator_

print('tuning lda clf complete')
print('Best parameters: %s' % grid_lda.best_params_)
print('Mean cross-validated F1: %.2f' % grid_lda.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 900 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 1640 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 3712 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 7312 tasks      | elapsed:   31.2s


tuning lda clf complete
Best parameters: {'classifier': LinearDiscriminantAnalysis(shrinkage=0.07, solver='lsqr'), 'classifier__shrinkage': 0.07, 'classifier__solver': 'lsqr', 'scaler': RobustScaler()}
Mean cross-validated F1: 0.75


[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed:   37.5s finished


In [15]:
# save session 
import dill 
dill.dump_session('dev_split_hparam_tuning_complete.db')
print('session saved!')

session saved!


Retrain best model on the **training subset**

In [16]:
# hyperparam tuned models 

svc_lin_final   = best_svc_lin.fit(x_train_sub, y_train_sub) # good 
logreg_final    = best_logreg.fit(x_train_sub, y_train_sub)  # good
lda_final       = best_lda.fit(x_train_sub, y_train_sub)
sgd_final       = best_sgd.fit(x_train_sub, y_train_sub)

print('all done!')

all done!


Load test data 

In [17]:
# process test data 
df_test  = pd.read_csv(file_test)

# feats
x_test   = df_test.iloc[:, 1:]

# remove low var feats (as in x_train)
x_test   = x_test[x_test.columns[selector_var.get_support(indices = True)]]

# tweet id
t_id     = df_test.id.to_frame()

In [18]:
# gather models 

final_models = {'lda'      : lda_final,
                'sgd'      : sgd_final,
                'svc_lin'  : svc_lin_final,
                'logreg'   : logreg_final}

In [19]:
#%% make prediction 

for name, model in final_models.items(): 
    
    # make prediction 
    pred     = model.predict(x_test)
    
    # convert to text labels
    pred     = ['SARCASM' if i == 1 else 'NOT_SARCASM' for i in pred]
    
    pred     = pd.DataFrame(pred, columns = ['predictions'])

    # concat into df
    answer   = pd.concat([t_id, pred], axis = 1)
    
    # construct file name 
    file_ans = Path(os.path.join(os.getcwd(), 'answer_split_' + name + '.txt'))
    
    # name the file, based on the classifier
    answer.to_csv(file_ans, header = None, index = None, sep = ',')
    
print('all predictions completed!')

all predictions completed!
