In [17]:
import time

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import ttest_ind

np.warnings.filterwarnings('ignore', category=DeprecationWarning)
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# set Jupyter to display ALL output from a cell (not just last output)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

# set pandas and numpy options to mae print format nicer
pd.set_option("display.width",100)
pd.set_option("display.max_columns",100)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 500)
np.set_printoptions(linewidth=120, threshold=5000, edgeitems=50, suppress=True)

seed = 42

## Read Loans csv and Create test/train csv files

In [18]:
np.random.seed(seed=seed)

print('reading: data.tsv into revs dataframe...')
revs = pd.read_csv('data.tsv', sep=' ', quotechar='"', escapechar='\\')
print('revs dataframe:', revs.shape)

splits = pd.read_csv('splits.csv',sep='\t', dtype={'split_1':int,'split_2':int, 'split_3':int,})
print('ids dataframe:', splits.shape)

trains = []
tests = []
labels = revs[['new_id','sentiment']]
for i, col in enumerate(splits.columns):
    trains.append(revs.loc[~revs.new_id.isin(splits[col]),:])
    tests.append( revs.loc[ revs.new_id.isin(splits[col]), revs.columns!='sentiment'])
    print('Split', i+1, trains[i].shape, tests[i].shape)

print('Writing train, test, labels csv files...')
fold=0
_ = trains[fold].to_csv('train.csv', index=False)
_ = tests [fold].to_csv('test.csv',  index=False)
print('Files Saved')

reading: data.tsv into revs dataframe...
revs dataframe: (50000, 3)
ids dataframe: (25000, 3)
Split 1 (25000, 3) (25000, 2)
Split 2 (25000, 3) (25000, 2)
Split 3 (25000, 3) (25000, 2)
Writing train, test, labels csv files...
Files Saved


## Model Tuning with skopt

In [4]:
from skopt import gp_minimize, gbrt_minimize
from skopt.plots import plot_convergence
import datetime, warnings

def objective(values):  
    index = str(values)
    if index in cache:
        print('GET FROM CACHE:', index, round(cache[index],4))
        return cache[index]
    
    stop_words = []
    cv = TfidfVectorizer(stop_words=stop_words, ngram_range=(1,values[2]), min_df=values[3], max_df=values[4])  
    X_train = cv.fit_transform(train.review).toarray()
    X_test  = cv.transform(test.review).toarray()
    
    y_train = train.sentiment
    y_test = pd.merge(tests[fold][['new_id']], labels, how='left', on='new_id')
    vocab = np.array(cv.get_feature_names())
    t_test = ttest_ind(X_train[y_train==1, :], X_train[y_train==0, :])
    
    voc_df = pd.DataFrame({'tstat': t_test.statistic, 'word': vocab})
    voc_df['magn_tstat'] = voc_df.tstat.abs()
    voc_df = voc_df.sort_values('magn_tstat',ascending=False)
    vocab_slim = voc_df.word.values[0:values[5]]

    indices = np.where(np.in1d(vocab, vocab_slim))[0]
    X_train = X_train[:, indices]
    X_test  = X_test [:, indices]

    params = {'penalty': values[0], 'C': values[1]}
    model = LogisticRegression(**params, random_state=seed, n_jobs=-1)

    _ = model.fit(X_train, y_train)
    probs = model.predict_proba(X_test)[:,1]
    print(datetime.datetime.now().time().replace(microsecond=0),', Params=',values, X_train.shape[1]) 
    cache[index] = -roc_auc_score(y_test.sentiment, probs)
    return cache[index]

## 0.96501 is current max with params as per project4.ipynb

In [5]:
import warnings
np.random.seed(seed)
warnings.filterwarnings("ignore", category=UserWarning) # turn off already evaluated errors
params={'LogisticRegression': [ 
                ['l1','l2'],
                (8, 50,'log-uniform'), # C
                (2, 5),                # ngram range high
                (15, 150),              # min_df
                (0.2, 1.0, 'uniform'),  # max_df
                (2500, 2999),           # vocab_size
                ],}

fold=0
train = trains[fold].copy()
test  = tests[fold].copy()
train['review'] = revs.review.str.replace('<br /><br />',' ')
test ['review'] =  revs.review.str.replace('<br /><br />',' ')

cache = {}
space = params['LogisticRegression']
result = gbrt_minimize(objective,space,n_random_starts=10, n_calls=500, random_state=seed,verbose=True,n_jobs=-1)

print('Best Params=', result.x, ' Best Score=', round(result.fun,6),'\n')
result['func_vals'] = -result.func_vals
_ = plt.figure(figsize=(15,8))
_ = plot_convergence(result, yscale='log')

warnings.filterwarnings("default", category=UserWarning) # turn on already evaluated errors

Iteration No: 1 started. Evaluating function at random point.
17:51:06 , End Params= ['l1', 25.369990766817722, 4, 81, 0.6789267873576295, 1621] 1621
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 75.4505
Function value obtained: -0.9559
Current minimum: -0.9559
Iteration No: 2 started. Evaluating function at random point.
17:52:46 , End Params= ['l1', 1.218418650222176, 5, 382, 0.6808920093945672, 1630] 1630
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 101.0225
Function value obtained: -0.9494
Current minimum: -0.9559
Iteration No: 3 started. Evaluating function at random point.
17:54:33 , End Params= ['l1', 27.081608642499685, 5, 423, 0.36987128854262097, 2715] 2065
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 106.8128
Function value obtained: -0.9416
Current minimum: -0.9559
Iteration No: 4 started. Evaluating function at random point.
17:55:49 , End Params= ['l2', 8.167611317882542, 3, 31, 0.20565304417577393, 2356]

18:26:41 , End Params= ['l2', 17.400006953262093, 3, 25, 0.9095231463212952, 2951] 2951
Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 70.0693
Function value obtained: -0.9630
Current minimum: -0.9630
Iteration No: 30 started. Searching for the next optimal point.
18:27:01 , End Params= ['l1', 21.94310839634779, 2, 498, 0.20848673724103872, 1757] 1569
Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 19.9188
Function value obtained: -0.9368
Current minimum: -0.9630
Iteration No: 31 started. Searching for the next optimal point.
18:29:07 , End Params= ['l2', 17.263126172526906, 5, 26, 0.256111514846244, 2987] 2987
Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 126.6069
Function value obtained: -0.9631
Current minimum: -0.9631
Iteration No: 32 started. Searching for the next optimal point.
18:30:34 , End Params= ['l2', 8.498092439115318, 3, 19, 0.9657390019572305, 2975] 2975
Iteration No: 32 e

19:02:02 , End Params= ['l2', 15.320165234167213, 2, 15, 0.37290580758312025, 2960] 2960
Iteration No: 57 ended. Search finished for the next optimal point.
Time taken: 54.7370
Function value obtained: -0.9631
Current minimum: -0.9631
Iteration No: 58 started. Searching for the next optimal point.
19:04:21 , End Params= ['l2', 24.646633658726657, 5, 21, 0.3157465832763975, 2962] 2962
Iteration No: 58 ended. Search finished for the next optimal point.
Time taken: 140.9363
Function value obtained: -0.9629
Current minimum: -0.9631
Iteration No: 59 started. Searching for the next optimal point.
19:04:44 , End Params= ['l2', 9.534277554995878, 2, 441, 0.20943436915053049, 2030] 1796
Iteration No: 59 ended. Search finished for the next optimal point.
Time taken: 20.8854
Function value obtained: -0.9413
Current minimum: -0.9631
Iteration No: 60 started. Searching for the next optimal point.
19:06:17 , End Params= ['l2', 15.224094037713582, 4, 27, 0.43054265162304795, 2972] 2972
Iteration No: 

19:41:47 , End Params= ['l2', 13.575057971315202, 4, 40, 0.5738528399264179, 2972] 2972
Iteration No: 85 ended. Search finished for the next optimal point.
Time taken: 83.6939
Function value obtained: -0.9631
Current minimum: -0.9633
Iteration No: 86 started. Searching for the next optimal point.
19:43:09 , End Params= ['l1', 29.729774984040404, 4, 49, 0.5553069668338348, 2003] 2003
Iteration No: 86 ended. Search finished for the next optimal point.
Time taken: 81.8400
Function value obtained: -0.9556
Current minimum: -0.9633
Iteration No: 87 started. Searching for the next optimal point.
19:45:43 , End Params= ['l1', 2.9798274535861284, 4, 12, 0.5800943230937727, 1706] 1706
Iteration No: 87 ended. Search finished for the next optimal point.
Time taken: 154.1572
Function value obtained: -0.9571
Current minimum: -0.9633
Iteration No: 88 started. Searching for the next optimal point.
19:47:39 , End Params= ['l2', 29.83198620525975, 5, 37, 0.9905948176764485, 1600] 1600
Iteration No: 88 e

20:20:43 , End Params= ['l2', 28.539221200538737, 3, 41, 0.7582262692096411, 2981] 2981
Iteration No: 113 ended. Search finished for the next optimal point.
Time taken: 54.8426
Function value obtained: -0.9621
Current minimum: -0.9633
Iteration No: 114 started. Searching for the next optimal point.
20:22:32 , End Params= ['l2', 29.407945797189836, 5, 47, 0.6206448104673034, 2983] 2983
Iteration No: 114 ended. Search finished for the next optimal point.
Time taken: 109.5767
Function value obtained: -0.9617
Current minimum: -0.9633
Iteration No: 115 started. Searching for the next optimal point.
20:22:53 , End Params= ['l2', 20.12934580335168, 2, 490, 0.20358503249983997, 2343] 1598
Iteration No: 115 ended. Search finished for the next optimal point.
Time taken: 19.7608
Function value obtained: -0.9380
Current minimum: -0.9633
Iteration No: 116 started. Searching for the next optimal point.
20:24:31 , End Params= ['l2', 29.429161857269754, 5, 498, 0.21904649790422903, 2050] 1659
Iteratio

Iteration No: 140 ended. Search finished for the next optimal point.
Time taken: 71.6363
Function value obtained: -0.9433
Current minimum: -0.9633
Iteration No: 141 started. Searching for the next optimal point.
20:53:45 , End Params= ['l1', 1.1705713478202913, 2, 37, 0.34955630179440433, 2690] 2690
Iteration No: 141 ended. Search finished for the next optimal point.
Time taken: 32.4781
Function value obtained: -0.9547
Current minimum: -0.9633
Iteration No: 142 started. Searching for the next optimal point.
20:55:42 , End Params= ['l1', 28.658740553758722, 5, 34, 0.22249982743122043, 2964] 2964
Iteration No: 142 ended. Search finished for the next optimal point.
Time taken: 118.3019
Function value obtained: -0.9516
Current minimum: -0.9633
Iteration No: 143 started. Searching for the next optimal point.
20:57:56 , End Params= ['l2', 29.557623462437082, 5, 22, 0.9777576850857743, 2831] 2831
Iteration No: 143 ended. Search finished for the next optimal point.
Time taken: 134.4177
Functio

21:30:33 , End Params= ['l1', 1.5710316304257779, 4, 36, 0.3386263311735124, 2929] 2929
Iteration No: 168 ended. Search finished for the next optimal point.
Time taken: 86.8543
Function value obtained: -0.9569
Current minimum: -0.9633
Iteration No: 169 started. Searching for the next optimal point.
21:32:14 , End Params= ['l2', 16.555125081953, 5, 456, 0.20530100513695615, 2252] 1827
Iteration No: 169 ended. Search finished for the next optimal point.
Time taken: 101.3969
Function value obtained: -0.9401
Current minimum: -0.9633
Iteration No: 170 started. Searching for the next optimal point.
21:32:58 , End Params= ['l2', 29.84065747783885, 2, 19, 0.30442539898594945, 2718] 2718
Iteration No: 170 ended. Search finished for the next optimal point.
Time taken: 42.4923
Function value obtained: -0.9621
Current minimum: -0.9633
Iteration No: 171 started. Searching for the next optimal point.
21:36:08 , End Params= ['l1', 29.998776300383945, 5, 12, 0.849105743501767, 2962] 2962
Iteration No:

Iteration No: 195 ended. Search finished for the next optimal point.
Time taken: 57.2858
Function value obtained: -0.9600
Current minimum: -0.9633
Iteration No: 196 started. Searching for the next optimal point.
22:13:03 , End Params= ['l2', 29.61450907561095, 4, 15, 0.572461148793464, 2856] 2856
Iteration No: 196 ended. Search finished for the next optimal point.
Time taken: 131.8405
Function value obtained: -0.9628
Current minimum: -0.9633
Iteration No: 197 started. Searching for the next optimal point.
22:14:57 , End Params= ['l1', 29.763965707643564, 5, 41, 0.22633461798500215, 2502] 2502
Iteration No: 197 ended. Search finished for the next optimal point.
Time taken: 114.3807
Function value obtained: -0.9539
Current minimum: -0.9633
Iteration No: 198 started. Searching for the next optimal point.
22:16:58 , End Params= ['l1', 28.751025281758373, 5, 28, 0.4639507132743982, 2271] 2271
Iteration No: 198 ended. Search finished for the next optimal point.
Time taken: 121.2729
Function 

22:46:35 , End Params= ['l1', 1.0473534196072245, 2, 21, 0.21005298371351788, 2775] 2775
Iteration No: 223 ended. Search finished for the next optimal point.
Time taken: 40.3317
Function value obtained: -0.9525
Current minimum: -0.9633
Iteration No: 224 started. Searching for the next optimal point.
22:48:35 , End Params= ['l1', 1.0054675872882122, 5, 29, 0.7739960893441777, 2764] 2764
Iteration No: 224 ended. Search finished for the next optimal point.
Time taken: 121.1364
Function value obtained: -0.9506
Current minimum: -0.9633
Iteration No: 225 started. Searching for the next optimal point.
22:50:35 , End Params= ['l1', 1.0125325930428895, 5, 28, 0.4126750855327491, 1681] 1681
Iteration No: 225 ended. Search finished for the next optimal point.
Time taken: 120.0553
Function value obtained: -0.9497
Current minimum: -0.9633
Iteration No: 226 started. Searching for the next optimal point.
22:52:31 , End Params= ['l1', 1.022091526917315, 5, 39, 0.532113185564828, 2737] 2737
Iteration N

Iteration No: 250 ended. Search finished for the next optimal point.
Time taken: 71.0436
Function value obtained: -0.9371
Current minimum: -0.9633
Iteration No: 251 started. Searching for the next optimal point.
23:28:41 , End Params= ['l2', 1.0208550654999455, 4, 24, 0.6957851917414317, 2901] 2901
Iteration No: 251 ended. Search finished for the next optimal point.
Time taken: 94.9753
Function value obtained: -0.9527
Current minimum: -0.9633
Iteration No: 252 started. Searching for the next optimal point.
23:29:26 , End Params= ['l2', 29.79678264579418, 2, 18, 0.5962754246300003, 2945] 2945
Iteration No: 252 ended. Search finished for the next optimal point.
Time taken: 44.3028
Function value obtained: -0.9623
Current minimum: -0.9633
Iteration No: 253 started. Searching for the next optimal point.
23:30:55 , End Params= ['l2', 29.657196595163757, 4, 29, 0.6799210687915054, 2895] 2895
Iteration No: 253 ended. Search finished for the next optimal point.
Time taken: 90.5025
Function val

00:03:15 , End Params= ['l1', 14.032846340579264, 4, 493, 0.20026680509250677, 2840] 1666
Iteration No: 278 ended. Search finished for the next optimal point.
Time taken: 70.9735
Function value obtained: -0.9376
Current minimum: -0.9633
Iteration No: 279 started. Searching for the next optimal point.
00:04:13 , End Params= ['l2', 27.726114013555293, 2, 14, 0.3431338916638484, 2833] 2833
Iteration No: 279 ended. Search finished for the next optimal point.
Time taken: 57.3328
Function value obtained: -0.9625
Current minimum: -0.9633
Iteration No: 280 started. Searching for the next optimal point.
00:05:09 , End Params= ['l1', 5.216356884145747, 3, 42, 0.2651046870364735, 2963] 2963
Iteration No: 280 ended. Search finished for the next optimal point.
Time taken: 56.4699
Function value obtained: -0.9598
Current minimum: -0.9633
Iteration No: 281 started. Searching for the next optimal point.
00:05:52 , End Params= ['l1', 16.37269936823681, 3, 496, 0.2003183925419174, 2881] 1647
Iteration N

00:36:12 , End Params= ['l1', 1.0044059788705824, 2, 17, 0.3165284642496777, 2621] 2621
Iteration No: 306 ended. Search finished for the next optimal point.
Time taken: 44.2789
Function value obtained: -0.9515
Current minimum: -0.9633
Iteration No: 307 started. Searching for the next optimal point.
00:38:07 , End Params= ['l2', 29.781425682293012, 5, 40, 0.8638035249062261, 2945] 2945
Iteration No: 307 ended. Search finished for the next optimal point.
Time taken: 116.1266
Function value obtained: -0.9620
Current minimum: -0.9633
Iteration No: 308 started. Searching for the next optimal point.
00:38:40 , End Params= ['l1', 4.368298882361206, 2, 43, 0.2689825097511225, 2891] 2891
Iteration No: 308 ended. Search finished for the next optimal point.
Time taken: 31.2839
Function value obtained: -0.9600
Current minimum: -0.9633
Iteration No: 309 started. Searching for the next optimal point.
00:39:39 , End Params= ['l2', 28.117403976169268, 2, 13, 0.5004348536800707, 2909] 2909
Iteration No

Iteration No: 333 ended. Search finished for the next optimal point.
Time taken: 55.2821
Function value obtained: -0.9589
Current minimum: -0.9633
Iteration No: 334 started. Searching for the next optimal point.
01:14:15 , End Params= ['l1', 22.477754030394358, 4, 494, 0.2227088442709139, 2211] 1679
Iteration No: 334 ended. Search finished for the next optimal point.
Time taken: 71.3691
Function value obtained: -0.9372
Current minimum: -0.9633
Iteration No: 335 started. Searching for the next optimal point.
01:16:14 , End Params= ['l2', 28.673022400726154, 5, 32, 0.8306268374846453, 2949] 2949
Iteration No: 335 ended. Search finished for the next optimal point.
Time taken: 119.1250
Function value obtained: -0.9624
Current minimum: -0.9633
Iteration No: 336 started. Searching for the next optimal point.
01:17:12 , End Params= ['l1', 3.9603507877990656, 3, 39, 0.20438103033606758, 2938] 2938
Iteration No: 336 ended. Search finished for the next optimal point.
Time taken: 56.9139
Function

01:57:43 , End Params= ['l1', 3.4549850162396036, 3, 441, 0.9457116461841373, 2863] 2002
Iteration No: 361 ended. Search finished for the next optimal point.
Time taken: 43.8719
Function value obtained: -0.9460
Current minimum: -0.9633
Iteration No: 362 started. Searching for the next optimal point.
01:58:05 , End Params= ['l1', 25.38510136649801, 2, 500, 0.8593424605366957, 2791] 1673
Iteration No: 362 ended. Search finished for the next optimal point.
Time taken: 21.6762
Function value obtained: -0.9409
Current minimum: -0.9633
Iteration No: 363 started. Searching for the next optimal point.
01:58:27 , End Params= ['l1', 3.811177230596857, 2, 414, 0.9463697546516778, 2735] 2058
Iteration No: 363 ended. Search finished for the next optimal point.
Time taken: 22.0080
Function value obtained: -0.9468
Current minimum: -0.9633
Iteration No: 364 started. Searching for the next optimal point.
01:59:23 , End Params= ['l1', 28.69307387188222, 3, 43, 0.7211346614050973, 2794] 2794
Iteration No

02:34:01 , End Params= ['l2', 28.639896654112682, 3, 25, 0.4129504706872709, 1638] 1638
Iteration No: 389 ended. Search finished for the next optimal point.
Time taken: 63.5347
Function value obtained: -0.9594
Current minimum: -0.9633
Iteration No: 390 started. Searching for the next optimal point.
02:36:20 , End Params= ['l1', 28.777369631298285, 5, 19, 0.3455934177218142, 2646] 2646
Iteration No: 390 ended. Search finished for the next optimal point.
Time taken: 140.8787
Function value obtained: -0.9550
Current minimum: -0.9633
Iteration No: 391 started. Searching for the next optimal point.
02:38:31 , End Params= ['l1', 4.160543525140023, 4, 15, 0.5760595375257628, 2931] 2931
Iteration No: 391 ended. Search finished for the next optimal point.
Time taken: 130.0806
Function value obtained: -0.9604
Current minimum: -0.9633
Iteration No: 392 started. Searching for the next optimal point.
02:42:12 , End Params= ['l2', 28.74737197893305, 5, 10, 0.2173059400733889, 2928] 2928
Iteration No

Iteration No: 416 ended. Search finished for the next optimal point.
Time taken: 55.5242
Function value obtained: -0.9522
Current minimum: -0.9633
Iteration No: 417 started. Searching for the next optimal point.
03:11:17 , End Params= ['l1', 4.11037228065301, 3, 17, 0.29460154059251276, 2665] 2665
Iteration No: 417 ended. Search finished for the next optimal point.
Time taken: 84.1744
Function value obtained: -0.9604
Current minimum: -0.9633
Iteration No: 418 started. Searching for the next optimal point.
03:13:27 , End Params= ['l2', 28.78769941580091, 3, 11, 0.8245599397804633, 2814] 2814
Iteration No: 418 ended. Search finished for the next optimal point.
Time taken: 130.4231
Function value obtained: -0.9628
Current minimum: -0.9633
Iteration No: 419 started. Searching for the next optimal point.
03:14:09 , End Params= ['l1', 3.7626644305518693, 2, 20, 0.6262023456570847, 2921] 2921
Iteration No: 419 ended. Search finished for the next optimal point.
Time taken: 41.8278
Function val

03:55:16 , End Params= ['l2', 28.125809104601167, 2, 14, 0.687145017167879, 2703] 2703
Iteration No: 444 ended. Search finished for the next optimal point.
Time taken: 61.2663
Function value obtained: -0.9622
Current minimum: -0.9633
Iteration No: 445 started. Searching for the next optimal point.
03:55:59 , End Params= ['l2', 22.617291872775613, 3, 492, 0.21042093002169437, 2888] 1672
Iteration No: 445 ended. Search finished for the next optimal point.
Time taken: 42.9682
Function value obtained: -0.9381
Current minimum: -0.9633
Iteration No: 446 started. Searching for the next optimal point.
03:57:08 , End Params= ['l1', 4.638203372381446, 2, 11, 0.22126432396429616, 2881] 2881
Iteration No: 446 ended. Search finished for the next optimal point.
Time taken: 69.3548
Function value obtained: -0.9597
Current minimum: -0.9633
Iteration No: 447 started. Searching for the next optimal point.
03:58:05 , End Params= ['l1', 28.64316569499626, 2, 15, 0.7979856688766385, 2085] 2085
Iteration No

Iteration No: 471 ended. Search finished for the next optimal point.
Time taken: 102.1234
Function value obtained: -0.9454
Current minimum: -0.9633
Iteration No: 472 started. Searching for the next optimal point.
04:40:46 , End Params= ['l2', 28.754935646978247, 2, 17, 0.24686756190428982, 2999] 2999
Iteration No: 472 ended. Search finished for the next optimal point.
Time taken: 45.7977
Function value obtained: -0.9625
Current minimum: -0.9633
Iteration No: 473 started. Searching for the next optimal point.
04:42:49 , End Params= ['l1', 4.282285444390646, 5, 25, 0.7851079196567978, 2954] 2954
Iteration No: 473 ended. Search finished for the next optimal point.
Time taken: 124.7777
Function value obtained: -0.9606
Current minimum: -0.9633
Iteration No: 474 started. Searching for the next optimal point.
04:43:20 , End Params= ['l1', 28.959933431309878, 2, 49, 0.4218539491943939, 2603] 2603
Iteration No: 474 ended. Search finished for the next optimal point.
Time taken: 30.0512
Function 

05:15:34 , End Params= ['l1', 4.256232663303429, 4, 32, 0.7806410457041173, 2722] 2722
Iteration No: 499 ended. Search finished for the next optimal point.
Time taken: 88.7625
Function value obtained: -0.9603
Current minimum: -0.9633
Iteration No: 500 started. Searching for the next optimal point.
05:16:38 , End Params= ['l2', 29.677419819185555, 2, 12, 0.42320464750689835, 2923] 2923
Iteration No: 500 ended. Search finished for the next optimal point.
Time taken: 63.9255
Function value obtained: -0.9625
Current minimum: -0.9633


NameError: name 'model_type' is not defined

### Logistic Regression best results - sorted

In [7]:
sorted_d = sorted(cache.items(), key=lambda x: x[1])
temp = []
for i in range(len(sorted_d)):
    temp.append((sorted_d[i][0], round(sorted_d[i][1],5)))
    print('{} {}'.format(round(sorted_d[i][1],5), sorted_d[i][0]))

-0.96329 ['l2', 16.900136712771832, 2, 20, 0.29938491561199543, 2962]
-0.96313 ['l2', 17.367160822540775, 5, 14, 0.22557157021271204, 2963]
-0.96313 ['l2', 15.817571222287645, 3, 18, 0.3462623630302616, 2981]
-0.96312 ['l2', 15.320165234167213, 2, 15, 0.37290580758312025, 2960]
-0.96309 ['l2', 17.263126172526906, 5, 26, 0.256111514846244, 2987]
-0.96309 ['l2', 19.67484640971566, 4, 22, 0.47584229177177884, 2995]
-0.96309 ['l2', 12.80072247980821, 2, 10, 0.27774484976536207, 2986]
-0.96308 ['l2', 13.575057971315202, 4, 40, 0.5738528399264179, 2972]
-0.96307 ['l2', 16.248303793342004, 4, 25, 0.2815360003152947, 2920]
-0.96305 ['l2', 15.224094037713582, 4, 27, 0.43054265162304795, 2972]
-0.96303 ['l2', 20.60850648817734, 5, 13, 0.4646762914127432, 2936]
-0.963 ['l2', 15.150914423039172, 4, 15, 0.3360924616190115, 2886]
-0.96299 ['l2', 16.610098772374403, 4, 19, 0.6257681970011683, 2946]
-0.96298 ['l2', 9.947248574481454, 3, 37, 0.2629734047218675, 2918]
-0.96297 ['l2', 15.034121846061778,

## Find any stop words that improve AUC

In [25]:
from nltk.corpus import stopwords
np.random.seed(seed=seed)
words = ['FUCKING_BASELINE']
words.extend(["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "their", "they",
             "his", "her", "she", "he", "a", "an", "and", "is", "was", "are", "were", "him", "himself", "has",
             "have", "it", "its", "of", "one", "for", "the", "us", "this"])
words.extend(stop_words.ENGLISH_STOP_WORDS)
fold=0
train = trains[fold].copy()
test  = tests[fold].copy()
train['review'] =  revs.review.str.replace('<br /><br />',' ')
test ['review'] =  revs.review.str.replace('<br /><br />',' ')
y_test = pd.merge(tests[fold][['new_id']], labels, how='left', on='new_id')
y_train = train.sentiment

for word in words:
    start_time = time.time()

    cv = TfidfVectorizer(stop_words=[word,], ngram_range=(1,2), min_df=20, max_df=0.3)  
    X_train = cv.fit_transform(train.review).toarray()
    X_test  = cv.transform(test.review).toarray()
    
    vocab = np.array(cv.get_feature_names())
    t_test = ttest_ind(X_train[y_train==1, :], X_train[y_train==0, :])
    
    voc_df = pd.DataFrame({'tstat': t_test.statistic, 'word': vocab})
    voc_df['magn_tstat'] = voc_df.tstat.abs()
    voc_df = voc_df.sort_values('magn_tstat',ascending=False)
    vocab_slim = voc_df.word.values[0:2900]

    indices = np.where(np.in1d(vocab, vocab_slim))[0]
    X_train = X_train[:, indices]
    X_test  = X_test [:, indices]

    model = LogisticRegression(penalty='l2',C=17, random_state=seed)
    _ = model.fit(X_train, y_train)
    probs = model.predict_proba(X_test)[:,1]
    print('Split:{}, AUC:{:<7.5}, Vocab:{}, RunTime:{:6.2f} secs, {}'.format(
        fold,round(roc_auc_score(y_test.sentiment, probs),5), X_train.shape[1], round(time.time()-start_time,2), word))

Split:0, AUC:0.96303, Vocab:2900, RunTime: 50.53 secs, NO_FUCKING_IMPROVEMENT
Split:0, AUC:0.96303, Vocab:2900, RunTime: 49.34 secs, i
Split:0, AUC:0.96309, Vocab:2900, RunTime: 50.17 secs, me
Split:0, AUC:0.96309, Vocab:2900, RunTime: 50.06 secs, my
Split:0, AUC:0.96306, Vocab:2900, RunTime: 50.05 secs, myself
Split:0, AUC:0.96305, Vocab:2900, RunTime: 50.76 secs, we
Split:0, AUC:0.96301, Vocab:2900, RunTime: 46.32 secs, our
Split:0, AUC:0.96303, Vocab:2900, RunTime: 46.67 secs, ours
Split:0, AUC:0.96303, Vocab:2900, RunTime: 48.65 secs, ourselves
Split:0, AUC:0.96301, Vocab:2900, RunTime: 50.11 secs, you
Split:0, AUC:0.96309, Vocab:2900, RunTime: 47.10 secs, your
Split:0, AUC:0.96303, Vocab:2900, RunTime: 49.85 secs, yours
Split:0, AUC:0.96304, Vocab:2900, RunTime: 48.91 secs, their
Split:0, AUC:0.96307, Vocab:2900, RunTime: 49.52 secs, they
Split:0, AUC:0.96315, Vocab:2900, RunTime: 49.80 secs, his
Split:0, AUC:0.96307, Vocab:2900, RunTime: 49.40 secs, her
Split:0, AUC:0.96308, Voca

Split:0, AUC:0.96285, Vocab:2900, RunTime: 49.38 secs, see
Split:0, AUC:0.96298, Vocab:2900, RunTime: 49.40 secs, that
Split:0, AUC:0.96305, Vocab:2900, RunTime: 49.05 secs, whatever
Split:0, AUC:0.96302, Vocab:2900, RunTime: 48.63 secs, sometimes
Split:0, AUC:0.96307, Vocab:2900, RunTime: 48.46 secs, her
Split:0, AUC:0.96299, Vocab:2900, RunTime: 49.87 secs, below
Split:0, AUC:0.96307, Vocab:2900, RunTime: 48.88 secs, last
Split:0, AUC:0.96301, Vocab:2900, RunTime: 48.56 secs, more
Split:0, AUC:0.96303, Vocab:2900, RunTime: 48.90 secs, beside
Split:0, AUC:0.96303, Vocab:2900, RunTime: 49.81 secs, four
Split:0, AUC:0.9631 , Vocab:2900, RunTime: 49.34 secs, out
Split:0, AUC:0.96304, Vocab:2900, RunTime: 49.59 secs, de
Split:0, AUC:0.96303, Vocab:2900, RunTime: 49.24 secs, latter
Split:0, AUC:0.96307, Vocab:2900, RunTime: 61.69 secs, name
Split:0, AUC:0.96304, Vocab:2900, RunTime: 92.60 secs, nowhere
Split:0, AUC:0.96305, Vocab:2900, RunTime: 73.28 secs, already
Split:0, AUC:0.96303, Voc

Split:0, AUC:0.96303, Vocab:2900, RunTime: 51.25 secs, therefore
Split:0, AUC:0.96303, Vocab:2900, RunTime: 52.07 secs, whereby
Split:0, AUC:0.96306, Vocab:2900, RunTime: 51.18 secs, former
Split:0, AUC:0.96304, Vocab:2900, RunTime: 53.07 secs, interest
Split:0, AUC:0.96303, Vocab:2900, RunTime: 50.15 secs, yet
Split:0, AUC:0.96315, Vocab:2900, RunTime: 51.98 secs, other
Split:0, AUC:0.96305, Vocab:2900, RunTime: 50.78 secs, otherwise
Split:0, AUC:0.96309, Vocab:2900, RunTime: 51.71 secs, your
Split:0, AUC:0.96307, Vocab:2900, RunTime: 51.21 secs, between
Split:0, AUC:0.96304, Vocab:2900, RunTime: 53.30 secs, themselves
Split:0, AUC:0.96304, Vocab:2900, RunTime: 51.97 secs, towards
Split:0, AUC:0.96309, Vocab:2900, RunTime: 53.30 secs, up
Split:0, AUC:0.96303, Vocab:2900, RunTime: 53.91 secs, noone
Split:0, AUC:0.96297, Vocab:2900, RunTime: 51.62 secs, at
Split:0, AUC:0.96283, Vocab:2900, RunTime: 52.30 secs, still
Split:0, AUC:0.96303, Vocab:2900, RunTime: 50.83 secs, thereafter
Split

# BELOW HERE IS MISC STUFF