In [129]:
import pandas as pd

# Assign custom column names 'label' and 'message'
messages = pd.read_csv('SMSSpamCollection.txt', sep='\t', header=None, names=['label', 'message'])

print(messages.head())  # Display the first few rows


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [130]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [80]:
messages.shape

(5572, 2)

In [81]:
messages['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."

In [82]:
#data cleaning and preprocessing

import re
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps=PorterStemmer()


In [131]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [132]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

# BAG OF WORDS

In [133]:
#creating a Bag of words Model
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500,binary=True,ngram_range=(2,2))
X=cv.fit_transform(corpus).toarray()

In [134]:
X[1]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [135]:
X.shape

(5572, 2500)

In [139]:
y=pd.get_dummies(messages['label']) #doing one hot encoding on our label part
y=y.iloc[:,1].astype(int).values #we are taking only one column

In [140]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [141]:
#train test split

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [142]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [93]:
# prediction
y_pred=spam_detect_model.predict(X_test)

In [143]:
from sklearn.metrics import accuracy_score
score=accuracy_score(y_test,y_pred)
print(score)

0.9811659192825112


In [144]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.87      0.93       160

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# TFIDF Model

In [145]:
#Creating TFIDF model

from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(max_features=2500,ngram_range=(1,2))
X=tv.fit_transform(corpus).toarray()

In [146]:
#train test split
from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [147]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model=MultinomialNB().fit(X_train,y_train)

In [148]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [149]:
from sklearn.metrics import accuracy_score
score=accuracy_score(y_test,y_pred)
print(score)

0.97847533632287


In [150]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.85      0.92       160

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



# WORD TO VEC
1.skipgram
2.CBOW

In [102]:
! pip install gensim

Defaulting to user installation because normal site-packages is not writeable


In [103]:
import gensim.downloader as api
wv=api.load('word2vec-google-news-300')

In [151]:
vec_king=wv['king']

In [152]:
vec_king

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

In [153]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [154]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [155]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [109]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [110]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [111]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [112]:
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
 

In [113]:
import gensim

In [156]:
### lets train Word2vec form sctratch
import gensim
model=gensim.models.Word2Vec(words,window=5,min_count=2)

In [157]:
model.wv.index_to_key #all vocbulary

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'day',
 'ok',
 'free',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'love',
 'text',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'please',
 'later',
 'pls',
 'co',
 'msg',
 'min',
 'make',
 'night',
 'dear',
 'message',
 'well',
 'say',
 'thing',
 'much',
 'great',
 'oh',
 'hope',
 'claim',
 'hey',
 'number',
 'give',
 'happy',
 'work',
 'wat',
 'friend',
 'yes',
 'way',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'tone',
 'win',
 'ask',
 'said',
 'cash',
 'life',
 'amp',
 'im',
 'yeah',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'service',
 'year',
 'thanks',
 'uk',
 'last',
 'would',
 'anything',
 'com',
 'care',
 'lol',
 'nokia',
 'also',
 'feel',
 'every',
 'keep',
 'pick',
 'sure',
 'sent',
 'urgent',
 'contact',


In [158]:
model.corpus_count #total vocab count


5564

In [159]:
model.wv.similar_by_word('prize')

[('claim', 0.9993420243263245),
 ('line', 0.9992696046829224),
 ('call', 0.9992576241493225),
 ('cash', 0.9991783499717712),
 ('draw', 0.999076247215271),
 ('show', 0.9990610480308533),
 ('guaranteed', 0.9990337491035461),
 ('free', 0.9989952445030212),
 ('mobile', 0.9989609122276306),
 ('contact', 0.9989234805107117)]

In [160]:
model.wv.similar_by_word('happy')

[('special', 0.9995012283325195),
 ('day', 0.9994844794273376),
 ('year', 0.9993968605995178),
 ('life', 0.9993955492973328),
 ('dear', 0.9993853569030762),
 ('make', 0.9993680715560913),
 ('got', 0.9993674755096436),
 ('give', 0.9993667006492615),
 ('one', 0.999360978603363),
 ('amp', 0.9993578195571899)]

In [161]:
import numpy as np
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [120]:
! pip install tqdm

Defaulting to user installation because normal site-packages is not writeable


In [162]:
from tqdm import tqdm

In [163]:
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
 

In [123]:
type(model.wv.index_to_key)

list

In [164]:
#apply for the entire sentences
X=[]
for i in tqdm(range(len(words))):
    print("Hello",i)
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 15%|█▌        | 856/5564 [00:00<00:00, 8550.75it/s]

Hello 0
Hello 1
Hello 2
Hello 3
Hello 4
Hello 5
Hello 6
Hello 7
Hello 8
Hello 9
Hello 10
Hello 11
Hello 12
Hello 13
Hello 14
Hello 15
Hello 16
Hello 17
Hello 18
Hello 19
Hello 20
Hello 21
Hello 22
Hello 23
Hello 24
Hello 25
Hello 26
Hello 27
Hello 28
Hello 29
Hello 30
Hello 31
Hello 32
Hello 33
Hello 34
Hello 35
Hello 36
Hello 37
Hello 38
Hello 39
Hello 40
Hello 41
Hello 42
Hello 43
Hello 44
Hello 45
Hello 46
Hello 47
Hello 48
Hello 49
Hello 50
Hello 51
Hello 52
Hello 53
Hello 54
Hello 55
Hello 56
Hello 57
Hello 58
Hello 59
Hello 60
Hello 61
Hello 62
Hello 63
Hello 64
Hello 65
Hello 66
Hello 67
Hello 68
Hello 69
Hello 70
Hello 71
Hello 72
Hello 73
Hello 74
Hello 75
Hello 76
Hello 77
Hello 78
Hello 79
Hello 80
Hello 81
Hello 82
Hello 83
Hello 84
Hello 85
Hello 86
Hello 87
Hello 88
Hello 89
Hello 90
Hello 91
Hello 92
Hello 93
Hello 94
Hello 95
Hello 96
Hello 97
Hello 98
Hello 99
Hello 100
Hello 101
Hello 102
Hello 103
Hello 104
Hello 105
Hello 106
Hello 107
Hello 108
Hello 109
Hello 110


 46%|████▌     | 2536/5564 [00:00<00:00, 8107.93it/s]

Hello 1585
Hello 1586
Hello 1587
Hello 1588
Hello 1589
Hello 1590
Hello 1591
Hello 1592
Hello 1593
Hello 1594
Hello 1595
Hello 1596
Hello 1597
Hello 1598
Hello 1599
Hello 1600
Hello 1601
Hello 1602
Hello 1603
Hello 1604
Hello 1605
Hello 1606
Hello 1607
Hello 1608
Hello 1609
Hello 1610
Hello 1611
Hello 1612
Hello 1613
Hello 1614
Hello 1615
Hello 1616
Hello 1617
Hello 1618
Hello 1619
Hello 1620
Hello 1621
Hello 1622
Hello 1623
Hello 1624
Hello 1625
Hello 1626
Hello 1627
Hello 1628
Hello 1629
Hello 1630
Hello 1631
Hello 1632
Hello 1633
Hello 1634
Hello 1635
Hello 1636
Hello 1637
Hello 1638
Hello 1639
Hello 1640
Hello 1641
Hello 1642
Hello 1643
Hello 1644
Hello 1645
Hello 1646
Hello 1647
Hello 1648
Hello 1649
Hello 1650
Hello 1651
Hello 1652
Hello 1653
Hello 1654
Hello 1655
Hello 1656
Hello 1657
Hello 1658
Hello 1659
Hello 1660
Hello 1661
Hello 1662
Hello 1663
Hello 1664
Hello 1665
Hello 1666
Hello 1667
Hello 1668
Hello 1669
Hello 1670
Hello 1671
Hello 1672
Hello 1673
Hello 1674
Hello 1675

 76%|███████▌  | 4203/5564 [00:00<00:00, 7050.83it/s]

Hello 3283
Hello 3284
Hello 3285
Hello 3286
Hello 3287
Hello 3288
Hello 3289
Hello 3290
Hello 3291
Hello 3292
Hello 3293
Hello 3294
Hello 3295
Hello 3296
Hello 3297
Hello 3298
Hello 3299
Hello 3300
Hello 3301
Hello 3302
Hello 3303
Hello 3304
Hello 3305
Hello 3306
Hello 3307
Hello 3308
Hello 3309
Hello 3310
Hello 3311
Hello 3312
Hello 3313
Hello 3314
Hello 3315
Hello 3316
Hello 3317
Hello 3318
Hello 3319
Hello 3320
Hello 3321
Hello 3322
Hello 3323
Hello 3324
Hello 3325
Hello 3326
Hello 3327
Hello 3328
Hello 3329
Hello 3330
Hello 3331
Hello 3332
Hello 3333
Hello 3334
Hello 3335
Hello 3336
Hello 3337
Hello 3338
Hello 3339
Hello 3340
Hello 3341
Hello 3342
Hello 3343
Hello 3344
Hello 3345
Hello 3346
Hello 3347
Hello 3348
Hello 3349
Hello 3350
Hello 3351
Hello 3352
Hello 3353
Hello 3354
Hello 3355
Hello 3356
Hello 3357
Hello 3358
Hello 3359
Hello 3360
Hello 3361
Hello 3362
Hello 3363
Hello 3364
Hello 3365
Hello 3366
Hello 3367
Hello 3368
Hello 3369
Hello 3370
Hello 3371
Hello 3372
Hello 3373

100%|██████████| 5564/5564 [00:00<00:00, 7261.51it/s]


Hello 4529
Hello 4530
Hello 4531
Hello 4532
Hello 4533
Hello 4534
Hello 4535
Hello 4536
Hello 4537
Hello 4538
Hello 4539
Hello 4540
Hello 4541
Hello 4542
Hello 4543
Hello 4544
Hello 4545
Hello 4546
Hello 4547
Hello 4548
Hello 4549
Hello 4550
Hello 4551
Hello 4552
Hello 4553
Hello 4554
Hello 4555
Hello 4556
Hello 4557
Hello 4558
Hello 4559
Hello 4560
Hello 4561
Hello 4562
Hello 4563
Hello 4564
Hello 4565
Hello 4566
Hello 4567
Hello 4568
Hello 4569
Hello 4570
Hello 4571
Hello 4572
Hello 4573
Hello 4574
Hello 4575
Hello 4576
Hello 4577
Hello 4578
Hello 4579
Hello 4580
Hello 4581
Hello 4582
Hello 4583
Hello 4584
Hello 4585
Hello 4586
Hello 4587
Hello 4588
Hello 4589
Hello 4590
Hello 4591
Hello 4592
Hello 4593
Hello 4594
Hello 4595
Hello 4596
Hello 4597
Hello 4598
Hello 4599
Hello 4600
Hello 4601
Hello 4602
Hello 4603
Hello 4604
Hello 4605
Hello 4606
Hello 4607
Hello 4608
Hello 4609
Hello 4610
Hello 4611
Hello 4612
Hello 4613
Hello 4614
Hello 4615
Hello 4616
Hello 4617
Hello 4618
Hello 4619

In [165]:
type(X)

list

In [170]:
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [171]:
X[0]

array([-0.08774913,  0.25675806,  0.02067648,  0.04188378,  0.03981001,
       -0.35555616,  0.09490886,  0.52800256, -0.24060416, -0.12325304,
       -0.13757092, -0.31769344, -0.00414256,  0.11912688,  0.08669674,
       -0.2412651 ,  0.02137314, -0.3413624 ,  0.00895688, -0.54256517,
        0.18082367,  0.13926126,  0.12277317, -0.10961983, -0.17750876,
        0.0444521 , -0.17021191, -0.16345058, -0.25358775,  0.01292655,
        0.30277717,  0.02608819,  0.07103701, -0.13099909, -0.10324913,
        0.23211579,  0.08031177, -0.20113625, -0.15909605, -0.48359764,
       -0.00365404, -0.23505628, -0.06023689,  0.01228313,  0.21958973,
       -0.04613692, -0.1668754 , -0.0345971 ,  0.09904519,  0.16995616,
        0.11293384, -0.18477114, -0.02605265, -0.03953466, -0.1197964 ,
        0.10552099,  0.18883656, -0.03261493, -0.337032  ,  0.07989167,
        0.08525756,  0.21155195, -0.1791838 , -0.04147051, -0.30161044,
        0.1315917 ,  0.09010618,  0.17413945, -0.3309895 ,  0.33

In [172]:
X

[array([-0.08774913,  0.25675806,  0.02067648,  0.04188378,  0.03981001,
        -0.35555616,  0.09490886,  0.52800256, -0.24060416, -0.12325304,
        -0.13757092, -0.31769344, -0.00414256,  0.11912688,  0.08669674,
        -0.2412651 ,  0.02137314, -0.3413624 ,  0.00895688, -0.54256517,
         0.18082367,  0.13926126,  0.12277317, -0.10961983, -0.17750876,
         0.0444521 , -0.17021191, -0.16345058, -0.25358775,  0.01292655,
         0.30277717,  0.02608819,  0.07103701, -0.13099909, -0.10324913,
         0.23211579,  0.08031177, -0.20113625, -0.15909605, -0.48359764,
        -0.00365404, -0.23505628, -0.06023689,  0.01228313,  0.21958973,
        -0.04613692, -0.1668754 , -0.0345971 ,  0.09904519,  0.16995616,
         0.11293384, -0.18477114, -0.02605265, -0.03953466, -0.1197964 ,
         0.10552099,  0.18883656, -0.03261493, -0.337032  ,  0.07989167,
         0.08525756,  0.21155195, -0.1791838 , -0.04147051, -0.30161044,
         0.1315917 ,  0.09010618,  0.17413945, -0.3

In [173]:
X[5]

array([-0.09746905,  0.28652424,  0.02533703,  0.05290459,  0.04852077,
       -0.39835843,  0.10696878,  0.59327775, -0.26955107, -0.13610362,
       -0.1547965 , -0.35928795, -0.00124746,  0.13277125,  0.09856978,
       -0.27256605,  0.0209736 , -0.3820945 ,  0.00718689, -0.61183846,
        0.20551935,  0.15582837,  0.13645487, -0.12739068, -0.20030555,
        0.04680204, -0.19104478, -0.18378963, -0.2881008 ,  0.01623211,
        0.34391773,  0.03015012,  0.08149822, -0.14693172, -0.11081085,
        0.25962532,  0.08635655, -0.22863917, -0.18238248, -0.54956996,
       -0.00533935, -0.26728404, -0.0685292 ,  0.01572502,  0.24857713,
       -0.05238121, -0.18186705, -0.03917395,  0.10878037,  0.1899123 ,
        0.1228256 , -0.20795289, -0.03296803, -0.04479235, -0.13049205,
        0.12176393,  0.21074502, -0.03206192, -0.37587228,  0.08790491,
        0.09746434,  0.24009593, -0.20660229, -0.04222783, -0.33852017,
        0.1533654 ,  0.10105471,  0.20031269, -0.3746679 ,  0.38

In [None]:
#assignment
# #train test split
#apply the model

In [None]:
#also data datset for another assignment
#imdb dataset of 50k reviews