<a href="https://www.kaggle.com/code/gopimali/pretrained-and-fine-tunned?scriptVersionId=131617343" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Importing

In [1]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from unidecode import unidecode
from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords

In [2]:
# Location of the pretrained GoogleNews word2vec binary in the Kaggle input directory.
path = '/kaggle/input/nlp-model/GoogleNews-vectors-negative300.bin'
# Load the pretrained word vectors (binary word2vec format) as KeyedVectors.
model = KeyedVectors.load_word2vec_format(path,binary=True)
# Labelled training data; the preview further down shows `text` and `label` columns.
data = pd.read_csv('/kaggle/input/nlp-task/Train_nlp.csv')

In [3]:
# Sanity check: look up the embedding of a common word to confirm the vectors loaded.
model['good']

array([ 0.04052734,  0.0625    , -0.01745605,  0.07861328,  0.03271484,
       -0.01263428,  0.00964355,  0.12353516, -0.02148438,  0.15234375,
       -0.05834961, -0.10644531,  0.02124023,  0.13574219, -0.13183594,
        0.17675781,  0.27148438,  0.13769531, -0.17382812, -0.14160156,
       -0.03076172,  0.19628906, -0.03295898,  0.125     ,  0.25390625,
        0.12695312, -0.15234375,  0.03198242,  0.01135254, -0.01361084,
       -0.12890625,  0.01019287,  0.23925781, -0.08447266,  0.140625  ,
        0.13085938, -0.04516602,  0.06494141,  0.02539062,  0.05615234,
        0.24609375, -0.20507812,  0.23632812, -0.00860596, -0.02294922,
        0.05078125,  0.10644531, -0.03564453,  0.08740234, -0.05712891,
        0.08496094,  0.23535156, -0.10107422, -0.03564453, -0.04736328,
        0.04736328, -0.14550781, -0.10986328,  0.14746094, -0.23242188,
       -0.07275391,  0.19628906, -0.37890625, -0.07226562,  0.04833984,
        0.11914062,  0.06103516, -0.12109375, -0.27929688,  0.05

In [4]:
# Preview the first two rows to see which columns were loaded.
data.head(2)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0


In [5]:
# Start from NLTK's English stopwords, then take the negation words back out
# of the list so that cleaning does not discard them.
stopword_list = stopwords.words("english")
for kept_word in ("no", "nor", "not"):
    stopword_list.remove(kept_word)

def clean_text(data):
    """Tokenize a raw text string and return its cleaned, lowercased tokens.

    A token is kept only when all of the following hold:
      * its lowercased form is not found in ``punctuation``;
      * its lowercased form is not in ``stopword_list``;
      * it is longer than two characters;
      * it is purely alphabetic.

    Parameters
    ----------
    data
        Raw text of a single document, passed to ``word_tokenize``.

    Returns
    -------
    list
        The lowercased tokens that passed every filter, in their original order.
    """
    kept = []
    for raw_token in word_tokenize(data):
        lowered = raw_token.lower()
        if lowered in punctuation or lowered in stopword_list:
            continue
        # NOTE(review): any token containing an apostrophe (e.g. "n't", if the
        # tokenizer emits it for contractions) fails isalpha() and is dropped,
        # even though "not" itself is kept - confirm this is intended.
        if len(raw_token) <= 2 or not raw_token.isalpha():
            continue
        kept.append(lowered)
    return kept

# Clean every document's text; the result is a Series holding one token list per row.
tokens = data["text"].apply(clean_text)

In [6]:
# Keep the cleaned token lists next to the raw text they came from.
data['tokens'] = tokens

In [7]:
# Preview the frame to confirm the `tokens` column was added.
data.head()

Unnamed: 0,text,label,tokens
0,I grew up (b. 1965) watching and loving the Th...,0,"[grew, watching, loving, thunderbirds, mates, ..."
1,"When I put this movie in my DVD player, and sa...",0,"[put, movie, dvd, player, sat, coke, chips, ex..."
2,Why do people who do not know what a particula...,0,"[people, not, know, particular, time, past, li..."
3,Even though I have great interest in Biblical ...,0,"[even, though, great, interest, biblical, movi..."
4,Im a die hard Dads Army fan and nothing will e...,1,"[die, hard, dads, army, fan, nothing, ever, ch..."


# Document vectors from the pretrained model

In [8]:
# Build one fixed-length vector per document by averaging the pretrained
# embeddings of the document's in-vocabulary tokens.
#
# vector_with_pretrained     : one vector per document, in row order.
# pretrained_key_error_words : tokens missing from the pretrained vocabulary,
#                              in encounter order (duplicates are kept).
vector_with_pretrained = []
pretrained_key_error_words = []
# Iterate over the Series values directly rather than `tokens[t]` for
# t in range(len(tokens)): the latter is a label lookup and only works
# when the index happens to be the default 0..n-1 range.
for doc_tokens in tokens:
    vector_list = []
    for word in doc_tokens:
        try:
            vector_list.append(model[word])
        except KeyError:
            # Word is not in the pretrained vocabulary; record it and skip it.
            pretrained_key_error_words.append(word)
    if vector_list:
        final_token = np.mean(vector_list, axis=0)
    else:
        # No token of this document is in the vocabulary (or the document is
        # empty). np.mean([]) would return a scalar NaN with a RuntimeWarning,
        # leaving this row with a different shape from all the others, so use
        # a zero vector of the embedding size instead.
        final_token = np.zeros(model.vector_size, dtype=np.float32)
    vector_with_pretrained.append(final_token)
    

In [9]:
# Show a sample of the tokens that the pretrained vocabulary did not contain.
print(pretrained_key_error_words[:10])

['virgil', 'jonatha', 'frakes', 'videoshop', 'isnt', 'didnt', 'moonlanding', 'cheque', 'frazer', 'mcnealy']


In [10]:
# Attach the per-document averaged pretrained vectors as a new column.
data['vector_with_pretrained'] = vector_with_pretrained

In [11]:
# Preview the frame to confirm the `vector_with_pretrained` column was added.
data.head()

Unnamed: 0,text,label,tokens,vector_with_pretrained
0,I grew up (b. 1965) watching and loving the Th...,0,"[grew, watching, loving, thunderbirds, mates, ...","[0.060559157, 0.039513286, 0.015592435, 0.1114..."
1,"When I put this movie in my DVD player, and sa...",0,"[put, movie, dvd, player, sat, coke, chips, ex...","[0.06501582, 0.008160873, -0.008187361, 0.1270..."
2,Why do people who do not know what a particula...,0,"[people, not, know, particular, time, past, li...","[0.059786648, 0.033370122, 0.01162423, 0.12186..."
3,Even though I have great interest in Biblical ...,0,"[even, though, great, interest, biblical, movi...","[0.080684885, 0.021379359, -0.02830236, 0.1350..."
4,Im a die hard Dads Army fan and nothing will e...,1,"[die, hard, dads, army, fan, nothing, ever, ch...","[0.034482926, 0.054618753, 0.008420254, 0.0965..."


# Fine-tuned (Word2Vec trained on this dataset)

In [12]:
from gensim.models import Word2Vec

In [13]:
# Convert the Series of token lists into a plain Python list of token lists,
# which is then passed to Word2Vec as the training corpus.
final_text = tokens.to_list()

In [14]:
# Train a Word2Vec model on this dataset's token lists.
# NOTE: this rebinds `model`, so the pretrained KeyedVectors loaded earlier are
# no longer reachable under that name; from here on, word vectors are read
# through `model.wv[...]`.
model = Word2Vec(sentences=final_text, window=3, min_count=2)

In [15]:
# Sanity check: embedding of the same common word, now read from the
# Word2Vec model trained on this dataset (accessed through `.wv`).
model.wv['good']

array([-0.29020602,  0.871615  , -0.8307397 , -1.4987825 , -1.2072297 ,
       -0.2171255 ,  0.7275151 ,  0.6103388 , -0.36895272, -3.1575181 ,
       -0.8515453 ,  0.17948192,  0.37025884,  0.3676429 , -0.5695031 ,
       -0.5489261 ,  1.1414595 ,  0.8080062 , -0.3111966 , -0.5921156 ,
        0.18192005,  0.29498798,  1.0951381 ,  0.03203284, -1.4383956 ,
        1.4150347 , -0.8027472 , -0.49348432,  1.4634529 , -0.09176354,
       -0.31670898, -1.155005  ,  0.19692296, -1.8370125 , -0.21931294,
        2.0189657 ,  1.1337094 ,  0.03166764, -0.02459771, -1.1557428 ,
        0.81724685, -0.612084  , -0.5001596 ,  1.5041897 , -0.5847036 ,
       -0.05582507,  0.7128732 ,  0.9045984 , -0.22230066, -1.5351458 ,
       -0.3281418 , -2.2958775 ,  0.29740545,  0.26455325, -0.26248088,
        0.8546346 ,  1.0754222 , -0.35884762, -0.12055074, -1.7066541 ,
        0.20607725, -1.2332307 ,  2.9310706 , -0.04408836,  0.7301387 ,
        1.3985496 , -0.8054754 ,  0.74575025, -1.7820895 , -0.34

In [16]:
# Build one fixed-length vector per document by averaging the embeddings of
# the document's tokens from the Word2Vec model trained on this dataset.
#
# vector_with_finetunned      : one vector per document, in row order.
# fine_tunned_key_error_words : tokens missing from the trained vocabulary,
#                               in encounter order (duplicates are kept).
vector_with_finetunned = []
fine_tunned_key_error_words = []
# Iterate over the Series values directly rather than `tokens[t]` for
# t in range(len(tokens)): the latter is a label lookup and only works
# when the index happens to be the default 0..n-1 range.
for doc_tokens in tokens:
    vector_list = []
    for word in doc_tokens:
        try:
            vector_list.append(model.wv[word])
        except KeyError:
            # Word is not in the trained vocabulary; record it and skip it.
            fine_tunned_key_error_words.append(word)
    if vector_list:
        final_token = np.mean(vector_list, axis=0)
    else:
        # No token of this document is in the vocabulary (or the document is
        # empty). np.mean([]) would return a scalar NaN with a RuntimeWarning,
        # leaving this row with a different shape from all the others, so use
        # a zero vector of the embedding size instead.
        final_token = np.zeros(model.wv.vector_size, dtype=np.float32)
    vector_with_finetunned.append(final_token)

In [17]:
# Show a sample of the tokens that the dataset-trained vocabulary did not contain.
print(fine_tunned_key_error_words[:10])

['jonatha', 'subsp', 'videoshop', 'sniggered', 'moonlanding', 'ipso', 'audiobooks', 'microsystem', 'okazaki', 'quaalude']


In [18]:
# Attach the per-document averaged vectors from the dataset-trained model as a new column.
data['vector_with_finetunned'] = vector_with_finetunned

In [19]:
# Preview the frame to compare the two vector columns side by side.
data.head()

Unnamed: 0,text,label,tokens,vector_with_pretrained,vector_with_finetunned
0,I grew up (b. 1965) watching and loving the Th...,0,"[grew, watching, loving, thunderbirds, mates, ...","[0.060559157, 0.039513286, 0.015592435, 0.1114...","[-0.27686617, 0.22647858, -0.111960195, 0.0603..."
1,"When I put this movie in my DVD player, and sa...",0,"[put, movie, dvd, player, sat, coke, chips, ex...","[0.06501582, 0.008160873, -0.008187361, 0.1270...","[-0.5549633, 0.34935123, -0.30282107, 0.162067..."
2,Why do people who do not know what a particula...,0,"[people, not, know, particular, time, past, li...","[0.059786648, 0.033370122, 0.01162423, 0.12186...","[-0.5822421, 0.33170575, -0.072056666, 0.05991..."
3,Even though I have great interest in Biblical ...,0,"[even, though, great, interest, biblical, movi...","[0.080684885, 0.021379359, -0.02830236, 0.1350...","[-0.5075, 0.5159911, -0.12316914, 0.03261719, ..."
4,Im a die hard Dads Army fan and nothing will e...,1,"[die, hard, dads, army, fan, nothing, ever, ch...","[0.034482926, 0.054618753, 0.008420254, 0.0965...","[-0.46815112, 0.24544959, -0.24271873, -0.0865..."
