In [0]:
import numpy as np
import string

In [0]:
!pip install sklearn
from sklearn.model_selection import train_test_split



In [0]:
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-03-01 13:50:36--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.238.181
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.238.181|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [0]:
!pip install gensim
from gensim.models import KeyedVectors



In [0]:
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz' # from above
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
word2vec["cat"].shape

(300,)

In [0]:
def cos(x1, x2):
  return np.dot(x1, x2)/(np.linalg.norm(x1)*np.linalg.norm(x2))

# Word Similarity

In [0]:
print(cos(word2vec["happy"],word2vec["sad"]))
print(cos(word2vec["male"],word2vec["female"]))
print(cos(word2vec["charisma"],word2vec["beauty"]))
print(cos(word2vec["black"],word2vec["color"]))
print(cos(word2vec["enigma"],word2vec["enigmatic"]))



0.5354614
0.8405335
0.28950197
0.44559646
0.6158348


# Word Analogy


In [0]:
temp = word2vec["brother"]-word2vec["male"]+word2vec["female"]
cos(temp,word2vec["sister"])

0.70223486

In [0]:
temp = word2vec["Delhi"]-word2vec["India"]+word2vec["Germany"]
cos(temp,word2vec["Berlin"])

0.72111446

In [0]:
temp = word2vec["king"]-word2vec["male"]+word2vec["female"]
cos(temp,word2vec["queen"])

0.66696125

In [0]:
temp = word2vec["orange"]-word2vec["red"]+word2vec["blue"]
cos(temp,word2vec["yellow"])

0.45128784

In [0]:
temp = word2vec["airhostess"]-word2vec["female"]+word2vec["male"]
cos(temp,word2vec["steward"])

0.2178484

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Imports and Data loading

In [0]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
data = pd.read_csv('/content/drive/My Drive/NLP/Tweets.csv')

In [0]:
label = data['airline_sentiment'].values
text  = data['text'].values

In [0]:
for i in range(len(label)):
  if label[i]=='neutral':
    label[i]=0
  if label[i]=='positive':
    label[i]=1
  if label[i]=='negative':
    label[i]=-1

# Data Preprocessing

In [0]:
def preprocess_tweet(text):

    # Check characters to see if they are in punctuation
    nopunc = re.sub('\'ve',' ',text)                            # expanding the contractions
    nopunc = re.sub('\'nt',' ',nopunc)
    nopunc = re.sub('\'d',' ',nopunc)                             # expanding the contractions
    nopunc = re.sub(' \d+',' ',nopunc)
    nopunc = [char for char in nopunc if char not in string.punctuation]
    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    # convert text to lower-case
    nopunc = nopunc.lower()
    # remove URLs
    nopunc = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', nopunc)
    nopunc = re.sub(r'http\S+', '', nopunc)
    # remove usernames
    nopunc = re.sub('@[^\s]+', '', nopunc)
    # remove the # in #hashtag
    
    nopunc = re.sub(r'#([^\s]+)', r'\1', nopunc)
     
       
    # remove repeated characters
    nopunc = word_tokenize(nopunc)
    # remove stopwords from final word list
    nopunc =  [word for word in nopunc if word not in stopwords.words('english')]
    nopunc = [word for word in nopunc if word in word2vec]
    return(nopunc)

In [0]:
processed_text = []
for tweet in text:
  processed_text.append(preprocess_tweet(tweet))

# Embedding for a tweet

In [0]:
avg_words = []
for tweet in processed_text:
  avg = np.zeros([300])
  for word in tweet:
    avg = avg+word2vec[word]
  avg = avg/300
  avg_words.append(avg)



# Train and Test using Logistic Regression

In [0]:
x_train, x_test, y_train, y_test = train_test_split(avg_words, label, test_size=0.2, random_state=42)

In [0]:
from sklearn.linear_model import LogisticRegression
y_train=y_train.astype('int')
clf = LogisticRegression(random_state=0).fit(x_train, y_train)

In [0]:
y_test=y_test.astype('int')
clf.score(x_test,y_test)

0.6451502732240437

# Word2Vec Embedding using Gensim

In [0]:
tweets = [tweet for tweet in processed_text]

In [0]:
print(tweets)



In [0]:
from gensim import models
model = models.Word2Vec(tweets,size=300,min_count=1) 

In [0]:
len((list(model.wv.vocab)))

8763

In [0]:
avg_words = []
for tweet in processed_text:
  avg = np.zeros([300])
  for word in tweet:
    avg = avg+model[word]
  avg = avg/300
  avg_words.append(avg)

  """


# Train and Test on the Gensim Embeddings

In [0]:
x_train, x_test, y_train, y_test = train_test_split(avg_words, label, test_size=0.2, random_state=42)

In [0]:
from sklearn.linear_model import LogisticRegression
y_train=y_train.astype('int')
clf = LogisticRegression(random_state=0).fit(x_train, y_train)

In [0]:
y_test=y_test.astype('int')
clf.score(x_test,y_test)

0.6557377049180327