In [2]:
import re, nltk, gensim
import pandas as pd
from nltk.corpus import stopwords
from gensim.models import Word2Vec

In [3]:
# Load the airline-tweet sentiment CSV (path is relative to the notebook's working directory).
# NOTE(review): 'unicode_escape' is presumably a workaround for bytes that are not valid
# UTF-8 in this file — confirm the file's real encoding.
df = pd.read_csv(
    "data/airline-sentiment.csv",
    encoding="unicode_escape",
)

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/15 5:24,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/15 11:35,5.70306e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/15 1:53,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2/25/15 10:01,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/15 11:15,5.70301e+17,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2/25/15 3:05,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2/25/15 5:50,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/15 11:14,5.70301e+17,,Pacific Time (US & Canada)


Preprocess the text to do the following:

- Normalize every word to lower case.

- Remove punctuation, retaining only digits and letters.

- Remove stop words.

In [5]:
# English stop words as a set, so the per-token membership test in preprocess()
# is a constant-time lookup.
try:
    stop = set(stopwords.words('english'))
except LookupError:
    # The stop-word corpus is not shipped with nltk itself; fetch it on first use
    # so the notebook also runs on a fresh environment.
    nltk.download('stopwords')
    stop = set(stopwords.words('english'))

def preprocess(text, stop_words=None):
    """Normalize raw text into a space-separated string of kept tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace every run of characters other than ``0-9`` / ``a-z`` with a
         single space (drops punctuation, symbols and non-ASCII letters);
      3. split on whitespace and discard stop words;
      4. re-join the surviving tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Missing values (``None`` or a float ``NaN``) yield ``''``
        instead of raising; any other non-string is converted with ``str()``.
    stop_words : collection of str, optional
        Tokens to drop. Defaults to the notebook-level ``stop`` set.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing survives.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    if stop_words is None:
        stop_words = stop

    cleaned = re.sub('[^0-9a-z]+', ' ', text.lower())
    # split() with no argument splits on any whitespace run, giving one entry per token.
    tokens = [token for token in cleaned.split() if token not in stop_words]
    return ' '.join(tokens)

In [10]:
# Clean every tweet: the 'text' column is overwritten with its preprocessed form,
# so the raw tweet text is no longer available from df after this cell.
df['text'] = df['text'].apply(preprocess)

In [11]:
# Display the cleaned text as a one-column DataFrame (double brackets) to inspect the result.
df[["text"]]

Unnamed: 0,text
0,virginamerica dhepburn said
1,virginamerica plus added commercials experienc...
2,virginamerica today must mean need take anothe...
3,virginamerica really aggressive blast obnoxiou...
4,virginamerica really big bad thing
...,...
14635,americanair thank got different flight chicago
14636,americanair leaving 20 minutes late flight war...
14637,americanair please bring american airlines bla...
14638,americanair money change flight answer phones ...


In [12]:
# Keep only the sentiment label and the cleaned text; df is rebound, so the
# other columns are no longer reachable through it.
df = df[["airline_sentiment", "text"]]

In [15]:
# Check the reduced frame: sentiment label plus cleaned text.
df.head()

Unnamed: 0,airline_sentiment,text
0,neutral,virginamerica dhepburn said
1,positive,virginamerica plus added commercials experienc...
2,neutral,virginamerica today must mean need take anothe...
3,negative,virginamerica really aggressive blast obnoxiou...
4,negative,virginamerica really big bad thing


Create a list of tokenized tweets (one list of words per tweet), as in the TF-IDF exercise.

In [16]:
# Tokenize each cleaned tweet on whitespace: one list of words per tweet.
# Iterating over the column's values (instead of looking rows up by label with
# df['text'][i]) stays correct even when the index is not 0..n-1.
wordlist = [tweet.split() for tweet in df['text']]

In [18]:
# Show the first five token lists as a sanity check.
wordlist[:5]

[['virginamerica', 'dhepburn', 'said'],
 ['virginamerica', 'plus', 'added', 'commercials', 'experience', 'tacky'],
 ['virginamerica', 'today', 'must', 'mean', 'need', 'take', 'another', 'trip'],
 ['virginamerica',
  'really',
  'aggressive',
  'blast',
  'obnoxious',
  'entertainment',
  'guests',
  'faces',
  'amp',
  'little',
  'recourse'],
 ['virginamerica', 'really', 'big', 'bad', 'thing']]

### Training Time

Build the Word2Vec model. Define the vector size, the size of the context window, and the minimum number of occurrences a word needs in order to get a word vector.
- `size` is the dimensionality of the word vectors.
- `window` is the context size, i.e. how many neighbouring words are considered.
- `min_count` is the minimum frequency a word must have to be included in the vocabulary.
- `sg` selects the training algorithm: skip-gram when `sg=1`, CBOW when `sg=0`.
- `alpha` is the learning rate (which we'll discuss next week on neural nets proper).

```
Other papers did not report an experiment on embedding dimension size. They are all using an arbitrary dimension on the order of hundreds (100 and 300 are used more frequently). The lack of experiments for embedding size implies that the performance is not very sensitive to this parameter and only the order of magnitude matters, and also other aspects of the model architecture are more important to investigate.
```

In [19]:
# Configure the model only; no corpus is passed here, so the vocabulary is built
# and the training is run in the cells that follow.
model = Word2Vec(
    size=100,      # dimensionality of each word vector
    window=5,      # context size considered around each word
    min_count=30,  # minimum frequency for a word to get a vector
    sg=0,          # 0 = CBOW, 1 = skip-gram
    alpha=0.025,   # learning rate
)

In [20]:
# Scan the tokenized tweets to build the model's vocabulary.
model.build_vocab(wordlist)
# Display corpus_count; the training cell passes this value as total_examples.
model.corpus_count

14640

In [21]:
# List the words that made it into the vocabulary.
# NOTE(review): this prints the whole vocabulary. `wv.vocab` also looks like gensim 3.x
# API (gensim 4 exposes `wv.key_to_index` instead) — confirm before upgrading gensim.
model.wv.vocab.keys()

dict_keys(['virginamerica', 'said', 'plus', 'experience', 'today', 'must', 'mean', 'need', 'take', 'another', 'trip', 'really', 'amp', 'little', 'big', 'bad', 'thing', 'seriously', 'would', 'pay', '30', 'flight', 'seats', 'flying', 'yes', 'every', 'time', 'fly', 'go', 'away', 'missed', 'without', 'https', 'co', 'well', 'amazing', 'arrived', 'hour', 'early', 'good', 'know', 'second', 'cause', '10', '24', 'lt', '3', 'pretty', 'much', 'better', 'great', 'deal', 'already', '2nd', 'even', '1st', 'yet', 'u', 'travel', 'http', 'thanks', 'sfo', 'schedule', 'still', 'mia', 'first', 'country', 'lax', 'mco', 'heard', 'nothing', 'things', 'virgin', 'flew', 'nyc', 'last', 'week', 'sit', 'seat', 'due', 'two', 'either', 'help', 'awesome', 'bos', 'fll', 'please', 'want', 'may', 'three', 'times', 'available', 'love', 'feel', 'making', 'gt', 'las', 'non', 'stop', 'soon', 'guys', 'friends', 'gave', 'free', 'status', 'weeks', 'called', 'response', 'happened', '2', 'ur', 'food', 'options', 'least', 'say', 

In [22]:
# Train for 100 epochs over the tokenized tweets; total_examples is the
# corpus_count displayed after build_vocab().
model.train(wordlist, total_examples=model.corpus_count, epochs=100)

(8553951, 15367300)

In [23]:
# Look up the learned vector for 'month' through the model's KeyedVectors (model.wv);
# indexing the Word2Vec object directly is deprecated and emits a DeprecationWarning.
model.wv['month']

  """Entry point for launching an IPython kernel.


array([ 0.48231843,  1.1854167 ,  1.3057796 ,  2.704182  ,  0.37052798,
        2.0038793 ,  0.9172721 ,  0.6268933 ,  0.84013146,  0.74631107,
        1.8647486 , -2.1524239 ,  2.1252522 , -1.7364016 ,  3.2661128 ,
       -0.24306428,  0.885861  , -0.02458498, -0.01559705,  1.1451176 ,
       -2.7282014 ,  2.8703492 , -0.02892786,  1.9754671 , -1.2780055 ,
       -1.8831369 ,  0.34850353, -0.10792898,  1.4820467 , -1.0494508 ,
        1.0482062 , -0.69732994, -1.3389028 ,  1.0235748 ,  0.63351935,
       -1.0640191 , -1.2512885 ,  0.43683174, -1.0858021 ,  0.90441006,
       -0.5989638 , -2.4551308 , -1.4814669 , -1.1503303 , -0.83055186,
        2.402597  , -0.7774185 ,  1.0959852 ,  0.2802829 ,  1.377494  ,
        0.61431944,  0.29719478, -0.45385975, -0.37440434, -1.2918323 ,
       -3.6284623 ,  3.436852  ,  0.5862328 ,  0.06993027,  1.768561  ,
       -1.1991277 ,  0.97763187, -0.8115031 ,  0.81149924,  2.271642  ,
        0.16454464,  0.20165619,  1.1812941 ,  1.6315712 ,  1.66

In [35]:
# Look up the learned vector for 'year' through the model's KeyedVectors (model.wv);
# indexing the Word2Vec object directly is deprecated and emits a DeprecationWarning.
model.wv['year']

  """Entry point for launching an IPython kernel.


array([ 0.7351747 ,  1.1413084 ,  0.5122892 , -0.99221253, -1.931554  ,
        3.7524939 ,  0.7697424 , -0.53768146,  0.5430797 ,  0.7762832 ,
        4.0712166 , -0.20129006,  2.2683547 , -1.0918876 ,  1.3096377 ,
       -0.10507166,  1.6186359 ,  0.8245387 ,  0.85729265, -1.2718667 ,
        1.0588065 ,  2.926049  , -2.3719544 ,  0.3972117 , -0.26262942,
       -0.54822576,  1.3490062 , -0.8417923 ,  0.27009058, -0.80906385,
        1.2567332 , -1.0184585 ,  0.31484777,  1.260187  , -1.7579784 ,
       -2.6931052 ,  1.2413775 , -1.5042392 , -1.7027707 ,  2.6120007 ,
       -0.95892274, -1.7700498 , -3.2436838 , -1.3693079 , -2.450128  ,
        3.741627  , -0.3456695 , -0.4773855 ,  0.07111122, -0.06782713,
        1.281503  ,  2.1346712 , -1.1422708 , -1.7912444 , -1.1381606 ,
       -2.446876  ,  0.74280083,  0.25676063,  0.54744977, -1.7652304 ,
       -3.750914  ,  1.7294987 , -0.27741864,  0.19899717,  4.037618  ,
        1.4246479 ,  0.58231133,  1.869954  ,  1.0336992 ,  0.33

In [34]:
# Cosine similarity between the two word vectors, queried on model.wv because
# the same method on the Word2Vec object is deprecated.
model.wv.similarity('month', 'year')

  """Entry point for launching an IPython kernel.


0.51448256

In [33]:
# The 20 vocabulary words closest to 'month', queried on model.wv because
# the same method on the Word2Vec object is deprecated.
model.wv.most_similar('month', topn=20)

  """Entry point for launching an IPython kernel.


[('year', 0.5144824981689453),
 ('week', 0.4374368190765381),
 ('months', 0.3897380828857422),
 ('weeks', 0.3710346221923828),
 ('trip', 0.3644384443759918),
 ('leg', 0.34222888946533203),
 ('night', 0.29254746437072754),
 ('child', 0.27689963579177856),
 ('years', 0.2697194218635559),
 ('days', 0.2673693895339966),
 ('awful', 0.257131963968277),
 ('bought', 0.25409895181655884),
 ('11', 0.2468951940536499),
 ('day', 0.24680420756340027),
 ('000', 0.2437877207994461),
 ('virgin', 0.24148401618003845),
 ('points', 0.24074023962020874),
 ('miles', 0.24056807160377502),
 ('minute', 0.23998117446899414),
 ('fun', 0.23998035490512848)]

In [39]:

# Second model for comparison: same size/window/min_count/sg, alpha left at its
# default, and trained for 200 epochs instead of 100.
model_uf = Word2Vec(size=100, window=5, min_count=30, sg=0)
model_uf.build_vocab(wordlist)
# Use this model's own corpus_count (not the first model's), so the cell does not
# depend on `model` having been built earlier.
model_uf.train(wordlist, total_examples=model_uf.corpus_count, epochs=200)
# Query through model_uf.wv; most_similar on the Word2Vec object itself is deprecated.
model_uf.wv.most_similar('month')

  after removing the cwd from sys.path.


[('year', 0.48174652457237244),
 ('week', 0.3972211480140686),
 ('months', 0.3078358769416809),
 ('leg', 0.2995736598968506),
 ('weeks', 0.29844123125076294),
 ('awful', 0.29741552472114563),
 ('night', 0.2811007499694824),
 ('trip', 0.28102344274520874),
 ('running', 0.2368464469909668),
 ('whole', 0.233427032828331)]