# Python Text Analysis: Word Embeddings Solutions

In [None]:
import numpy as np
import gensim
import gensim.downloader as api
import pandas as pd
import re

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

In [None]:
# Pretrained Google News word2vec vectors, fetched through gensim's downloader.
# NOTE(review): presumably a large one-time download that is cached locally afterwards — confirm.
wv = api.load(name='word2vec-google-news-300')

## Challenge 1

Look up the `doesnt_match` function in `gensim`'s documentation. Use this function to identify which word doesn't match in the following group:

banana, apple, strawberry, happy

Then, try it on groups of words that you choose. Here are some suggestions:

1. A group of fruits, and a vegetable. Can it identify that the vegetable doesn't match?
2. A group of vehicles that travel by land, and a vehicle that travels by air (e.g., a plane or helicopter). Can it identify the vehicle that flies?
3. A group of scientists (e.g., biologist, physicist, chemist, etc.) and a person who does not study an empirical science (e.g., an artist). Can it identify the occupation that is not science based?

To be clear, `word2vec` does not learn the precise nature of the differences between these groups. However, words with similar meanings tend to appear in similar contexts in large corpora, and the embeddings capture these semantic differences through those co-occurrence patterns.

In [None]:
# Odd one out: three fruits and one emotion word.
wv.doesnt_match([
    'banana', 'apple', 'strawberry',
    'happy',
])

In [None]:
# Odd one out: three fruits and one vegetable.
wv.doesnt_match([
    'banana', 'apple', 'strawberry',
    'carrot',
])

In [None]:
# Odd one out: three land vehicles and one aircraft.
wv.doesnt_match([
    'car', 'bike', 'bus',
    'plane',
])

In [None]:
# Odd one out: three scientists and one non-science occupation.
wv.doesnt_match([
    'biologist', 'physicist', 'chemist',
    'artist',
])

## Challenge 2

Carry out the following word analogies:

1. Mouse : Mice :: Goose : ?
2. Kangaroo : Joey :: Cat : ?
3. United States : Dollar :: Mexico : ?
4. Happy : Sad :: Up : ?
5. California : Sacramento :: Canada : ?
6. California : Sacramento :: Washington : ?

What about something more abstract, such as:

7. United States : hamburger :: Canada : ?

Some work well, and others don't work as well. Try to come up with your own analogies!

In [None]:
# mouse : mice :: goose : ?
wv.most_similar(
    positive=['mice', 'goose'],
    negative=['mouse'],
)

In [None]:
# kangaroo : joey :: cat : ?
wv.most_similar(
    positive=['joey', 'cat'],
    negative=['kangaroo'],
)

In [None]:
# United States : dollar :: Mexico : ?
# Multi-word entries in this vocabulary are joined with underscores.
wv.most_similar(
    positive=['Dollar', 'Mexico'],
    negative=['United_States'],
)

In [None]:
# happy : sad :: up : ?
wv.most_similar(
    positive=['sad', 'up'],
    negative=['happy'],
)

In [None]:
# California : Sacramento :: Canada : ?
wv.most_similar(
    positive=['Sacramento', 'Canada'],
    negative=['California'],
)

In [None]:
# California : Sacramento :: Washington : ?
wv.most_similar(
    positive=['Sacramento', 'Washington'],
    negative=['California'],
)

In [None]:
# United States : hamburger :: Canada : ?
wv.most_similar(
    positive=['hamburger', 'Canada'],
    negative=['United_States'],
)

## Challenge 3

Try experimenting with different vector sizes, window sizes, and other parameters available in the `Word2Vec` module. Additionally, try training using skip-grams rather than CBOW.

In [None]:
# Load the airline tweets dataset; the path is relative to this notebook.
# (A stray closing parenthesis after the path literal previously made this
# cell a SyntaxError.)
tweets_path = '../data/airline_tweets.csv'
tweets = pd.read_csv(tweets_path, sep=',')

In [None]:
def preprocess(text):
    """Normalize a raw tweet into lowercase text with placeholder tokens.

    The steps are applied in this order: lowercase the text, then replace
    URLs, digit runs, hashtags and user mentions with the upper-case
    placeholders ``URL``, ``DIGIT``, ``HASHTAG`` and ``USER``, and finally
    collapse all whitespace runs into single spaces.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text, stripped of leading and trailing whitespace.
    """
    # Lowercase first so the upper-case placeholders inserted below are not
    # lowercased themselves.
    text = text.lower()
    # Replace URLs. Only match up to the next whitespace so that words
    # following a link are kept (a greedy `.*` would swallow the rest of
    # the line).
    url_pattern = r'https?://\S+'
    url_repl = ' URL '
    text = re.sub(url_pattern, url_repl, text)
    # Replace digits (raw string: `\d` is not a valid str escape sequence)
    digit_pattern = r'\d+'
    digit_repl = ' DIGIT '
    text = re.sub(digit_pattern, digit_repl, text)
    # Replace hashtags (ASCII or full-width hash sign at the start of the
    # text or after whitespace)
    hashtag_pattern = r'(?:^|\s)[＃#]{1}(\w+)'
    hashtag_repl = ' HASHTAG '
    text = re.sub(hashtag_pattern, hashtag_repl, text)
    # Replace users
    user_pattern = r'@(\w+)'
    user_repl = ' USER '
    text = re.sub(user_pattern, user_repl, text)
    # Collapse runs of whitespace (including those introduced by the padded
    # placeholders) and trim the ends
    blankspace_pattern = r'\s+'
    blankspace_repl = ' '
    text = re.sub(blankspace_pattern, blankspace_repl, text).strip()
    return text

In [None]:
# Clean every tweet and store the result next to the raw text.
tweets['text_processed'] = tweets['text'].apply(preprocess)
# Preview the first few cleaned tweets.
tweets['text_processed'].head()

In [None]:
# One token list per cleaned tweet: the input format Word2Vec trains on.
sentences = list(map(word_tokenize, tweets['text_processed']))

In [None]:
# Train 50-dimensional embeddings on the tokenized tweets.
# sg=1 selects skip-gram instead of the default CBOW; tokens seen fewer than
# min_count times are dropped from the vocabulary.
model = Word2Vec(sentences, vector_size=50, window=5, min_count=2, sg=1)

In [None]:
# Nearest neighbours of a negative-sentiment word in the tweet embedding space.
model.wv.most_similar(positive=['worst'])

In [None]:
# Nearest neighbours of a positive-sentiment word in the tweet embedding space.
model.wv.most_similar(positive=['great'])

## Challenge 4

Write a function that performs the full pipeline of building a `word2vec` model and constructing a design matrix. Use this function to see whether you can improve the performance of the classifier by changing other parameters (vector sizes, window sizes, etc.).

In [None]:
# Keep only tweets whose label is not 'neutral'; their labels are the target.
tweets_binary = tweets.loc[tweets['airline_sentiment'].ne('neutral')]
y = tweets_binary['airline_sentiment']

In [None]:
def featurizer(documents, to_train, vector_size=50, window=6, sg=0, min_count=1):
    """Computes a feature matrix from a document corpus.

    A word2vec model is trained on ``documents``; each document in
    ``to_train`` is then represented by the average of the word vectors of
    its tokens.

    Parameters
    ----------
    documents : iterable of str
        Corpus used to train the word2vec model.
    to_train : sequence of str
        Documents to convert into feature vectors (must support ``len``).
    vector_size : int
        Dimensionality of the word vectors, and of each feature row.
    window : int
        Context window size passed to ``Word2Vec``.
    sg : int
        Passed to ``Word2Vec``: 0 trains CBOW, 1 trains skip-gram.
    min_count : int
        Passed to ``Word2Vec``: tokens occurring fewer times than this are
        left out of the vocabulary.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(to_train), vector_size)``. A document with no
        in-vocabulary tokens (including an empty document) is represented
        by an all-zero row.
    """
    sentences = [word_tokenize(doc) for doc in documents]
    # Train word2vec
    model = Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg)

    X = np.zeros((len(to_train), vector_size))
    # Enumerate over tweets
    for idx, doc in enumerate(to_train):
        # Tokenize the current tweet, keeping only tokens the model knows.
        # Unknown tokens occur when min_count > 1 or when `to_train` is not
        # drawn from `documents`; looking them up would raise a KeyError.
        tokens = [token for token in word_tokenize(doc) if token in model.wv]
        # Nothing to average: leave the zero row instead of dividing 0 by 0,
        # which would fill the row with NaN.
        if not tokens:
            continue
        # Enumerate over tokens, accumulating word vectors
        for token in tokens:
            X[idx] += model.wv.get_vector(token)
        # Take the average
        X[idx] /= len(tokens)
    return X

In [None]:
# Train embeddings on all tweets, but build features only for the
# non-neutral tweets that have a binary sentiment label.
X = featurizer(
    documents=tweets['text_processed'],
    to_train=tweets_binary['text_processed'],
    vector_size=80,
    window=6,
)

In [None]:
# Hold out 20% of the labelled tweets for testing. The shuffle seed is fixed
# so the split, and therefore the reported accuracies, are the same on every
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
def fit_logistic_regression(X, y):
    """Train a cross-validated, L2-penalized logistic regression.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target labels, one per row of ``X``.

    Returns
    -------
    The value returned by ``LogisticRegressionCV.fit``, i.e. the fitted
    estimator.
    """
    settings = {
        'Cs': 5,
        'penalty': 'l2',
        'max_iter': 1000,
        'tol': 1e-2,
        'cv': 3,
        'refit': True,
    }
    classifier = LogisticRegressionCV(**settings)
    return classifier.fit(X, y)

In [None]:
# Train the classifier on the training split only; the test split stays unseen.
fitter = fit_logistic_regression(X=X_train, y=y_train)

In [None]:
# Report the score on both splits, one per line.
print(
    f"Training accuracy: {fitter.score(X_train, y_train)}",
    f"Test accuracy: {fitter.score(X_test, y_test)}",
    sep="\n",
)