# Create vector representations of titles in dataset 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk, collections, re, string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


In [2]:
# Read the challenge dataset from the CSV file in the working directory.
csv_path = 'Eluvio_DS_Challenge.csv'
data = pd.read_csv(csv_path)

In [3]:
# Show the loaded DataFrame; the notebook renders a truncated preview of its rows.
data

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews
...,...,...,...,...,...,...,...,...
509231,1479816764,2016-11-22,5,0,Heil Trump : Donald Trump s alt-right white...,False,nonamenoglory,worldnews
509232,1479816772,2016-11-22,1,0,There are people speculating that this could b...,False,SummerRay,worldnews
509233,1479817056,2016-11-22,1,0,Professor receives Arab Researchers Award,False,AUSharjah,worldnews
509234,1479817157,2016-11-22,1,0,Nigel Farage attacks response to Trump ambassa...,False,smilyflower,worldnews


### Clean titles

In [4]:
# Character class matching digits, '+', every ASCII punctuation mark and a few typographic symbols.
# string.punctuation is escaped before being interpolated: unescaped, its "[\]" run is parsed
# as "[" followed by an escaped "]", which silently leaves the backslash out of the class.
punctuation_numbers_to_exclude = r"[\d+{}’‘“”…£]".format(re.escape(string.punctuation))

# Stopwords get the same stripping as the titles so that e.g. "don't" matches the cleaned token "dont".
stop_words = [re.sub(punctuation_numbers_to_exclude, "", stop) for stop in stopwords.words('english')]
# Set copy of stop_words for constant-time membership tests inside clean_text.
stop_word_set = frozenset(stop_words)
ss = SnowballStemmer('english')


def clean_text(text, remove_punctuation=True, remove_stopwords=True, stem=True):
    """Normalise a piece of text into a space-separated string of tokens.

    The text is lower-cased and split on whitespace; the optional steps are
    applied in the order: punctuation/digit removal, stopword removal, stemming.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_punctuation : bool, default True
        Delete every character matched by ``punctuation_numbers_to_exclude``
        (digits, punctuation and a few typographic symbols). Characters are
        deleted, not replaced by spaces, so "jump-start" becomes "jumpstart".
    remove_stopwords : bool, default True
        Drop tokens that appear in ``stop_words``.
    stem : bool, default True
        Replace each token by its Snowball (English) stem.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = text.lower()

    if remove_punctuation:
        # Remove punctuation and numbers.
        text = re.sub(punctuation_numbers_to_exclude, "", text)

    # Tokenise unconditionally. Previously the split only happened inside the stopword
    # branch, so with remove_stopwords=False the stemmer and the final join iterated
    # over single characters ("hello" -> "h e l l o").
    words = text.split()

    if remove_stopwords:
        words = [word for word in words if word not in stop_word_set]

    if stem:
        words = [ss.stem(word) for word in words]

    return ' '.join(words)


In [5]:
# Clean every title once, keeping full word forms (stemming switched off).
cleaned_titles_no_stemming = []
for raw_title in data['title'].values:
    cleaned_titles_no_stemming.append(clean_text(raw_title, stem=False))

In [6]:
# First ten raw titles, shown for comparison with their cleaned versions.
data['title'].values[:10]

array(['Scores killed in Pakistan clashes',
       'Japan resumes refuelling mission',
       'US presses Egypt on Gaza border',
       'Jump-start economy: Give health care to all ',
       'Council of Europe bashes EU&UN terror blacklist',
       'Hay presto! Farmer unveils the  illegal  mock-Tudor castle he tried to hide behind 40ft hay bales',
       'Strikes, Protests and Gridlock at the Poland-Ukraine Border',
       'The U.N. Mismanagement Program',
       'Nicolas Sarkozy threatens to sue Ryanair ',
       'US plans for missile shields in Polish town met with resistance [video]'],
      dtype=object)

In [7]:
# The first ten titles after cleaning (lower-cased, punctuation/digits and stopwords removed, no stemming).
cleaned_titles_no_stemming[:10]

['scores killed pakistan clashes',
 'japan resumes refuelling mission',
 'us presses egypt gaza border',
 'jumpstart economy give health care',
 'council europe bashes euun terror blacklist',
 'hay presto farmer unveils illegal mocktudor castle tried hide behind ft hay bales',
 'strikes protests gridlock polandukraine border',
 'un mismanagement program',
 'nicolas sarkozy threatens sue ryanair',
 'us plans missile shields polish town met resistance video']

## Use pretrained [GloVe](https://nlp.stanford.edu/projects/glove/) model to create vector embeddings for words in titles

In [8]:
# Load the pretrained GloVe model into a {word: vector} lookup.
# Each line of the file is a token followed by its space-separated vector components.

embeddings_dict = {}
dimensions = 50
# The encoding is given explicitly: without it open() falls back to the platform's
# locale encoding, which is not UTF-8 everywhere and can fail or garble non-ASCII tokens.
# NOTE(review): assumes the GloVe text file is UTF-8, as distributed — confirm for other sources.
with open("glove.6b/glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        word, *components = line.split()
        embeddings_dict[word] = np.asarray(components, "float32")

### Create array of vector embeddings from titles.

In [9]:
# Represent each title by the mean of its words' GloVe vectors.
# The unstemmed titles are used because word variants that share a stem often
# appear in the pretrained model with separate embeddings.
titles_vectors = []

for title in cleaned_titles_no_stemming:
    # Keep only the words the model knows; words missing from GloVe are skipped.
    title_words_vectors = [
        embeddings_dict[word]
        for word in title.lower().split()
        if word in embeddings_dict
    ]

    if title_words_vectors:
        # Element-wise average over all known words of the title.
        title_vector = np.mean(title_words_vectors, axis=0)
    else:
        # None of the title's words is in GloVe: store an all-NaN placeholder.
        title_vector = np.full(dimensions, np.nan)

    titles_vectors.append(title_vector)

In [10]:
# Embedding of the first title: the mean of its words' 50-dimensional GloVe vectors.
titles_vectors[0]

array([ 3.9545023e-01, -4.8771000e-01,  5.7768625e-01, -9.1156498e-02,
        1.8073300e-01,  1.7479993e-02, -2.5747500e-02,  5.7215750e-01,
        3.0598015e-02, -4.1651750e-01, -3.6815000e-01, -1.3932850e+00,
       -6.4465249e-01, -4.2249000e-01,  2.8884250e-01, -1.5836248e-01,
        8.9348249e-02, -1.4332525e-01, -8.2170749e-01,  5.5123746e-01,
       -2.1498752e-01,  9.0665251e-01,  6.1633497e-01,  3.8069525e-01,
       -6.0425989e-02, -1.3075874e+00, -6.3089997e-02, -4.4174001e-01,
       -9.9862486e-02,  3.5555500e-01,  2.6287999e+00,  2.5334752e-01,
       -3.7351996e-02, -1.0227509e-02,  6.0472500e-01,  6.7483252e-01,
        9.7777516e-02, -9.7192156e-01, -5.1433748e-01,  5.6634498e-01,
       -6.8620503e-01,  5.3959477e-01,  1.1125749e-01, -3.3677000e-01,
        8.0702752e-01,  1.8408252e-01, -5.1430500e-01,  4.4885755e-01,
        1.1374950e-03, -1.1982850e+00], dtype=float32)

## Simple check: find each title's most similar title by smallest distance in embedding space

In [11]:
from scipy.spatial.distance import pdist, squareform
import random

In [34]:
# Work on the first 10000 title vectors only; the square distance matrix grows quadratically.
test_sample = titles_vectors[:10000]
sample_matrix = np.array(test_sample)

# Despite its name, nan_mask is True for the rows to KEEP: titles with at least one
# finite component. Titles with no words in GloVe are all-NaN rows and are False.
nan_mask = np.isfinite(sample_matrix).any(axis=1)

# Condensed pairwise distances (pdist's default metric) between the kept rows.
distances = pdist(sample_matrix[nan_mask])

# Expand to a full square matrix.
distances_square = squareform(distances)

# A point is at distance 0 from itself; set the diagonal to infinity so that a
# title is never selected as its own nearest neighbour.
np.fill_diagonal(distances_square, np.inf)

# For every kept row, the position (within the kept rows) of its nearest neighbour.
closest_points = np.argmin(distances_square, axis=0)

In [37]:
# For each kept title (position within the NaN-masked sample), the position of its nearest neighbour.
closest_points

array([8489,  763,   55, ..., 8227, 1131, 4048])

In [73]:
def most_similar_title(index, sample_length=10000):
    """Print a title and the title closest to it in embedding space.

    Relies on the module-level ``data``, ``nan_mask`` and ``closest_points``.

    Parameters
    ----------
    index : int
        Position of the title within the NaN-masked sample, i.e. the same
        positions that ``closest_points`` is indexed by.
    sample_length : int, default 10000
        Number of leading titles taken from ``data``. These titles are indexed
        with ``nan_mask``, so the value has to match the length of that mask.

    Returns
    -------
    None
        The two titles are printed, followed by a separator line.
    """
    # Apply the mask once so title positions line up with closest_points.
    valid_titles = data['title'][:sample_length].values[nan_mask]

    print('- Title: {0} \n'.format(valid_titles[index]))

    nearest_title = valid_titles[closest_points[index]]

    print('- Closest Title: {0}\n'.format(nearest_title))
    print('--------------------------------\n')
    

In [75]:
# Draw the example positions from the titles that actually have a nearest neighbour.
# closest_points has one entry per title that survived the NaN mask, which can be fewer
# than the 10000 sampled titles, so sampling from range(10000) could index out of bounds.
# A dedicated seeded generator keeps the displayed examples identical across runs
# without touching the global random state.
example_rng = random.Random(0)
n_examples = min(10, len(closest_points))
for i in example_rng.sample(range(len(closest_points)), n_examples):
    most_similar_title(i)
    

- Title: Cruising for chicks, Saudi Arabian style 

- Closest Title: Hamburger Hill: frontline base in a sea of poppies British soldiers dare not clear

--------------------------------

- Title: Tibetan protesters beaten up in Nepal, 125 detained 

- Closest Title: Police Beat Tibetan Protesters in Nepal

--------------------------------

- Title: The Price of Faithlessness: Iran to Punish Apostasy with Death 

- Closest Title: Iranian woman escapes stoning to death for adultery - did social media sway the Iranians?

--------------------------------

- Title: Amazing picture of the moment a British teacher was attacked by a lion 

- Closest Title: Google rats on Indian expressionist: Big Brother is watching all of us

--------------------------------

- Title:  If he (Mr Bush) can say he has killed Saddam Hussein and captured bin Laden, he can claim to have left the world a safer place  

- Closest Title: If Bin Laden is Still Alive and Still a Threat, and if Bush is All About Winning

### The nearest-title check does pretty well for some titles and not great for others, so it is unclear how useful these embeddings will prove to be for predicting upvotes.