In [1]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from urllib.request import urlretrieve
import zipfile
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# GloVe

In [2]:
glove_dict = dict()

data_path = "../data/"
f = open(data_path + "glove.6B.100d.txt", "r", encoding="utf-8")

for line in f:
    chunk = line.split()
    word = chunk[0]
    vector = np.array(chunk[1:], dtype=np.float32)
    glove_dict[word] = vector
    
print("Glove dictionary size: ", len(glove_dict))

Glove dictionary size:  400000


In [3]:
df = pd.read_csv(data_path + "tennis_articles_v4.csv")
print(len(df))
print(df.head(5))

8
   article_id                                       article_text  \
0           1  Maria Sharapova has basically no friends as te...   
1           2  BASEL, Switzerland (AP), Roger Federer advance...   
2           3  Roger Federer has revealed that organisers of ...   
3           4  Kei Nishikori will try to end his long losing ...   
4           5  Federer, 37, first broke through on tour over ...   

                                              source  
0  https://www.tennisworldusa.org/tennis/news/Mar...  
1  http://www.tennis.com/pro-game/2018/10/copil-s...  
2  https://scroll.in/field/899938/tennis-roger-fe...  
3  http://www.tennis.com/pro-game/2018/10/nishiko...  
4  https://www.express.co.uk/sport/tennis/1036101...  


# Data Preprocessing

In [4]:
data = df[['article_text']]
data['sentences'] = data['article_text'].apply(sent_tokenize)
print(data.head(5))

                                        article_text  \
0  Maria Sharapova has basically no friends as te...   
1  BASEL, Switzerland (AP), Roger Federer advance...   
2  Roger Federer has revealed that organisers of ...   
3  Kei Nishikori will try to end his long losing ...   
4  Federer, 37, first broke through on tour over ...   

                                           sentences  
0  [Maria Sharapova has basically no friends as t...  
1  [BASEL, Switzerland (AP), Roger Federer advanc...  
2  [Roger Federer has revealed that organisers of...  
3  [Kei Nishikori will try to end his long losing...  
4  [Federer, 37, first broke through on tour over...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentences'] = data['article_text'].apply(sent_tokenize)


In [5]:
stop_words = stopwords.words('english')

def tokenize(sentence):
    return [word_tokenize(sentence) for sentence in sentence]

def preprocess_sentence(sentence):
    sentence_processed = [re.sub(r'[^a-zA-z\s]', '', word.lower()) for word in sentence]
    return [word for word in sentence_processed if word not in stop_words and word != '']

def preprocess_sentences(sentences):
    return [preprocess_sentence(sentence) for sentence in sentences]

In [6]:
data['tokenized_sentences'] = data['sentences'].apply(tokenize)
print(data.head(5))
data['tokenized_sentences'] = data['tokenized_sentences'].apply(preprocess_sentences)
print(data.head(5))

                                        article_text  \
0  Maria Sharapova has basically no friends as te...   
1  BASEL, Switzerland (AP), Roger Federer advance...   
2  Roger Federer has revealed that organisers of ...   
3  Kei Nishikori will try to end his long losing ...   
4  Federer, 37, first broke through on tour over ...   

                                           sentences  \
0  [Maria Sharapova has basically no friends as t...   
1  [BASEL, Switzerland (AP), Roger Federer advanc...   
2  [Roger Federer has revealed that organisers of...   
3  [Kei Nishikori will try to end his long losing...   
4  [Federer, 37, first broke through on tour over...   

                                 tokenized_sentences  
0  [[Maria, Sharapova, has, basically, no, friend...  
1  [[BASEL, ,, Switzerland, (, AP, ), ,, Roger, F...  
2  [[Roger, Federer, has, revealed, that, organis...  
3  [[Kei, Nishikori, will, try, to, end, his, lon...  
4  [[Federer, ,, 37, ,, first, broke, through, on..

# Sentence Embedding

In [7]:
# sentence embedding using GloVe
# calculate the average of all word vectors in a sentence
embedding_dims = 100
zero_vector = np.zeros(embedding_dims) # zero vector for out of vocabulary words (OOV)

def calculate_sentence_vector(sentence):
    if len(sentence) != 0:
        result = sum([glove_dict.get(word, zero_vector) for word in sentence]) / len(sentence)
        return result
    else:
        return zero_vector
    
def calculate_sentence_vectors(sentences):
    return [calculate_sentence_vector(sentence) for sentence in sentences]

### dict.get(key, default_value)
 - return the value for key if key is in the dictionary, else default. If default is not given, it defaults to None, so that this method never raises a KeyError.

In [9]:
data['sentence_embedding'] = data['tokenized_sentences'].apply(calculate_sentence_vectors)
data.head(5)

Unnamed: 0,article_text,sentences,tokenized_sentences,sentence_embedding
0,Maria Sharapova has basically no friends as te...,[Maria Sharapova has basically no friends as t...,"[[maria, sharapova, basically, friends, tennis...","[[0.051488996, 0.1105585, 0.6950863, 0.1891917..."
1,"BASEL, Switzerland (AP), Roger Federer advance...","[BASEL, Switzerland (AP), Roger Federer advanc...","[[basel, switzerland, ap, roger, federer, adva...","[[0.10566062456928194, -0.10534465219825506, 0..."
2,Roger Federer has revealed that organisers of ...,[Roger Federer has revealed that organisers of...,"[[roger, federer, revealed, organisers, relaun...","[[-0.022272188, -0.0474477, 0.14933074, -0.086..."
3,Kei Nishikori will try to end his long losing ...,[Kei Nishikori will try to end his long losing...,"[[kei, nishikori, try, end, long, losing, stre...","[[0.045201838, -0.064647146, 0.5035717, -0.160..."
4,"Federer, 37, first broke through on tour over ...","[Federer, 37, first broke through on tour over...","[[federer, first, broke, tour, two, decades, a...","[[0.21536233, 0.180915, 0.25600883, 0.06924241..."
