In [1]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from urllib.request import urlretrieve
import zipfile
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# GloVe

In [2]:
glove_dict = dict()

data_path = "../data/"
f = open(data_path + "glove.6B.100d.txt", "r", encoding="utf-8")

for line in f:
    chunk = line.split()
    word = chunk[0]
    vector = np.array(chunk[1:], dtype=np.float32)
    glove_dict[word] = vector
    
print("Glove dictionary size: ", len(glove_dict))

Glove dictionary size:  400000


In [3]:
df = pd.read_csv(data_path + "tennis_articles_v4.csv")
print(len(df))
print(df.head(5))

8
   article_id                                       article_text  \
0           1  Maria Sharapova has basically no friends as te...   
1           2  BASEL, Switzerland (AP), Roger Federer advance...   
2           3  Roger Federer has revealed that organisers of ...   
3           4  Kei Nishikori will try to end his long losing ...   
4           5  Federer, 37, first broke through on tour over ...   

                                              source  
0  https://www.tennisworldusa.org/tennis/news/Mar...  
1  http://www.tennis.com/pro-game/2018/10/copil-s...  
2  https://scroll.in/field/899938/tennis-roger-fe...  
3  http://www.tennis.com/pro-game/2018/10/nishiko...  
4  https://www.express.co.uk/sport/tennis/1036101...  


# Data Preprocessing

In [4]:
data = df[['article_text']]
data['sentences'] = data['article_text'].apply(sent_tokenize)
print(data.head(5))

                                        article_text  \
0  Maria Sharapova has basically no friends as te...   
1  BASEL, Switzerland (AP), Roger Federer advance...   
2  Roger Federer has revealed that organisers of ...   
3  Kei Nishikori will try to end his long losing ...   
4  Federer, 37, first broke through on tour over ...   

                                           sentences  
0  [Maria Sharapova has basically no friends as t...  
1  [BASEL, Switzerland (AP), Roger Federer advanc...  
2  [Roger Federer has revealed that organisers of...  
3  [Kei Nishikori will try to end his long losing...  
4  [Federer, 37, first broke through on tour over...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentences'] = data['article_text'].apply(sent_tokenize)


In [5]:
stop_words = stopwords.words('english')

def tokenize(sentence):
    return [word_tokenize(sentence) for sentence in sentence]

def preprocess_sentence(sentence):
    sentence_processed = [re.sub(r'[^a-zA-z\s]', '', word.lower()) for word in sentence]
    return [word for word in sentence_processed if word not in stop_words and word != '']

def preprocess_sentences(sentences):
    return [preprocess_sentence(sentence) for sentence in sentences]

In [6]:
data['tokenized_sentences'] = data['sentences'].apply(tokenize)
print(data.head(5))
data['tokenized_sentences'] = data['tokenized_sentences'].apply(preprocess_sentences)
print(data.head(5))

                                        article_text  \
0  Maria Sharapova has basically no friends as te...   
1  BASEL, Switzerland (AP), Roger Federer advance...   
2  Roger Federer has revealed that organisers of ...   
3  Kei Nishikori will try to end his long losing ...   
4  Federer, 37, first broke through on tour over ...   

                                           sentences  \
0  [Maria Sharapova has basically no friends as t...   
1  [BASEL, Switzerland (AP), Roger Federer advanc...   
2  [Roger Federer has revealed that organisers of...   
3  [Kei Nishikori will try to end his long losing...   
4  [Federer, 37, first broke through on tour over...   

                                 tokenized_sentences  
0  [[Maria, Sharapova, has, basically, no, friend...  
1  [[BASEL, ,, Switzerland, (, AP, ), ,, Roger, F...  
2  [[Roger, Federer, has, revealed, that, organis...  
3  [[Kei, Nishikori, will, try, to, end, his, lon...  
4  [[Federer, ,, 37, ,, first, broke, through, on..

# Sentence Embedding

In [7]:
# sentence embedding using GloVe
# calculate the average of all word vectors in a sentence
embedding_dims = 100
zero_vector = np.zeros(embedding_dims) # zero vector for out of vocabulary words (OOV)

def calculate_sentence_vector(sentence):
    if len(sentence) != 0:
        result = sum([glove_dict.get(word, zero_vector) for word in sentence]) / len(sentence)
        return result
    else:
        return zero_vector
    
def calculate_sentence_vectors(sentences):
    return [calculate_sentence_vector(sentence) for sentence in sentences]

### dict.get(key, default_value)
 - return the value for key if key is in the dictionary, else default. If default is not given, it defaults to None, so that this method never raises a KeyError.

In [8]:
data['sentence_embedding'] = data['tokenized_sentences'].apply(calculate_sentence_vectors)
data.head(5)

Unnamed: 0,article_text,sentences,tokenized_sentences,sentence_embedding
0,Maria Sharapova has basically no friends as te...,[Maria Sharapova has basically no friends as t...,"[[maria, sharapova, basically, friends, tennis...","[[0.051488996, 0.1105585, 0.6950863, 0.1891917..."
1,"BASEL, Switzerland (AP), Roger Federer advance...","[BASEL, Switzerland (AP), Roger Federer advanc...","[[basel, switzerland, ap, roger, federer, adva...","[[0.10566062456928194, -0.10534465219825506, 0..."
2,Roger Federer has revealed that organisers of ...,[Roger Federer has revealed that organisers of...,"[[roger, federer, revealed, organisers, relaun...","[[-0.022272188, -0.0474477, 0.14933074, -0.086..."
3,Kei Nishikori will try to end his long losing ...,[Kei Nishikori will try to end his long losing...,"[[kei, nishikori, try, end, long, losing, stre...","[[0.045201838, -0.064647146, 0.5035717, -0.160..."
4,"Federer, 37, first broke through on tour over ...","[Federer, 37, first broke through on tour over...","[[federer, first, broke, tour, two, decades, a...","[[0.21536233, 0.180915, 0.25600883, 0.06924241..."


In [9]:
def similarity_matrix(sentence_embedding):
    sim_mat = np.zeros([len(sentence_embedding), len(sentence_embedding)])
    for i in range(len(sentence_embedding)):
        for j in range(len(sentence_embedding)):
            sim_mat[i][j] = cosine_similarity(sentence_embedding[i].reshape(1, embedding_dims), sentence_embedding[j].reshape(1, embedding_dims))[0,0]
    return sim_mat

In [10]:
data['similarity_matrix'] = data['sentence_embedding'].apply(similarity_matrix)
data.head(5)

Unnamed: 0,article_text,sentences,tokenized_sentences,sentence_embedding,similarity_matrix
0,Maria Sharapova has basically no friends as te...,[Maria Sharapova has basically no friends as t...,"[[maria, sharapova, basically, friends, tennis...","[[0.051488996, 0.1105585, 0.6950863, 0.1891917...","[[1.0, 0.6477208137512207, 0.5915699601173401,..."
1,"BASEL, Switzerland (AP), Roger Federer advance...","[BASEL, Switzerland (AP), Roger Federer advanc...","[[basel, switzerland, ap, roger, federer, adva...","[[0.10566062456928194, -0.10534465219825506, 0...","[[1.0, 0.8745531210608368, 0.8054854557907094,..."
2,Roger Federer has revealed that organisers of ...,[Roger Federer has revealed that organisers of...,"[[roger, federer, revealed, organisers, relaun...","[[-0.022272188, -0.0474477, 0.14933074, -0.086...","[[0.9999998807907104, 0.9130853414535522, 0.89..."
3,Kei Nishikori will try to end his long losing ...,[Kei Nishikori will try to end his long losing...,"[[kei, nishikori, try, end, long, losing, stre...","[[0.045201838, -0.064647146, 0.5035717, -0.160...","[[1.0, 0.7769179305294038, 0.8405090570449829,..."
4,"Federer, 37, first broke through on tour over ...","[Federer, 37, first broke through on tour over...","[[federer, first, broke, tour, two, decades, a...","[[0.21536233, 0.180915, 0.25600883, 0.06924241...","[[0.9999999403953552, 0.8306209732530097, 0.90..."


In [11]:
def calculate_score(sim_matrix):
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)
    return scores

In [12]:
data['score'] = data['similarity_matrix'].apply(calculate_score)
data.head(5)

Unnamed: 0,article_text,sentences,tokenized_sentences,sentence_embedding,similarity_matrix,score
0,Maria Sharapova has basically no friends as te...,[Maria Sharapova has basically no friends as t...,"[[maria, sharapova, basically, friends, tennis...","[[0.051488996, 0.1105585, 0.6950863, 0.1891917...","[[1.0, 0.6477208137512207, 0.5915699601173401,...","{0: 0.05492159165231015, 1: 0.0628651974204509..."
1,"BASEL, Switzerland (AP), Roger Federer advance...","[BASEL, Switzerland (AP), Roger Federer advanc...","[[basel, switzerland, ap, roger, federer, adva...","[[0.10566062456928194, -0.10534465219825506, 0...","[[1.0, 0.8745531210608368, 0.8054854557907094,...","{0: 0.08315094418018205, 1: 0.0849861134785175..."
2,Roger Federer has revealed that organisers of ...,[Roger Federer has revealed that organisers of...,"[[roger, federer, revealed, organisers, relaun...","[[-0.022272188, -0.0474477, 0.14933074, -0.086...","[[0.9999998807907104, 0.9130853414535522, 0.89...","{0: 0.061484405802359955, 1: 0.062248644459824..."
3,Kei Nishikori will try to end his long losing ...,[Kei Nishikori will try to end his long losing...,"[[kei, nishikori, try, end, long, losing, stre...","[[0.045201838, -0.064647146, 0.5035717, -0.160...","[[1.0, 0.7769179305294038, 0.8405090570449829,...","{0: 0.08164728344152744, 1: 0.0727647935126785..."
4,"Federer, 37, first broke through on tour over ...","[Federer, 37, first broke through on tour over...","[[federer, first, broke, tour, two, decades, a...","[[0.21536233, 0.180915, 0.25600883, 0.06924241...","[[0.9999999403953552, 0.8306209732530097, 0.90...","{0: 0.0556868981277235, 1: 0.05099231773928478..."


In [32]:
def ranked_sentences(sentences, scores, n=3):
    top_scores = sorted(((scores[i],s)
                         for i,s in enumerate(sentences)),
                        reverse=True)
    # print("@")
    # print(top_scores)
    top_n_sentences = [sentence
                       for score,sentence in top_scores[:n]]
    return " ".join(top_n_sentences)

In [33]:
data['summary'] = data.apply(lambda row: ranked_sentences(row['sentences'], row['score']), axis=1)
data.head(5)

Unnamed: 0,article_text,sentences,tokenized_sentences,sentence_embedding,similarity_matrix,score,summary
0,Maria Sharapova has basically no friends as te...,[Maria Sharapova has basically no friends as t...,"[[maria, sharapova, basically, friends, tennis...","[[0.051488996, 0.1105585, 0.6950863, 0.1891917...","[[1.0, 0.6477208137512207, 0.5915699601173401,...","{0: 0.05492159165231015, 1: 0.0628651974204509...",I think just because you're in the same sport ...
1,"BASEL, Switzerland (AP), Roger Federer advance...","[BASEL, Switzerland (AP), Roger Federer advanc...","[[basel, switzerland, ap, roger, federer, adva...","[[0.10566062456928194, -0.10534465219825506, 0...","[[1.0, 0.8745531210608368, 0.8054854557907094,...","{0: 0.08315094418018205, 1: 0.0849861134785175...",Federer had an easier time than in his only pr...
2,Roger Federer has revealed that organisers of ...,[Roger Federer has revealed that organisers of...,"[[roger, federer, revealed, organisers, relaun...","[[-0.022272188, -0.0474477, 0.14933074, -0.086...","[[0.9999998807907104, 0.9130853414535522, 0.89...","{0: 0.061484405802359955, 1: 0.062248644459824...",Major players feel that a big event in late No...
3,Kei Nishikori will try to end his long losing ...,[Kei Nishikori will try to end his long losing...,"[[kei, nishikori, try, end, long, losing, stre...","[[0.045201838, -0.064647146, 0.5035717, -0.160...","[[1.0, 0.7769179305294038, 0.8405090570449829,...","{0: 0.08164728344152744, 1: 0.0727647935126785...",Kei Nishikori will try to end his long losing ...
4,"Federer, 37, first broke through on tour over ...","[Federer, 37, first broke through on tour over...","[[federer, first, broke, tour, two, decades, a...","[[0.21536233, 0.180915, 0.25600883, 0.06924241...","[[0.9999999403953552, 0.8306209732530097, 0.90...","{0: 0.0556868981277235, 1: 0.05099231773928478...","""Not always, but I really feel like in the mid..."


In [34]:
for i in range(0, len(data)):
    print(i+1,'번 문서')
    print('원문 :',data.loc[i].article_text)
    print('')
    print('요약 :',data.loc[i].summary)
    print('')

1 번 문서
원문 : Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're i