In [1]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/neosoft/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read the tennis article corpus; the CSV is decoded with the
# 'unicode-escape' codec.
articles_path = "tennis_articles.csv"
df = pd.read_csv(articles_path, encoding='unicode-escape')

In [3]:
# Preview the first four rows to confirm the columns loaded as expected.
df.head(n=4)

Unnamed: 0,article_id,article_title,article_text,source
0,1,"I do not have friends in tennis, says Maria Sh...",Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,Federer defeats Medvedev to advance to 14th Sw...,"BASEL, Switzerland (AP)  Roger Federer advanc...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Tennis: Roger Federer ignored deadline set by ...,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Nishikori to face off against Anderson in Vien...,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...


In [4]:
# Show the single value at row position 0, column position 2
# (per the output below, the body text of the first article).
df.iat[0, 2]

"Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net. So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same

In [5]:

# Split every article body into sentences with NLTK's `sent_tokenize`.
# The name is imported in the notebook's import cell; without that import
# this cell raises NameError on a freshly started kernel.
# The result holds one inner list of sentences per article.
sentences = [sent_tokenize(article) for article in df['article_text']]

In [6]:
# Flatten the per-article sentence lists into one flat list of sentences.
# Entries that are already plain strings are kept whole, so re-running this
# cell is a no-op instead of splitting every sentence into single characters.
sentences = [
    sentence
    for group in sentences
    for sentence in ([group] if isinstance(group, str) else group)
]

In [7]:
# Load the GloVe vectors into a dict mapping each word (first field of a
# line) to a float32 array built from the remaining fields.
word_embeddings = {}
# `with` guarantees the file handle is released even if a line fails to parse.
with open('./glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

# Number of words that have an embedding.
len(word_embeddings)

400000

In [9]:
# Replace every character that is not an ASCII letter with a space.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text unchanged.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

In [10]:
# Lower-case every cleaned sentence; the result is a plain Python list.
lowered_sentences = []
for sentence in clean_sentences:
    lowered_sentences.append(sentence.lower())
clean_sentences = lowered_sentences

In [11]:
# Download NLTK's 'stopwords' package; per the log below, the local copy
# was already up to date when this cell last ran.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/neosoft/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
from nltk.corpus import stopwords
# English stop words from the NLTK corpus; remove_stopwords reads this
# module-level name when filtering tokens.
stop_words = stopwords.words('english')


In [12]:
def remove_stopwords(sen, stopword_set=None):
    """Return the tokens of ``sen`` that are not stop words, joined by spaces.

    Args:
        sen: Iterable of string tokens (e.g. the result of ``str.split()``).
        stopword_set: Optional collection of words to drop. When omitted, the
            notebook-level ``stop_words`` list is used, so existing
            single-argument calls behave exactly as before.

    Returns:
        A single string with the surviving tokens separated by one space;
        the empty string when nothing survives.
    """
    if stopword_set is None:
        # Resolved at call time, so the defining cell may run before
        # `stop_words` is created.
        stopword_set = stop_words
    # Hash-based membership instead of a linear scan of the list per token.
    blocked = frozenset(stopword_set)
    return " ".join(token for token in sen if token not in blocked)

In [14]:
# Split each sentence on whitespace and drop its stop words.
filtered_sentences = []
for sentence in clean_sentences:
    filtered_sentences.append(remove_stopwords(sentence.split()))
clean_sentences = filtered_sentences


In [16]:
# One 100-dimensional vector per cleaned sentence: the sum of its word
# embeddings divided by (word count + 0.001). Words without an embedding
# contribute a zero vector, and an empty sentence maps to the zero vector.
sentence_vectors = []
for cleaned in clean_sentences:
    if not cleaned:
        sentence_vectors.append(np.zeros((100,)))
        continue
    tokens = cleaned.split()
    token_vectors = [word_embeddings.get(token, np.zeros((100,))) for token in tokens]
    sentence_vectors.append(sum(token_vectors) / (len(tokens) + 0.001))

In [32]:
# Inspect the computed vectors. NOTE(review): this echoes the entire list;
# a slice such as sentence_vectors[:2] would keep the saved output small.
sentence_vectors

[array([ 0.09423097,  0.13669517,  0.58544935,  0.20381502, -0.1671801 ,
         0.34834924,  0.17839483,  0.37101227, -0.20376579, -0.12805886,
         0.36699214,  0.07576428, -0.03720036,  0.22964005, -0.31566828,
        -0.0996928 , -0.00833809,  0.27326583, -0.24724656,  0.4564281 ,
        -0.12987752,  0.33414586,  0.12960431,  0.72200727,  0.2367894 ,
        -0.08360017,  0.1354242 , -0.9088701 ,  0.59379951,  0.1741106 ,
        -0.31913759,  0.25016489,  0.22933419, -0.19942921,  0.35909751,
        -0.09988502, -0.43280267,  0.33715736, -0.17687914, -0.12392576,
        -0.09833796, -0.183877  ,  0.21569655, -0.36719825,  0.11650406,
        -0.23125742, -0.18450755, -0.0596488 ,  0.45691286, -0.21181901,
        -0.16230433, -0.30980917,  0.14066741,  0.19803775,  0.09769779,
        -1.35030617, -0.01634588,  0.26587548,  0.40476317,  0.68378705,
        -0.23487752,  0.42370602, -0.24793846,  0.25880627, -0.07375703,
        -0.17501676,  0.03179123,  0.43157982,  0.1

In [17]:
# Square matrix with one row and one column per sentence, initialised to
# zero; it will hold the pairwise sentence similarities.
n_sentences = len(sentences)
sim_mat = np.zeros((n_sentences, n_sentences))


In [18]:
# Sanity check: the matrix has one row and one column per sentence.
sim_mat.shape

(130, 130)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Cosine similarity between every pair of sentence vectors, computed in one
# vectorised call instead of one scikit-learn call per pair. Stacking the
# vectors yields an (n_sentences, embedding_dim) matrix, so the embedding
# size does not have to be hard-coded.
if sentence_vectors:
    sim_mat = cosine_similarity(np.vstack(sentence_vectors))
    # Self-similarity is zeroed so that no sentence links to itself.
    np.fill_diagonal(sim_mat, 0.0)


In [23]:
import networkx as nx

In [24]:
# Build a graph from the similarity matrix and score its nodes with PageRank.
# NOTE(review): presumably each node is a sentence index and the matrix
# entries act as edge weights — confirm against the networkx defaults.
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [28]:
# PageRank result: a dict keyed by integer index (looked up later by each
# sentence's position in `sentences`).
scores

{0: 0.0073776808835349025,
 1: 0.007885307995562705,
 2: 0.007385254137362013,
 3: 0.008245138253656881,
 4: 0.008653339402971555,
 5: 0.006769978060251003,
 6: 0.007605669110019532,
 7: 0.008071702701698272,
 8: 0.0076246992485529685,
 9: 0.007841456093746977,
 10: 0.00755014818761577,
 11: 0.0011691494275410529,
 12: 0.008213097732054768,
 13: 0.007636257696345818,
 14: 0.00766021150280838,
 15: 0.007910505976348903,
 16: 0.008055288322413522,
 17: 0.007357256066827423,
 18: 0.007235249541111245,
 19: 0.007076514134686431,
 20: 0.007327874116145199,
 21: 0.008241592557948641,
 22: 0.008378684685734482,
 23: 0.00696613664699737,
 24: 0.0068957697954534995,
 25: 0.008208558268979954,
 26: 0.008014121495525791,
 27: 0.006065158231465884,
 28: 0.007780230248331551,
 29: 0.008427139656493592,
 30: 0.008563944584977763,
 31: 0.008458701558391341,
 32: 0.008637406522327636,
 33: 0.008022606023429368,
 34: 0.007461507351723913,
 35: 0.008297981501389615,
 36: 0.008255276765691495,
 37: 0.007

In [26]:
# Pair every sentence with its PageRank score and order the pairs from
# highest to lowest (ties fall back to comparing the sentence text).
scored_sentences = [(scores[idx], text) for idx, text in enumerate(sentences)]
scored_sentences.sort(reverse=True)
ranked_sentences = scored_sentences

In [27]:
# Print the ten highest-ranked sentences as the extractive summary.
# Slicing (rather than indexing positions 0..9) avoids an IndexError when
# the corpus yields fewer than ten sentences.
for _, sentence in ranked_sentences[:10]:
    print(sentence)

I was on a nice trajectorythen, Reid recalled.If I hadnt got sick, I think I could have started pushing towards the second week at the slams and then who knows. Duringa comeback attempt some five years later, Reid added Bernard Tomic and 2018 US Open Federer slayer John Millman to his list of career scalps.
Full effort Nick could live out his tennis like a (Tomas) Berdych or (Jo- Wilfried) Tsonga, consistently making second week,quarters, semis, finals of slams - and then hopefully more.
Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.
I just felt like it really kind of changed where people were a little bit, definitely in the '90s, a lot more quiet, into themselves, and then it started to become better. Meanwhile, Federer is hoping he can improve his service game as he hunts his ninth Swiss Indoors title this week.
I felt like the best weeks that I had to get to know pla