### Generate a single summary for all the articles in the dataset.

#### Import required libraries

In [1]:
# Necessary imports
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import re
import networkx as nx
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/san/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Read the dataset into a dataframe

In [2]:
# Load the sports articles CSV (path is relative to the notebook's directory)
df = pd.read_csv("../data/sports_articles.csv")

#### Set the index

In [3]:
# Use article_id as the row index.
# The guard keeps this cell re-runnable: once article_id has become the index
# it is no longer a column, so a second set_index call would raise a KeyError.
if df.index.name != 'article_id':
    df = df.set_index('article_id')

#### Examine the data 

In [4]:
# Preview the first rows of the dataframe
df.head()

Unnamed: 0_level_0,article_text,source
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


#### Important features

There are three features in the sports articles dataset:
1. article_id
2. article_text
3. source

The most important of these is 'article_text', which contains the text of each article.

#### Examine a sample article's text

In [5]:
# Show the text of the article whose article_id label is 1.
# .loc makes the lookup explicitly label-based instead of relying on chained
# [] indexing with an integer key.
df.loc[1, 'article_text']

"Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same 

#### Split Text into Sentences

In [6]:
# Split every article into sentences and gather them all in one flat list,
# keeping the articles (and the sentences within each article) in order
sentences = [
    sentence
    for article in df['article_text']
    for sentence in sent_tokenize(article)
]

In [7]:
# Inspect the first five sentences produced by the split
sentences[:5]

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.",
 'I think everyone knows this is my job here.',
 "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.",
 "I'm a pretty competitive girl."]

#### Text Preprocessing

In [8]:
# Replace every character that is not an ASCII letter (punctuation, digits,
# special characters) with a space.
# regex=True is required: from pandas 2.0 on, str.replace treats the pattern
# as a literal string by default, which would leave the text unchanged.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

# Convert to lowercase and return to a plain list of strings
clean_sentences = clean_sentences.str.lower().tolist()

In [9]:
# Load the English stopword list.
# Only 'punkt' is downloaded at the top of the notebook, so fetch the
# stopwords corpus here when it is not installed yet; otherwise
# stopwords.words() raises LookupError on a fresh environment.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

stop_words = stopwords.words('english')

In [10]:
# Define function to remove stopwords
def remove_stopwords(sen):
    """Join the tokens of `sen` that are not stopwords into one string.

    `sen` is an iterable of tokens; each token found in the module-level
    `stop_words` collection is dropped, and the remaining tokens are joined
    with single spaces in their original order.
    """
    kept_tokens = (token for token in sen if token not in stop_words)
    return " ".join(kept_tokens)

In [11]:
# Tokenize each cleaned sentence on whitespace and drop its stopwords
clean_sentences = list(
    map(lambda text: remove_stopwords(text.split()), clean_sentences)
)

In [12]:
# TF-IDF vector representation of the sentences.
# - norm=None switches row normalisation off. The previous norm=False is not
#   one of the accepted values ('l1', 'l2', None) and is rejected by recent
#   scikit-learn releases.
# - The vectorizer is fitted on clean_sentences so that the preprocessing
#   above (character filtering, lowercasing, stopword removal) is actually
#   used. clean_sentences has one entry per item of `sentences`, so row i
#   still corresponds to sentences[i].
vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
sentence_vectors = vectorizer.fit_transform(clean_sentences)

In [13]:
#print(sentence_vectors)

#### Compute the similarity between each pair of sentences using cosine similarity

In [14]:
# Start from an all-zero square matrix with one row and one column per sentence
sim_mat = np.zeros((len(sentences), len(sentences)), dtype=float)
print(sim_mat)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
# Fill the similarity matrix with the cosine similarity of every sentence pair.
# A single call on the whole TF-IDF matrix returns all pairwise scores at once,
# replacing len(sentences) ** 2 separate calls on single-row slices.
sim_mat = cosine_similarity(sentence_vectors)

# A sentence is not compared with itself: keep the diagonal at zero,
# as the original `i != j` check did.
np.fill_diagonal(sim_mat, 0.0)