## Import Library

In [0]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import networkx as nx

## Read the Dataset

In [0]:
df = pd.read_csv("https://raw.githubusercontent.com/eshitagangwar/Text-Summarization/master/tennis_articles_v4.csv")

In [166]:
df.head()

Unnamed: 0,article_id,article_text,source,summary
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...,I think everyone just thinks because we're ten...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...,Copil upset expectations of a Federer final ag...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...,They only left me three days to decide Federer...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...,The secondseeded Anderson defeated Fernando Ve...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...,I think there is a really nice environment and...


In [146]:
df['article_text'][0]

"Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same 

In [147]:
sentences[:5]

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.",
 'I think everyone knows this is my job here.',
 "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.",
 "I'm a pretty competitive girl."]

In [148]:
docs = df['article_text'].str.replace('[^A-Za-z\'. ]', '')
docs.head()

0    Maria Sharapova has basically no friends as te...
1    BASEL Switzerland AP Roger Federer advanced to...
2    Roger Federer has revealed that organisers of ...
3    Kei Nishikori will try to end his long losing ...
4    Federer  first broke through on tour over two ...
Name: article_text, dtype: object

In [149]:
stemmer = nltk.stem.PorterStemmer()
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [150]:
docs.head()

0    Maria Sharapova has basically no friends as te...
1    BASEL Switzerland AP Roger Federer advanced to...
2    Roger Federer has revealed that organisers of ...
3    Kei Nishikori will try to end his long losing ...
4    Federer  first broke through on tour over two ...
Name: article_text, dtype: object

## Data Pre-Processing

In [0]:
def clean (article):
    
    article = article.split(". ")
    sentences = []

    for sentence in article:
       #print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop() 
    
    return sentences




## Calculating cosine similarity

In [0]:
def similar(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)
 

## Smilarity_matrix

In [0]:
def matrix(sentences, stop_words):
    
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: 
                continue 
            similarity_matrix[idx1][idx2] = similar(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

## Final function for summary

In [154]:
def final(article, top_n=4):
    stop_words = stopwords.words('english')
    summarize_text = []

    
    sentences =  clean(article)

    
    martix = matrix(sentences, stop_words)

    
    graph = nx.from_numpy_array(martix)
    scores = nx.pagerank(graph)

    
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
      

    for i in range(top_n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

    
               
    return " ".join(summarize_text)

  
docs.apply(final)

0    I think everyone just thinks because we're ten...
1    Copil upset expectations of a Federer final ag...
2    They only left me three days to decide Federer...
3    The secondseeded Anderson defeated Fernando Ve...
4    I think there is a really nice environment and...
5    Nadal could then play defending champion Jack ...
6    He could lose  points in the next few weeks bu...
7    Federer won the Swiss Indoors last week by bea...
Name: article_text, dtype: object

In [0]:
df['summary'] = docs.apply(final)

In [168]:
df

Unnamed: 0,article_id,article_text,source,summary
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...,I think everyone just thinks because we're ten...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...,Copil upset expectations of a Federer final ag...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...,They only left me three days to decide Federer...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...,The secondseeded Anderson defeated Fernando Ve...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...,I think there is a really nice environment and...
5,6,Nadal has not played tennis since he was force...,https://www.express.co.uk/sport/tennis/1037119...,Nadal could then play defending champion Jack ...
6,7,"Tennis giveth, and tennis taketh away. The end...",http://www.tennis.com/pro-game/2018/10/tennisc...,He could lose points in the next few weeks bu...
7,8,Federer won the Swiss Indoors last week by bea...,https://www.express.co.uk/sport/tennis/1038186...,Federer won the Swiss Indoors last week by bea...
