In [None]:
import random
import re
import urllib
import urllib.request  # `import urllib` alone does not load the `request` submodule

import bs4 as bs
import numpy as np
from gensim.models import Word2Vec
from nltk.cluster import KMeansClusterer, euclidean_distance, cosine_distance
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

In [None]:
# Get text from wiki article
ARTICLE_URL = 'https://en.wikipedia.org/wiki/Basketball'

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(ARTICLE_URL) as data:
    article = data.read()

parsed_article = bs.BeautifulSoup(article, 'lxml')
paragraphs = parsed_article.find_all('p')

# Only the text of the <p> elements is kept, concatenated in document order.
text = "".join(p.text for p in paragraphs)

In [None]:
# Do some processing
# `text` is overwritten in place, so every step here must be idempotent:
# re-running this cell on already-clean text has to leave it unchanged.
# (A previous `text[2:-1]` slice chopped real characters off the article,
# and more of them on every re-run, so it is intentionally not applied.)

# Replace escaped control sequences that appear as literal backslash text.
for escaped in ("\\r\\n", "\\n", "\\x0c"):
    text = text.replace(escaped, " ")

# Collapse every run of whitespace (including newlines) into one space.
text = ' '.join(text.split())

# Drop bracketed numeric citation markers such as "[12]", then tidy the
# spacing those removals leave behind.
text = re.sub(r'\[[0-9]*\]', ' ', text)
text = re.sub(r'\s+', ' ', text).strip()

In [None]:
# Tokenize the processed article text into a list of sentences
sentences = sent_tokenize(text, language="english")

In [None]:
# Create a cleaned version of the text
# The stop-word list is loaded once into a set; looking it up again for every
# token is needlessly slow and a list makes each membership test linear.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

# One token list per sentence: non-letters become spaces, text is lower-cased,
# and stop words are removed. Indices stay aligned with `sentences`; a sentence
# made up only of stop words / non-letters yields an empty list.
all_words = [
    [
        word
        for word in non_letters.sub(" ", sentence).lower().split()
        if word not in stop_words
    ]
    for sentence in sentences
]

# Same content as `all_words`, with each token list joined back into a string.
cleaned_txt = [' '.join(tokens) for tokens in all_words]

In [None]:
# Train word embeddings on the tokenized sentences.
# min_count=1 keeps every token in the vocabulary so each one can be looked up
# later. workers=1 removes thread-scheduling jitter so repeated runs give the
# same vectors (NOTE(review): gensim documents that reproducibility across
# interpreter launches additionally needs PYTHONHASHSEED to be fixed).
model = Word2Vec(sentences=all_words, min_count=1, workers=1)

In [None]:
# Create vectors from sentences
# Each sentence is represented by the mean of its word vectors. A sentence
# with no tokens left after cleaning would otherwise divide by zero, so it
# gets an all-zero vector instead; this keeps `sent_vector` index-aligned
# with `sentences`.
# NOTE(review): zero vectors are fine for Euclidean distance but have no
# defined cosine distance - revisit if the cosine metric is selected.
empty_vector = np.zeros(model.wv.vector_size, dtype=model.wv.vectors.dtype)

sent_vector = []
for cleaned_sentence in cleaned_txt:
    tokens = cleaned_sentence.split()
    if tokens:
        sent_vector.append(sum(model.wv[token] for token in tokens) / len(tokens))
    else:
        # Fresh copy per sentence so no two entries share the same array.
        sent_vector.append(empty_vector.copy())

In [None]:
# Choose the distance metric used for clustering.
# `distance_name` is the setting to tune; `distance` is always the callable.
distance_name = "euclidean"

distance_functions = {
    "cosine": cosine_distance,
    "euclidean": euclidean_distance,
}

# Fail here with a clear message rather than handing a bare string to the
# clusterer further down.
if distance_name not in distance_functions:
    raise ValueError(
        f"Unknown distance {distance_name!r}; expected one of {sorted(distance_functions)}"
    )
distance = distance_functions[distance_name]

In [None]:
# Initialize clusterer
n_clusters = 8
RANDOM_SEED = 42

kclusterer = KMeansClusterer(
    num_means=n_clusters,
    distance=distance,
    # Seeded generator so the initial centroids are the same on every run.
    rng=random.Random(RANDOM_SEED),
    # Without this, NLTK aborts when a cluster ends up with no members.
    avoid_empty_clusters=True,
)

In [None]:
# Run k-means on the sentence vectors.
# `clusters` holds one cluster index per sentence (same order as `sent_vector`).
clusters = kclusterer.cluster(sent_vector, assign_clusters=True)

# `means` holds one centroid vector per cluster.
means = kclusterer.means()

In [None]:
# Build the summary: for every cluster, keep the sentence whose vector lies
# closest to that cluster's centroid.
#
# The closeness test reuses `distance`, the same metric callable the clusterer
# was configured with, so selection and clustering always agree. No module is
# imported under the name `distance` here, because that would rebind the
# metric and break re-running the clusterer cell.
summary_indices = []  # List of sentence indices for summary
for cluster_index, centroid in enumerate(means):
    # Distance to the centroid for each sentence assigned to this cluster
    distances = {
        sentence_index: distance(centroid, sent_vector[sentence_index])
        for sentence_index, assigned_cluster in enumerate(clusters)
        if assigned_cluster == cluster_index
    }

    if not distances:
        # No sentence was assigned to this cluster, so it contributes nothing.
        continue

    # Add index of the sentence closest to the cluster's centroid
    summary_indices.append(min(distances, key=distances.get))

# Sentences appear in cluster order, separated by single spaces.
summary = " ".join(sentences[i] for i in summary_indices)

In [None]:
# Print the summary string assembled from the selected sentences
print(summary)