## Importing the Natural Language Toolkit (NLTK)

In [0]:
import nltk


In [0]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

## Reading the input file

In [0]:
def read_article(file_name):
  """Read the first line of a text file and split it into tokenized sentences.

  Only the first line of the file is used. It is split on ". " into
  sentences; each sentence is echoed to stdout and then split on single
  spaces into a list of words. The final fragment of the line is always
  discarded.

  Args:
    file_name: Path of the text file to read.

  Returns:
    A list of sentences, each represented as a list of word strings.
    Returns an empty list when the file has no content.
  """
  # The context manager closes the handle even if reading raises.
  with open(file_name, "r") as file:
    first_line = file.readline()

  # An empty file has no first line; return no sentences instead of failing.
  if not first_line:
    return []

  sentences = []
  for sentence in first_line.split(". "):
    print(sentence)
    # NOTE(review): str.replace matches "[^a-zA-Z]" as literal text, not as a
    # regular expression, so punctuation is NOT stripped here. Kept as-is so
    # the produced sentences (and the printed summary) stay unchanged.
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))

  # NOTE(review): this unconditionally drops the last fragment of the line,
  # which is a real sentence when the line does not end with ". " - confirm
  # that this is intended for the input files in use.
  sentences.pop()

  return sentences

In [0]:
def sentence_similarity(sent1, sent2, stopwords=None):
  """Return the cosine similarity of two tokenized sentences.

  Both sentences are lower-cased and turned into word-count vectors over
  their combined vocabulary, skipping any word found in `stopwords`.

  Args:
    sent1: First sentence, as a list of word strings.
    sent2: Second sentence, as a list of word strings.
    stopwords: Optional collection of words to ignore. Defaults to none.

  Returns:
    1 minus the cosine distance of the two count vectors, or 0.0 when
    either sentence has no countable words (empty or only stopwords).
  """
  if stopwords is None:
    stopwords = []
  ignored = set(stopwords)

  sent1 = [w.lower() for w in sent1]
  sent2 = [w.lower() for w in sent2]

  all_words = list(set(sent1 + sent2))
  # Map each word to its vector position once instead of scanning the
  # vocabulary list with list.index() for every word.
  position = {word: i for i, word in enumerate(all_words)}

  vector1 = [0] * len(all_words)
  vector2 = [0] * len(all_words)

  # Build the vector for the first sentence.
  # The increment must live inside the loop so every non-stopword is counted.
  for w in sent1:
    if w in ignored:
      continue
    vector1[position[w]] += 1

  # Build the vector for the second sentence.
  for w in sent2:
    if w in ignored:
      continue
    vector2[position[w]] += 1

  # A zero vector has no direction: the cosine would divide by zero and
  # produce NaN, so treat such sentences as sharing nothing.
  if not any(vector1) or not any(vector2):
    return 0.0

  return 1 - cosine_distance(vector1, vector2)


## Cosine similarity matrix

In [0]:
def build_similarity_matrix(sentences, stop_words):
  """Build the square matrix of pairwise sentence similarities.

  Args:
    sentences: List of tokenized sentences (each a list of words).
    stop_words: Words passed through to `sentence_similarity` to be ignored.

  Returns:
    A numpy array of shape (len(sentences), len(sentences)) where entry
    [i][j] is the similarity of sentence i to sentence j. The diagonal is
    left at zero.
  """
  count = len(sentences)
  matrix = np.zeros((count, count))

  for row, first in enumerate(sentences):
    for col, second in enumerate(sentences):
      # A sentence is not compared with itself.
      if row != col:
        matrix[row][col] = sentence_similarity(first, second, stop_words)

  return matrix

In [0]:
def generate_summary(file_name, top_n=5):
  """Print an extractive summary of a text file.

  Sentences read from the file are compared pairwise, the resulting
  similarity matrix is turned into a graph, and PageRank scores on that
  graph decide which sentences are printed.

  Args:
    file_name: Path of the text file to summarize.
    top_n: Maximum number of sentences to include. When the article has
      fewer sentences, all of them are used.

  Returns:
    None. The ranking and the summary are written to stdout.
  """
  # Download the stopword corpus only when it is missing locally, so repeated
  # calls do not need network access.
  try:
    nltk.data.find("corpora/stopwords")
  except LookupError:
    nltk.download("stopwords")
  stop_words = stopwords.words('english')
  summarize_text = []

  # Read the text and split it into sentences.
  sentences = read_article(file_name)

  # Generate the similarity matrix across sentences.
  sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)

  # Rank sentences using PageRank on the similarity graph.
  sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
  scores = nx.pagerank(sentence_similarity_graph)

  # Sort by score, highest first, and pick the top sentences.
  ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
  print("Indexes of top ranked_sentence order are ", ranked_sentence)

  # Clamp to the number of available sentences; indexing past the end of the
  # ranking would raise IndexError on short articles.
  for i in range(min(top_n, len(ranked_sentence))):
    summarize_text.append(" ".join(ranked_sentence[i][1]))

  print("Summarize Text: \n", ". ".join(summarize_text))

  

In [84]:
# Summarize "text_ai.txt", requesting the 5 highest-ranked sentences.
generate_summary( "text_ai.txt", 5)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Artificial intelligence is a branch of computer science that aims to create intelligent machines
It has become an essential part of the technology industry.Knowledge engineering is a core part of AI research
Machines can often act and react like humans only if they have abundant information relating to the world
Artificial intelligence must have access to objects, categories, properties and relations between all of them to implement knowledge engineering
Initiating common sense, reasoning and problem-solving power in machines is a difficult and tedious task
Machine learning is also a core part of AI
Learning without any kind of supervision requires an ability to identify patterns in streams of inputs, whereas learning with adequate supervision involves classification and numerical regressions
Classification determines the category an object belongs to and regression de