In [16]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import sent_tokenize
import numpy as np
import networkx as nx
from readability import Readability
nltk.download('stopwords')
nltk.download('punkt')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to C:\Users\Sanchana
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Sanchana
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0,Name,Review_id,Rating,Review
0,PES UNIVERSITY,1,4.4,Placements: Almost 90% of students got placed ...
1,PES UNIVERSITY,2,4.6,Placements: Our college is the best for placem...
2,PES UNIVERSITY,3,3.8,Placements: Students are not recruited in our ...
3,PES UNIVERSITY,4,4.6,Placements: Almost 95% of students got placed ...
4,PES UNIVERSITY,5,4.0,"Placements: In the 2021 batch, the salary pack..."


### Splitting reviews into features

In [5]:

Placement ={}
Infrastructure={}
Faculty={}
Other = {}

for i in range(100):
    college = data.iloc[i,0]
    review = data.iloc[i,3]
    if college not in Placement:
        Placement[college]=list()
        Infrastructure[college]=list()
        Faculty[college]=list()
        Other[college]=list()
    rev_list = re.split('\n',review)
    Placement[college].append(rev_list[0][12:])
    Infrastructure[college].append(rev_list[2][15:])
    Faculty[college].append(rev_list[4][8:])
    if len(rev_list)==7:
        Other[college].append(rev_list[6][7:])

### Pre-processing

In [6]:
def read_article(text):
    
    sentences =[]
    
    sentences = sent_tokenize(text)
    for sentence in sentences:
        sentence.replace("[^a-zA-Z0-9]"," ")

    return sentences

In [7]:
def sentence_similarity(sent1,sent2,stopwords=None):
    if stopwords is None:
        stopwords = []
    
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    
    all_words = list(set(sent1 + sent2))
    
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    
    #build the vector for the first sentence
    for w in sent1:
        if not w in stopwords:
            vector1[all_words.index(w)]+=1
    
    #build the vector for the second sentence
    for w in sent2:
        if not w in stopwords:
            vector2[all_words.index(w)]+=1
            
    return 1-cosine_distance(vector1,vector2)

In [8]:
def build_similarity_matrix(sentences,stop_words):
    #create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences),len(sentences)))
    
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1!=idx2:
                similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],sentences[idx2],stop_words)
                
    return similarity_matrix

In [9]:
def generate_summary(text,top_n):
    

    summarize_text = []
    
    # Step1: read text and tokenize
    sentences = read_article(text)
    sentences= [*set(sentences)]
    # Steo2: generate similarity matrix across sentences
    sentence_similarity_matrix = build_similarity_matrix(sentences,stop_words)
    
    # Step3: Rank sentences in similarirty matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    
    #Step4: sort the rank and place top sentences
    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)),reverse=True)
    
    # Step 5: get the top n number of sentences based on rank    
    for i in range(top_n):
        summarize_text.append(ranked_sentences[i][1])
    
    # Step 6 : outpur the summarized version
    return "\n".join(summarize_text),len(sentences)

In [10]:
text = ' '.join(Placement["Reva University"])
print("Placements: "+generate_summary(text,3)[0])
text = ' '.join(Infrastructure["Reva University"])
print("Infrastructure: "+generate_summary(text,3)[0])
lol = ' '.join(Faculty["PES UNIVERSITY"])
print("Faculty: "+generate_summary(lol,3)[0])
text = ' '.join(Other["Reva University"])
print("Other: "+generate_summary(text,3)[0])

Placements: More than 90% of students get placed every year with good salary packages.
The salary package offered ranges between 3 LPA - 40 LPA.
Around 300 students out of 350 got placed in very good companies and rest all opted for higher studies.
Infrastructure: Infrastructure is good and well framed with respect to many centres providing ample of opportunities for the students to create a different vibes Hostel facility are also good providing students to outsource all the college facilities such as gymnastics sports practice libraries etc.
The library facility is very good in our college with a very wide range of books.
Hostels were not good but the food served was good and delicious.
Faculty: Hence their course content is up to date The exams are pretty difficult- they are based solely on the concepts taught in class.
The course curriculum is relevant and helpful to the students when it comes to placements.
Semester exams are quite difficult, and we do not have any internal choice

In [19]:
data.head()
len(data)

180

In [43]:
import spacy
from textstat.textstat import textstatistics
import math as m 

# Splits the text into sentences, using
# Spacy's sentence segmentation which can
# be found at https://spacy.io/usage/spacy-101
def break_sentences(text):
	nlp = spacy.load('en_core_web_sm')
	doc = nlp(text)
	return list(doc.sents)

# Returns Number of Words in the text
def word_count(text):
	sentences = break_sentences(text)
	words = 0
	for sentence in sentences:
		words += len([token for token in sentence])
	return words

# Returns the number of sentences in the text
def sentence_count(text):
	sentences = break_sentences(text)
	return len(sentences)

# Returns average sentence length
def avg_sentence_length(text):
	words = word_count(text)
	sentences = sentence_count(text)
	average_sentence_length = float(words / sentences)
	return average_sentence_length

# Textstat is a python package, to calculate statistics from
# text to determine readability,
# complexity and grade level of a particular corpus.
# Package can be found at https://pypi.python.org/pypi/textstat
def syllables_count(word):
	return textstatistics().syllable_count(word)

# Returns the average number of syllables per
# word in the text
def avg_syllables_per_word(text):
	syllable = syllables_count(text)
	words = word_count(text)
	ASPW = float(syllable) / float(words)
	return round(ASPW, 1)

# Return total Difficult Words in a text
def difficult_words(text):
	
	nlp = spacy.load('en_core_web_sm')
	doc = nlp(text)
	# Find all words in the text
	words = []
	sentences = break_sentences(text)
	for sentence in sentences:
		words += [str(token) for token in sentence]

	# difficult words are those with syllables >= 2
	# easy_word_set is provide by Textstat as
	# a list of common words
	diff_words_set = set()
	
	for word in words:
		syllable_count = syllables_count(word)
		if word not in nlp.Defaults.stop_words and syllable_count >= 2:
			diff_words_set.add(word)

	return len(diff_words_set)

# A word is polysyllablic if it has more than 3 syllables
# this functions returns the number of all such words
# present in the text
def poly_syllable_count(text):
	count = 0
	words = []
	sentences = break_sentences(text)
	for sentence in sentences:
		words += [token for token in sentence]
	

	for word in words:
		syllable_count = syllables_count(word)
		if syllable_count >= 3:
			count += 1
	return count


def flesch_reading_ease(text):
	"""
		Implements Flesch Formula:
		Reading Ease score = 206.835 - (1.015 × ASL) - (84.6 × ASW)
		Here,
		ASL = average sentence length (number of words
				divided by number of sentences)
		ASW = average word length in syllables (number of syllables
				divided by number of words)
	"""
	FRE = 206.835 - float(1.015 * avg_sentence_length(text)) -\
		float(84.6 * avg_syllables_per_word(text))
	return round(FRE, 2)


def gunning_fog(text):
	per_diff_words = (difficult_words(text) / word_count(text) * 100) + 5
	grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
	return grade


def smog_index(text):
	"""
		Implements SMOG Formula / Grading
		SMOG grading = 3 + ?polysyllable count.
		Here,
		polysyllable count = number of words of more
		than two syllables in a sample of 30 sentences.
	"""
	check = sentence_count(text)
	if  check>= 3:
		poly_syllab = poly_syllable_count(text)
		SMOG = (1.043 * (30*(poly_syllab / check))**0.5) \
				+ 3.1291
		return round(SMOG, 1)
	else:
		return 0


def dale_chall_readability_score(text):
	"""
		Implements Dale Challe Formula:
		Raw score = 0.1579*(PDW) + 0.0496*(ASL) + 3.6365
		Here,
			PDW = Percentage of difficult words.
			ASL = Average sentence length
	"""
	words = word_count(text)
	# Number of words not termed as difficult words
	y= difficult_words(text)
	c = words - y 
	if words > 0:

		# Percentage of words not on difficult word list

		per = float(c) / float(words) * 100
	
	# diff_words stores percentage of difficult words
	diff_words = 100 - per
	z = avg_sentence_length(text)
	raw_score = (0.1579 * diff_words) + \
				(0.0496 * z)
	
	# If Percentage of Difficult Words is greater than 5 %, then;
	# Adjusted Score = Raw Score + 3.6365,
	# otherwise Adjusted Score = Raw Score

	if diff_words > 5:	

		raw_score += 3.6365
		
	return round(raw_score, 2)


In [49]:
data['dale-chall']=np.zeros(len(data))
data['smog-index'] = np.zeros(len(data))
data["gunning_fog"]= np.zeros(len(data))
data["fre"] = np.zeros(len(data)) 

data.iloc[i,5]=smog_index(data.iloc[i,3])
data.iloc[i,6] = gunning_fog(data.iloc[i,3])
data.iloc[i,7]=flesch_reading_ease(data.iloc[i,3])
data.head()


TypeError: 'int' object is not callable

In [45]:
for i in range(len(data)):
    data.iloc[i,4]=dale_chall_readability_score(data.iloc[i,3])
data.head()


Unnamed: 0,Name,Review_id,Rating,Review,dale-chall
0,PES UNIVERSITY,1,4.4,Placements: Almost 90% of students got placed ...,8.04
1,PES UNIVERSITY,2,4.6,Placements: Our college is the best for placem...,0.0
2,PES UNIVERSITY,3,3.8,Placements: Students are not recruited in our ...,0.0
3,PES UNIVERSITY,4,4.6,Placements: Almost 95% of students got placed ...,0.0
4,PES UNIVERSITY,5,4.0,"Placements: In the 2021 batch, the salary pack...",0.0


In [46]:

data.head()

Unnamed: 0,Name,Review_id,Rating,Review,dale-chall
0,PES UNIVERSITY,1,4.4,Placements: Almost 90% of students got placed ...,8.04
1,PES UNIVERSITY,2,4.6,Placements: Our college is the best for placem...,0.0
2,PES UNIVERSITY,3,3.8,Placements: Students are not recruited in our ...,0.0
3,PES UNIVERSITY,4,4.6,Placements: Almost 95% of students got placed ...,0.0
4,PES UNIVERSITY,5,4.0,"Placements: In the 2021 batch, the salary pack...",0.0


In [None]:
//*[@id="__next"]/div[2]/section/div/div[1]/div/div[3]/div/div[6]/div[1]/div[2]/span
<span class="jsx-2425621939 text-md pointer text-capitalize py-1 px-2 text-dark-blue d-inline-block mb-3">Load more reviews</span>
<span class="jsx-2425621939 text-md pointer text-capitalize py-1 px-2 text-dark-blue d-inline-block mb-3">Load more reviews</span>