In [23]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import sent_tokenize
import numpy as np
import networkx as nx

nltk.download('stopwords')
nltk.download('punkt')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to C:\Users\Sanchana
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Sanchana
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0,Name,Review_id,Rating,Review
0,PES UNIVERSITY,1,4.4,Placements: Almost 90% of students got placed ...
1,PES UNIVERSITY,2,4.6,Placements: Our college is the best for placem...
2,PES UNIVERSITY,3,3.8,Placements: Students are not recruited in our ...
3,PES UNIVERSITY,4,4.6,Placements: Almost 95% of students got placed ...
4,PES UNIVERSITY,5,4.0,"Placements: In the 2021 batch, the salary pack..."


### Splitting reviews into features

In [43]:

Placement ={}
Infrastructure={}
Faculty={}
Other = {}

for i in range(100):
    college = data.iloc[i,0]
    review = data.iloc[i,3]
    if college not in Placement:
        Placement[college]=list()
        Infrastructure[college]=list()
        Faculty[college]=list()
        Other[college]=list()
    rev_list = re.split('\n',review)
    Placement[college].append(rev_list[0][12:])
    Infrastructure[college].append(rev_list[2][15:])
    Faculty[college].append(rev_list[4][8:])
    if len(rev_list)==7:
        Other[college].append(rev_list[6][7:])

### Pre-processing

In [18]:
def read_article(text):
    
    sentences =[]
    
    sentences = sent_tokenize(text)
    for sentence in sentences:
        sentence.replace("[^a-zA-Z0-9]"," ")

    return sentences

In [19]:
def sentence_similarity(sent1,sent2,stopwords=None):
    if stopwords is None:
        stopwords = []
    
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    
    all_words = list(set(sent1 + sent2))
    
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    
    #build the vector for the first sentence
    for w in sent1:
        if not w in stopwords:
            vector1[all_words.index(w)]+=1
    
    #build the vector for the second sentence
    for w in sent2:
        if not w in stopwords:
            vector2[all_words.index(w)]+=1
            
    return 1-cosine_distance(vector1,vector2)

In [20]:
def build_similarity_matrix(sentences,stop_words):
    #create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences),len(sentences)))
    
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1!=idx2:
                similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],sentences[idx2],stop_words)
                
    return similarity_matrix

In [38]:
def generate_summary(text,top_n):
    

    summarize_text = []
    
    # Step1: read text and tokenize
    sentences = read_article(text)
    sentences= [*set(sentences)]
    # Steo2: generate similarity matrix across sentences
    sentence_similarity_matrix = build_similarity_matrix(sentences,stop_words)
    
    # Step3: Rank sentences in similarirty matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    
    #Step4: sort the rank and place top sentences
    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)),reverse=True)
    
    # Step 5: get the top n number of sentences based on rank    
    for i in range(top_n):
        summarize_text.append(ranked_sentences[i][1])
    
    # Step 6 : outpur the summarized version
    return "\n".join(summarize_text),len(sentences)

In [47]:
text = ' '.join(Placement["Reva University"])
print("Placements: "+generate_summary(text,3)[0])
text = ' '.join(Infrastructure["Reva University"])
print("Infrastructure: "+generate_summary(text,3)[0])
lol = ' '.join(Faculty["PES UNIVERSITY"])
print("Faculty: "+generate_summary(lol,3)[0])
text = ' '.join(Other["Reva University"])
print("Other: "+generate_summary(text,3)[0])

Placements: More than 90% of students get placed every year with good salary packages.
The salary package offered ranges between 3 LPA - 40 LPA.
Around 300 students out of 350 got placed in very good companies and rest all opted for higher studies.
Infrastructure: Infrastructure is good and well framed with respect to many centres providing ample of opportunities for the students to create a different vibes Hostel facility are also good providing students to outsource all the college facilities such as gymnastics sports practice libraries etc.
The library facility is very good in our college with a very wide range of books.
Hostels were not good but the food served was good and delicious.
Faculty: Hence their course content is up to date The exams are pretty difficult- they are based solely on the concepts taught in class.
The course curriculum is relevant and helpful to the students when it comes to placements.
Semester exams are quite difficult, and we do not have any internal choice