In [1]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd
from gensim.summarization.summarizer import summarize

In [2]:
# Load the tweets that already carry a topic assignment (the 'Topic' column)

df = pd.read_json(path_or_buf='model/topic-tweets_imran khan_15_topics.json')
df.head()

Unnamed: 0,Created_time,URL,User_name,Twitter_handle,Description,Retweet_count,Favorite_count,Sentiment,Topic
0,2019-11-30 23:52:15,twitter.com/i/web/status/1200925541228064770,kami Khan,saleemkamikhan,PM Imran Khan lists down five strategies to co...,14,0,#ff0000,13
1,2019-11-30 23:52:03,twitter.com/i/web/status/1200925489742974976,Irfan Ullah,IrfanUK11,@GulBukhari I just dont understand country was...,0,1,#8caa0b,10
10,2019-11-30 23:36:37,twitter.com/i/web/status/1200921604370374661,mazloomkashur,hurtkashmiri,#Kashmir already has settlements of million ar...,0,0,#8caa0b,6
100,2019-12-01 23:43:00,twitter.com/i/web/status/1201285598658600961,News18,CNNnews18,"""The old system can no longer work in Naya Pak...",2,10,#ff0000,9
1000,2019-12-12 14:06:49,twitter.com/i/web/status/1205126865813458944,حیدر بلوچ,H3iD4r,Prime Minister Imran Khan took notice of the d...,0,0,#ff0000,11


In [3]:
# The loaded frame's row labels are not consecutive (the preview above shows
# 0, 1, 10, 100, 1000), so replace them with 0..len(df)-1. Later cells look
# rows up with df[column][i] for i in range(len(df)), which needs these labels.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Created_time,URL,User_name,Twitter_handle,Description,Retweet_count,Favorite_count,Sentiment,Topic
0,2019-11-30 23:52:15,twitter.com/i/web/status/1200925541228064770,kami Khan,saleemkamikhan,PM Imran Khan lists down five strategies to co...,14,0,#ff0000,13
1,2019-11-30 23:52:03,twitter.com/i/web/status/1200925489742974976,Irfan Ullah,IrfanUK11,@GulBukhari I just dont understand country was...,0,1,#8caa0b,10
2,2019-11-30 23:36:37,twitter.com/i/web/status/1200921604370374661,mazloomkashur,hurtkashmiri,#Kashmir already has settlements of million ar...,0,0,#8caa0b,6
3,2019-12-01 23:43:00,twitter.com/i/web/status/1201285598658600961,News18,CNNnews18,"""The old system can no longer work in Naya Pak...",2,10,#ff0000,9
4,2019-12-12 14:06:49,twitter.com/i/web/status/1205126865813458944,حیدر بلوچ,H3iD4r,Prime Minister Imran Khan took notice of the d...,0,0,#ff0000,11


In [4]:
# Generate summaries of each topic

no_of_topics = 15
summaries = []

# Cast the topic ids once, instead of once per row for every topic.
topic_ids = df['Topic'].astype(int)

for j in range(0, no_of_topics):

    # Extract the tweets of topic j and concatenate them into one text,
    # terminating each tweet with '. ' so that consecutive tweets are
    # separated from each other.
    topic = "".join(tweet + '. ' for tweet in df.loc[topic_ids == j, 'Description'])

    # Summarize using gensim.summarize; split=True asks for a list of
    # sentences rather than a single string.
    summary = summarize(text=topic, split=True)

    # Filter out duplicate sentences, keeping first-occurrence order
    filtered = list(dict.fromkeys(summary))

    # Keep at most the first five sentences. Slicing (rather than indexing
    # positions 0..4) avoids an IndexError when fewer than five unique
    # sentences are available, and joining with a space keeps the last word
    # of one sentence from fusing with the first word of the next.
    ss = " ".join(filtered[:5])

    summaries.append(ss)

In [5]:
# Wrap the list of summaries in a single-column dataframe

summary_df = pd.DataFrame({"Summary": summaries})
summary_df.head()

Unnamed: 0,Summary
0,major relief to Pakistan's ailing former Prime...
1,Like Shabaz did under Nawaz's regimes campaig...
2,Imran Khan is more concern about Indian Muslim...
3,MusharrafImran Khan tolds the story of Army Ch...
4,Pakistan prime minister Imran Khan claims tree...


In [6]:
# Move the row index into a regular column named 'index'. Summaries were
# appended in topic order, so each row's position is its topic number; the
# next cell renames this column to 'topic'.
summary_df = summary_df.reset_index()

In [7]:
# Label the former index column as the topic number of each summary

summary_df = summary_df.rename(columns={'index': 'topic'})
summary_df.head()

Unnamed: 0,topic,Summary
0,0,major relief to Pakistan's ailing former Prime...
1,1,Like Shabaz did under Nawaz's regimes campaig...
2,2,Imran Khan is more concern about Indian Muslim...
3,3,MusharrafImran Khan tolds the story of Army Ch...
4,4,Pakistan prime minister Imran Khan claims tree...


In [8]:
# Persist the per-topic summaries as JSON

summary_df.to_json(path_or_buf='model/summary_imran khan_15_topics.json')

In [9]:
# Method to find Cosine Similarity

def get_cosine_sim(*strs):
    """Return the pairwise cosine-similarity matrix of the given strings.

    Each string is turned into a vector by get_vectors(); entry [a][b] of the
    result is the cosine similarity between the a-th and b-th input strings.
    """
    row_vectors = list(get_vectors(*strs))
    return cosine_similarity(row_vectors)
    
def get_vectors(*strs):
    """Return token-count vectors for the given strings.

    The vocabulary is learned from the given strings themselves, so every
    returned row has one entry per distinct token found across all of them.

    Parameters:
        *strs: the text documents to vectorize.

    Returns:
        A dense 2-D array with one row per input string, in input order.
    """
    text = list(strs)
    # The corpus must not be handed to the constructor: CountVectorizer's
    # first parameter is `input` (a mode such as 'content'), not the documents.
    # The documents are supplied to fit_transform instead.
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(text).toarray()

In [None]:
sum_cosine = 0

for j in range(no_of_topics):

    # Rebuild the full text of topic j: every tweet of that topic, each
    # followed by '. '.
    topic_tweets = [
        df['Description'][i] + '. '
        for i in range(len(df))
        if df['Topic'][i].astype(int) == j
    ]
    topic = "".join(topic_tweets)

    # Entry [0][1] of the similarity matrix compares the topic text (first
    # argument) with the topic's summary (second argument).
    similarity_matrix = get_cosine_sim(topic, summary_df['Summary'][j])
    ratio = similarity_matrix[0][1]
    sum_cosine += ratio
    
print("Average cosine similarity: ", sum_cosine