In [51]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd
from gensim.summarization.summarizer import summarize
import json
from datetime import datetime
import sys
import preprocessor as p
import re

## Functions for Preprocessing Tweets

In [52]:
# Select which tweet elements p.clean() should strip: URLs, emojis,
# hashtags and @mentions.
# Note (original author): this may take a while; any warning shown can be ignored.

p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.HASHTAG, p.OPT.MENTION)

def preprocess(str):
    """Clean a piece of tweet text.

    The text is first passed through ``p.clean`` (which removes whatever the
    module-level ``p.set_options`` call selected), then the characters
    '…', '‘' and '’' are deleted, and finally every run of code points that
    falls in the emoji ranges listed below is removed.

    Parameters
    ----------
    str : the raw text to clean. The name shadows the builtin ``str`` inside
        this function only.

    Returns
    -------
    The cleaned text.
    """
    # Clean URLs, emojis, hashtags and mentions
    cleaned = p.clean(str)

    # Drop ellipsis and curly single quotes that survive the library cleaning
    for mark in ('…', '‘', '’'):
        cleaned = cleaned.replace(mark, '')

    # Strip emoji that are still present after p.clean
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    emoji_pattern = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)

    return emoji_pattern.sub(r'', cleaned)

In [53]:
# Search keyword being summarized; it is embedded in the input and output file names below.
keyword = "imran khan"

In [54]:
##### Load the tweets that already carry a topic assignment

# Build the input path from the keyword, read it, and renumber the rows 0..n-1.
topic_tweets_path = 'model/topic-tweets_' + keyword + '_10_topics.json'

df = pd.read_json(topic_tweets_path).reset_index(drop=True)

In [56]:
##### Extract news tweets from each topic

# Topic ids that actually occur in the data.
unique = set(df['Topic'])

no_of_topics = 10

# One dataframe per topic that is present, appended in ascending topic order.
df_topic = []

for i in range(0, no_of_topics):

    # Skip topic ids that no tweet was assigned to.
    if i not in unique:
        continue

    topic_rows = df.loc[df['Topic'] == i]

    # Prefer tweets labelled "News". The mask is built from the topic's own
    # rows so it does not depend on index alignment with the parent frame.
    selected = topic_rows.loc[topic_rows['Type'] == "News"]

    # With fewer than three news tweets, fall back to the five most
    # retweeted tweets of the topic instead.
    if len(selected) < 3:
        selected = topic_rows.nlargest(5, ['Retweet_count'])

    # Keep the original row label in an 'index' column and renumber 0..n-1
    # so later cells can address rows by position.
    selected = selected.reset_index()

    df_topic.append(selected)

In [57]:
# Inspect the tweets selected for the first topic that is present in the data
df_topic[0]

Unnamed: 0,index,Created_time,Description,Favorite_count,Retweet_count,Sentiment,Topic,Twitter_handle,Type,URL,User_name
0,709,2019-12-14 23:18:20,@Umair0094 That's exactly what they wanted. He...,10,1,#8caa0b,3,mariamsmadness,News,twitter.com/i/web/status/1205990436445265921,Mariam's Madness
1,2283,2020-03-02 23:36:01,Saudi Arabia's Prince Khalid bin Salman meets ...,1,0,#b3b3b3,3,pakistaninews,News,twitter.com/i/web/status/1234623525220233216,Pakistan News
2,2336,2020-03-03 15:33:56,Prime Minister of Pakistan Imran Khan speech a...,0,1,#8caa0b,3,InsafPK,News,twitter.com/i/web/status/1234864593069117440,Tehreek-e-Insaf
3,2433,2020-03-04 18:26:22,Chairman Peshawar Zalmi Javed Afridi with capt...,0,8,#b3b3b3,3,TeamJaved,News,twitter.com/i/web/status/1235270376097280001,Javed Afridi FanClub
4,2437,2020-03-04 18:25:52,PM Imran Khan met Zalmi chairman @JAfridi10 Ca...,0,4,#b3b3b3,3,TeamJaved,News,twitter.com/i/web/status/1235270249035030530,Javed Afridi FanClub
5,2441,2020-03-04 18:24:59,Aaj News:\n\nMeeting of Javed Afridi and Darre...,0,0,#b3b3b3,3,TeamJaved,News,twitter.com/i/web/status/1235270027139518464,Javed Afridi FanClub


In [None]:
##### Generate summaries of each topic

# One cleaned summary string per entry of df_topic, in the same order.
summaries = []

for topic_df in df_topic:

    # Concatenate every tweet of the topic into one text, terminating each
    # tweet with '. ' so it ends a sentence for the summarizer.
    topic = "".join(tweet + '. ' for tweet in topic_df['Description'])

    # Summarize using gensim.summarize; split=True yields a list of sentences.
    if len(topic_df) < 2:
        # A single tweet is its own summary. Wrap it in a list so the
        # de-duplication below works on sentences, not on characters.
        sentences = [topic]
    else:
        # Keep a larger share of the text when there are only two tweets.
        ratio = 0.75 if len(topic_df) < 3 else 0.25
        try:
            sentences = summarize(text=topic, ratio=ratio, split=True)
        except ValueError:
            # gensim raises ValueError when it finds only one sentence in
            # the text; fall through to the full-text fallback below.
            sentences = []

    # Filter out duplicate sentences, keeping first-occurrence order
    filtered = list(dict.fromkeys(sentences))

    # Separate sentences with a space so they do not run into each other.
    ss = " ".join(filtered)

    # Nothing selected by the summarizer: use the whole topic text instead.
    if not ss:
        ss = topic

    # Preprocess text
    ss = preprocess(ss)

    summaries.append(ss)

In [61]:
##### Get overall sentiment & time of summary

# Parallel lists, one entry per topic that is present in the data.
sentiment = []
dates = []

# df_topic only holds topics found in `unique`, appended in ascending topic
# order, so enumerate the present topic ids to walk both in lock-step.
present_topics = [t for t in range(0, no_of_topics) if t in unique]

for x, j in enumerate(present_topics):

    # Sentiment labels of every tweet assigned to this topic
    sent = df.loc[df['Topic'] == j, 'Sentiment'].tolist()

    # Most frequent label; ties resolve to the label encountered first,
    # the same as max(sent, key=sent.count) but in a single pass.
    m = Counter(sent).most_common(1)[0][0]
    sentiment.append(m)

    # Date of the first selected tweet of the topic, as 'YYYY-MM-DD'.
    # pd.to_datetime accepts both Timestamp objects and date strings.
    d = df_topic[x]['Created_time'][0]
    dates.append(str(pd.to_datetime(d).date()))

In [62]:
##### Assemble the per-topic lists into a dataframe

# reset_index() adds an 'index' column holding each row's position (0..n-1).
summary_columns = {"Summary": summaries, "Sentiment": sentiment, "Date": dates}
summary_df = pd.DataFrame(summary_columns).reset_index()

In [63]:
##### Expose the row position as the 'topic' column of each summary

# NOTE(review): 'index' is the row position in summary_df, not the original
# Topic id; the two differ when some topic ids are absent from the data.
summary_df = summary_df.rename(columns={'index': 'topic'})

In [64]:
##### Save summaries to json format

# One dict per summary row
summary_json = summary_df.to_dict('records')

# Use a forward slash like the input path does: it is accepted on Windows
# too, whereas a backslash is treated as part of the file name on POSIX.
file = 'model/summary_' + keyword + '_10_topics.json'
with open(file, 'w') as outfile:
    json.dump(summary_json, outfile)