In [1]:
import pandas as pd
import json
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import spacy
nlp = spacy.load("en_core_web_sm")
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [2]:
# Load the two clustering result files. Later cells read the Timestamp,
# Cluster and Event columns from both frames.
df_timewindow05_1 = pd.read_csv('evaluation_1_dbscan.csv',header='infer',engine='python')
df_timewindow05_2 = pd.read_csv('evaluation_1_dbscan_2.csv',header='infer',engine='python')

In [3]:
# Row counts of the two loaded frames (first run, then second run).
print(len(df_timewindow05_1))
print(len(df_timewindow05_2))

26239
26245


In [4]:
# Distinct Timestamp values of each frame as plain Python lists; these are the
# windows the per-timestamp metrics are computed for.
list_timestamp05_1 = df_timewindow05_1.Timestamp.unique().tolist()
list_timestamp05_2 = df_timewindow05_2.Timestamp.unique().tolist()

In [5]:
def _labels_for_timestamp(timestamp, df):
    """Return ``(labels_true, labels_pred)`` for the rows of one time window.

    ``labels_true`` is the ``Event`` column and ``labels_pred`` the ``Cluster``
    column of the rows of ``df`` whose ``Timestamp`` equals ``timestamp``,
    both converted to plain lists. Shared by every ``get_*_score`` function
    below so the row selection is written only once.
    """
    window = df.loc[df.Timestamp == timestamp]
    return window.Event.tolist(), window.Cluster.tolist()


def get_adjusted_rand_score(timestamp,df):
    """Return ``metrics.adjusted_rand_score`` of Event vs. Cluster for one timestamp."""
    labels_true, labels_pred = _labels_for_timestamp(timestamp, df)
    return metrics.adjusted_rand_score(labels_true, labels_pred)


def get_adjusted_mutual_info_score(timestamp,df):
    """Return ``metrics.adjusted_mutual_info_score`` of Event vs. Cluster for one timestamp."""
    labels_true, labels_pred = _labels_for_timestamp(timestamp, df)
    return metrics.adjusted_mutual_info_score(labels_true, labels_pred)


def get_homogeneity_score(timestamp,df):
    """Return ``metrics.homogeneity_score`` of Event vs. Cluster for one timestamp."""
    labels_true, labels_pred = _labels_for_timestamp(timestamp, df)
    return metrics.homogeneity_score(labels_true, labels_pred)


def get_completeness_score(timestamp,df):
    """Return ``metrics.completeness_score`` of Event vs. Cluster for one timestamp."""
    labels_true, labels_pred = _labels_for_timestamp(timestamp, df)
    return metrics.completeness_score(labels_true, labels_pred)


def get_v_measure_score(timestamp,df):
    """Return ``metrics.v_measure_score`` of Event vs. Cluster for one timestamp."""
    labels_true, labels_pred = _labels_for_timestamp(timestamp, df)
    return metrics.v_measure_score(labels_true, labels_pred)


def get_fowlkes_mallows_score(timestamp,df):
    """Return ``metrics.fowlkes_mallows_score`` of Event vs. Cluster for one timestamp."""
    labels_true, labels_pred = _labels_for_timestamp(timestamp, df)
    return metrics.fowlkes_mallows_score(labels_true, labels_pred)

In [6]:
def getAllMetrics(df,list_timestamp):
    """Build one row of clustering scores per timestamp.

    Parameters
    ----------
    df : DataFrame with ``Timestamp``, ``Event`` and ``Cluster`` columns.
    list_timestamp : iterable of timestamp values to evaluate, in output order.

    Returns
    -------
    DataFrame with a ``Timestamp`` column, one column per score (each rounded
    to one decimal place) and ``Number_Of_Clusters``, the number of groups
    obtained when that timestamp's rows are grouped by ``Cluster``.
    """
    # Column name -> scorer, in the order the columns appear in the result.
    scorers = (
        ('adjusted_rand_score', get_adjusted_rand_score),
        ('adjusted_mutual_info_score', get_adjusted_mutual_info_score),
        ('homogeneity_score', get_homogeneity_score),
        ('completeness_score', get_completeness_score),
        ('v_measure_score', get_v_measure_score),
        ('fowlkes_mallows_score', get_fowlkes_mallows_score),
    )

    df_evaluation = pd.DataFrame()
    df_evaluation['Timestamp'] = np.array(list(list_timestamp))
    for column, scorer in scorers:
        rounded = [round(scorer(ts, df), 1) for ts in list_timestamp]
        df_evaluation[column] = np.array(rounded)

    cluster_counts = []
    for ts in list_timestamp:
        window = df.loc[df['Timestamp'] == ts]
        cluster_counts.append(window.groupby(['Cluster']).count().shape[0])
    df_evaluation['Number_Of_Clusters'] = np.array(cluster_counts)
    return df_evaluation

In [7]:
# Per-timestamp score tables for the two runs.
# NOTE(review): at this point Event still holds the raw CSV text (it is parsed
# with ast.literal_eval in a later cell), so each distinct string such as
# "[394]" is presumably treated as one label — confirm that is intended.
df_evaluation_05_1 = getAllMetrics(df_timewindow05_1,list_timestamp05_1)
df_evaluation_05_2= getAllMetrics(df_timewindow05_2,list_timestamp05_2)

In [8]:
# Distinct Cluster ids found in each run.
# NOTE(review): the _4/_5 suffixes do not follow the _1/_2 naming used for the
# frames they come from (list_predict_4 <- run 1, list_predict_5 <- run 2).
list_predict_4 = df_timewindow05_1.Cluster.unique().tolist()
list_predict_5 = df_timewindow05_2.Cluster.unique().tolist()

In [9]:
# Display every distinct cluster id of the second run.
list_predict_5

[1,
 0,
 2,
 4,
 3,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [10]:
def getLabel(list_event):
    """Return the most frequent value of ``list_event``, or -1 on failure.

    Parameters
    ----------
    list_event : pandas Series (anything exposing ``value_counts()``).

    Returns
    -------
    The first index entry of ``list_event.value_counts()``, i.e. the value with
    the highest count. ``-1`` is returned when that cannot be computed, for
    example for an empty Series or an object without ``value_counts``.
    """
    try:
        return list_event.value_counts().index[0]
    except Exception:
        # Best-effort fallback kept from the original, but narrowed from a bare
        # ``except:`` so KeyboardInterrupt / SystemExit are no longer swallowed.
        return -1

In [11]:
def findPurity(temp_df):
    """Return the share of rows in ``temp_df`` that carry its dominant Event.

    Parameters
    ----------
    temp_df : DataFrame with an ``Event`` column.

    Returns
    -------
    ``rows whose Event equals getLabel(temp_df['Event']) / all rows`` as a
    float, or ``-1`` when it cannot be computed (e.g. an empty frame, where
    the division by zero is caught).
    """
    try:
        dominant = getLabel(temp_df['Event'])
        matching = temp_df.loc[temp_df['Event']==dominant].shape[0]
        total = temp_df.shape[0]
        return matching/total
    except Exception:
        # Best-effort fallback kept from the original, but narrowed from a bare
        # ``except:`` so KeyboardInterrupt / SystemExit are no longer swallowed.
        return -1

In [12]:
def getDataframePurity(df,list_predict):
    """Summarise every cluster by its dominant event and purity.

    Parameters
    ----------
    df : DataFrame with ``Cluster`` and ``Event`` columns.
    list_predict : iterable of cluster ids; one output row per id, same order.

    Returns
    -------
    DataFrame with columns ``Cluster``, ``Predicted_Event`` (``getLabel`` of
    the cluster's events, cast to ``int``) and ``Purity`` (``findPurity`` of
    the cluster's rows, rounded to one decimal place).
    """
    predicted_events = []
    purities = []
    for cluster in list_predict:
        members = df.loc[df['Cluster']==cluster]
        predicted_events.append(int(getLabel(members['Event'])))
        purities.append(round(findPurity(members), 1))

    df_purity = pd.DataFrame()
    df_purity['Cluster'] = np.array(list(list_predict))
    df_purity['Predicted_Event'] = np.array(predicted_events)
    df_purity['Purity'] = np.array(purities)
    return df_purity

In [13]:
import ast


def _parse_events(raw):
    """Turn the CSV text of one Event cell (e.g. "[394]") into a Python object.

    Values that are already lists are returned unchanged so this cell can be
    re-run without ``ast.literal_eval`` failing on its own earlier output.
    """
    return raw if isinstance(raw, list) else ast.literal_eval(raw)


# Series.map keeps one Python list per row. Wrapping the parsed lists in
# np.array() is fragile: lists of different lengths form a ragged array (a
# ValueError on NumPy >= 1.24), and lists of equal length silently become a
# 2-D array instead of a column of lists.
df_timewindow05_1['Event'] = df_timewindow05_1['Event'].map(_parse_events)
df_timewindow05_2['Event'] = df_timewindow05_2['Event'].map(_parse_events)

In [14]:
# Spot-check: every row of the second run that was assigned to cluster 10.
df_timewindow05_2.loc[df_timewindow05_2['Cluster']==10]

Unnamed: 0,tweets,Timestamp,Cluster,Event
168,"Why everybody keep saying Bet Awards ! Its "" H...",20,10,[394]
169,Watchin BET HIP~Hop Adwards,20,10,[394]
170,Hiphop awards #bet,20,10,[394]
171,β€�@iamCuDii: Cyphers are the only reason i wa...,20,10,[-1]
172,Damn I forgot the HipHop Awards was coming on.,20,10,[394]
173,Who got a link for the BET awards??,20,10,[394]
174,i forgot bout the damn awards fuck missed 30mi...,20,10,[-1]
175,Bet awards a mad thing,20,10,[394]
176,Almost forgot about the hiphop awards,20,10,[394]
177,Ian even see the beginning of the hiphop award...,20,10,[394]


In [15]:
# Display the first run's frame (Event now holds parsed lists).
df_timewindow05_1

Unnamed: 0,tweets,Timestamp,Cluster,Event
0,Mike Epps is silly..,4,1,[-1]
1,mike epps though lol,4,1,[-1]
2,Mike epps a fool! lol,4,1,[-1]
3,Mike Epps is hilarious!!!!,4,1,[-1]
4,MiKE EPPS A FOOL!!!!,4,1,[-1]
...,...,...,...,...
26234,If Romney wins... Stfu none of you will do any...,39939,2168,[52]
26235,I hope Romney wins,39939,2168,[52]
26236,"If Romney wins , we are legit FUCKED !",39939,2168,[52]
26237,I hope this Romney fella doesn't win American ...,39939,2168,[52]


In [16]:
# sum_count = number of entries in each row's Event list.
df_timewindow05_1['sum_count'] = np.array(list(map(len, df_timewindow05_1['Event'])))
df_timewindow05_2['sum_count'] = np.array(list(map(len, df_timewindow05_2['Event'])))

In [17]:
# Flatten the per-row Event lists of the first run into one list, leaving out
# the label -1 (presumably "no event" — confirm against the data source).
temp_list = [
    event
    for list_events in df_timewindow05_1['Event']
    for event in list_events
    if event != -1
]

In [18]:
# Number of event labels collected from the first run (label -1 excluded).
len(temp_list)

25435

In [19]:
# Flatten the per-row Event lists of the SECOND run, leaving out the label -1.
# The original looped over df_timewindow05_1 here (copy-paste slip), which made
# temp_list_2 an exact duplicate of temp_list; counts derived from it
# (e.g. len(final_list_2)) may change once the cell is re-run.
temp_list_2 = [
    event
    for list_events in df_timewindow05_2['Event']
    for event in list_events
    if event != -1
]

In [20]:
def unique(list1):
    """Return the distinct elements of ``list1`` in first-seen order.

    Parameters
    ----------
    list1 : iterable of values.

    Returns
    -------
    A new list holding each distinct element once, ordered by first occurrence.
    """
    items = list(list1)
    try:
        # dict keys keep insertion order, which gives an O(n) de-duplication
        # instead of rescanning the result list for every element.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable elements (e.g. lists) cannot be dict keys; fall back to
        # the equality-based scan so such inputs keep working.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list
# Distinct event ids collected in temp_list, in first-seen order.
final_list = unique(temp_list)

In [21]:
# Distinct event ids collected in temp_list_2, in first-seen order.
final_list_2 = unique(temp_list_2)

In [22]:
# Number of distinct event ids in final_list.
len(final_list)

336

In [39]:
# Same check as the previous cell, shown again next to final_list_2's count.
len(final_list)

336

In [40]:
# Number of distinct event ids in final_list_2.
len(final_list_2)

336

In [24]:
# Load the tweet table and parse the textual Event column into Python lists.
df = pd.read_csv('clean_relevant_tweets.csv',header='infer',engine='python')
# Series.map keeps one list per row. Wrapping the parsed lists in np.array() is
# fragile: lists of different lengths form a ragged array (a ValueError on
# NumPy >= 1.24), and lists of equal length silently become a 2-D array.
df['Event'] = df['Event'].map(ast.literal_eval)

In [25]:
# Flatten the per-row Event lists of the full tweet table into one list,
# leaving out the label -1.
df_list = [
    event
    for list_events in df['Event']
    for event in list_events
    if event != -1
]

In [26]:
# Distinct event ids present in the full tweet table, and how many there are.
final_df_list = unique(df_list)
len(final_df_list)

458

In [27]:
# Keep only the event ids of the full table that do not appear in final_list.
# Filtering against a set (instead of calling list.remove once per event)
# cannot raise ValueError for an id missing from final_df_list, and running the
# cell a second time leaves the result unchanged instead of failing.
_clustered_events = set(final_list)
final_df_list = [event for event in final_df_list if event not in _clustered_events]

In [72]:
# Event ids that remain after the ids in final_list were taken out.
final_df_list

[352,
 400,
 73,
 74,
 82,
 86,
 93,
 97,
 111,
 118,
 119,
 120,
 128,
 132,
 134,
 142,
 149,
 150,
 161,
 174,
 260,
 189,
 205,
 197,
 491,
 207,
 209,
 210,
 214,
 215,
 395,
 225,
 227,
 228,
 229,
 233,
 305,
 242,
 244,
 431,
 247,
 248,
 251,
 252,
 253,
 257,
 258,
 485,
 267,
 269,
 272,
 274,
 277,
 477,
 285,
 291,
 292,
 294,
 296,
 297,
 306,
 304,
 308,
 310,
 318,
 324,
 325,
 331,
 333,
 335,
 334,
 336,
 338,
 339,
 340,
 346,
 351,
 356,
 359,
 361,
 362,
 368,
 370,
 369,
 376,
 375,
 374,
 373,
 382,
 383,
 384,
 396,
 398,
 399,
 409,
 411,
 414,
 416,
 417,
 423,
 428,
 474,
 433,
 432,
 442,
 443,
 448,
 452,
 455,
 457,
 458,
 460,
 461,
 465,
 466,
 473,
 489,
 494,
 496,
 497,
 499,
 503]

In [53]:
# Empty frame with the same columns as df.
df2 = pd.DataFrame(columns=df.columns)

In [54]:
# Expand df to one row per (tweet, event): a tweet whose Event list holds k ids
# yields k rows, each with a single id in Event. Tweets with an empty list
# yield no rows.
count=10000
list_rows = []
for index,row in df.iterrows():
    # Progress marker, printed each time the row label passes the next 10000 step.
    if(count<index):
        print(count)
        count+=10000
    list_event = row['Event']
    for event in list_event:
        # Copy before assigning: iterrows() hands out ONE Series per source row,
        # so writing into it and appending that same object repeatedly would
        # leave every duplicate of a multi-event tweet with the last event.
        event_row = row.copy()
        event_row['Event'] = event
        list_rows.append(event_row)
# DataFrame.append was removed in pandas 2.0; building the frame in one call
# keeps the source row labels as index and the column order of df.
df2 = pd.DataFrame(list_rows, columns=df.columns)

10000
20000
30000
40000
50000
60000


In [55]:
# timestamp_graphs is not defined in this notebook (presumably a project-local module).
import timestamp_graphs as tg
# Renumber the rows 0..n-1 and discard the previous index.
df2 = df2.reset_index(drop=True)
# NOTE(review): createTimestamps presumably adds the date/time-derived columns
# listed by the next cell (e.g. DateHour, Date_Ten_Minutes) — confirm in
# timestamp_graphs.
df2 = tg.createTimestamps(df2)

In [61]:
# Column names of df2 after the timestamp step.
df2.columns

Index(['tweets', 'created_at', 'id', 'rt', 'lang', 'urls', 'Event',
       'count_event', 'date', 'time', 'Datetime', 'DateHour',
       'Date_Ten_Minutes'],
      dtype='object')

In [74]:
# Rows with Event == 2, counted per DateHour bucket (count() reports the
# non-null count of every remaining column, hence the repeated numbers).
df2.loc[df2['Event']==2].groupby(['DateHour']).count()

Unnamed: 0_level_0,tweets,created_at,id,rt,lang,urls,Event,count_event,date,time,Datetime,Date_Ten_Minutes
DateHour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2012-10-11 17:00:00,5,5,5,5,5,5,5,5,5,5,5,5
2012-10-12 06:00:00,1,1,1,1,1,1,1,1,1,1,1,1
2012-10-12 07:00:00,9,9,9,9,9,9,9,9,9,9,9,9
2012-10-12 08:00:00,25,25,25,25,25,25,25,25,25,25,25,25
2012-10-12 09:00:00,325,325,325,325,325,325,325,325,325,325,325,325
2012-10-12 10:00:00,25,25,25,25,25,25,25,25,25,25,25,25
2012-10-12 11:00:00,20,20,20,20,20,20,20,20,20,20,20,20
2012-10-12 12:00:00,19,19,19,19,19,19,19,19,19,19,19,19
2012-10-12 13:00:00,12,12,12,12,12,12,12,12,12,12,12,12
2012-10-12 14:00:00,9,9,9,9,9,9,9,9,9,9,9,9
