In [1]:
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from itertools import product
from itertools import permutations
import igraph as ig
%matplotlib inline

In [2]:
#Extract Hashtags List
# Each line of the timeline file is split on single spaces: the first token is
# taken as the hashtag and the number of tokens on the line is recorded as its
# size (this count includes the hashtag token itself).
ht=[]
l=[]
n=0  # not used in this cell; `n` is reassigned in the parameters cell further down
with open('../../../virality2013/timeline_tag.anony.dat',"r") as f:
    try:
        for line in f:
            tokens = line.split(" ")  # split once, reuse for both columns
            ht.append(tokens[0])
            l.append(len(tokens))
    except (IOError, UnicodeError) as exc:
        # Stay best-effort (keep what was read so far) but do not hide the problem.
        print("Warning: stopped reading the timeline file early: %s" % exc)

ht_df = pd.DataFrame({'hashtag': ht, 'count_adopters': l},
                     columns=['hashtag', 'count_adopters'])

ht_df = ht_df[ht_df['hashtag'].str.len() > 3] #filter meaningless hashtags
ht_df = ht_df[ht_df['count_adopters'] > 4] #filter meaningless hashtags
# DataFrame.sort(columns=...) was removed from pandas; sort_values is the
# replacement. Returning a new frame also avoids sorting a filtered slice in place.
ht_df = ht_df.sort_values(by='count_adopters', ascending=False).reset_index(drop=True)

print("Max len for a row : %s" %max(l))
print("Nb of hashtags : %s" %(len(ht)))

Max len for a row : 363519
Nb of hashtags : 1345913




In [3]:
#Functions and preparation of the data sets and the graph
def extract_from_hash(hashtag,file_path,extended=True):
    """Read the timeline line(s) of ``hashtag`` from ``file_path`` into a DataFrame.

    Every kept line is split on single spaces and becomes one row, so column 0
    holds the first token of the line and the following columns hold the
    remaining tokens.

    Parameters
    ----------
    hashtag : str
        Hashtag to look for.
    file_path : str
        Path of the space-separated timeline file.
    extended : bool, default True
        If True, keep every line that contains ``hashtag`` as a substring
        (anywhere in the line). Otherwise keep only the first line whose first
        token is exactly ``hashtag`` and stop reading.

    Returns
    -------
    pandas.DataFrame
        One row per kept line. Shorter rows are right-padded with ``np.nan``
        via ``pad`` so that all rows have the same width. An empty DataFrame is
        returned when no line matches (previously this raised ``ValueError``
        from ``max()`` on an empty list).
    """
    substring_match = (extended == True)
    mat = []
    with open(file_path,"r") as f:
        try:
            for line in f:
                if substring_match:
                    if hashtag in line:
                        mat.append(line.split(" "))
                else:
                    tokens = line.split(" ")
                    if hashtag == tokens[0]:
                        mat.append(tokens)
                        break  # exact lookup: only the first matching line is wanted
        except (IOError, UnicodeError) as exc:
            # Keep whatever was collected so far, but report the failure.
            print("Warning: stopped reading %s early: %s" % (file_path, exc))

    if not mat:
        return pd.DataFrame()

    width = max(len(row) for row in mat)
    mat = np.array([np.array(pad(row, width)) for row in mat])
    return pd.DataFrame(mat)

def pad(list_,length):
    """Return ``list_`` cut to at most ``length`` items and right-filled with ``np.nan``.

    The result has exactly ``length`` items when ``length`` is non-negative:
    longer inputs are truncated, shorter ones get ``np.nan`` appended.
    """
    head = list_[:length]
    missing = length - len(list_)
    filler = [np.nan for _ in range(missing)]  # empty when nothing is missing
    return head + filler

def find_infected_vertex(df):
    """Return the distinct user ids found in every column of ``df`` except the first.

    Each column is parsed with ``split_clean``; parsed cells are two-item lists
    whose second item is kept as the user id, while padding cells (not lists)
    are ignored.
    """
    user_ids = []
    for col in df.columns[1:]:
        for cell in split_clean(df[col]):
            if isinstance(cell, list):
                user_ids.append(cell[1])  # keep only the user id
    return list(set(user_ids))

def split_clean(list_):
    """Parse an iterable of ``"a,b"`` strings into ``[int(a), int(b)]`` pairs.

    Entries whose first comma-separated field is the string ``'nan'`` are
    returned as ``np.nan``. A trailing newline on the second field is stripped
    before conversion.
    """
    fields_per_entry = [entry.split(",") for entry in list_]
    parsed = []
    for fields in fields_per_entry:
        if fields[0] == 'nan':
            parsed.append(np.nan)
        else:
            parsed.append([int(fields[0]), int(fields[1].strip('\n'))])
    return parsed

def query_from_vertices(df,liste_):
    """Keep the rows of ``df`` whose ``user_1`` or ``user_2`` value is in ``liste_``."""
    touches_vertex = df['user_1'].isin(liste_) | df["user_2"].isin(liste_)
    return df[touches_vertex]

In [4]:
#Number of early adopters : is the set of distinct adopters in the earliest n tweets of a meme h.
def get_early_adopters(n, df):
    """Return the user id of the first row of each of the first ``n`` tweet columns.

    Column 0 of ``df`` is skipped; columns 1..n are parsed with ``split_clean``
    and the second item of the first parsed cell is taken. The list may contain
    duplicates.
    """
    return [split_clean(df[col])[0][1] for col in df.columns[1:(n+1)]]

#Size of first surface
#The second surface includes uninfected users in the second surface of early adopters,
#characterizing the number of potential adopters within two steps
def get_surface(df,liste_):
    """Return the users that share an edge with ``liste_`` but are not in it.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge list with ``user_1`` and ``user_2`` columns.
    liste_ : iterable
        Users whose neighbourhood is wanted; its items must be hashable.

    Returns
    -------
    list
        Sorted distinct values found in the rows touching ``liste_``, with the
        members of ``liste_`` themselves removed.
    """
    excluded = set(liste_)  # O(1) membership instead of scanning a list per candidate
    touching = df[(df['user_1'].isin(excluded)) | (df["user_2"].isin(excluded))]
    candidates = np.unique(touching.values)
    # drop the nodes that are already in liste_ (the infected ones)
    return [x for x in candidates if x not in excluded]

#Average step distance
def get_average_distance(h):
    """Return the largest average shortest-path length over the components of ``h``.

    For a connected graph this is simply its average shortest-path length.
    Components with a single node have no node pairs and contribute 0.

    Parameters
    ----------
    h : networkx.Graph
        Undirected graph.

    Returns
    -------
    int or float
        0 for a graph with no nodes or only isolated nodes. The previous
        implementation could return an empty list in that situation.
    """
    longest_average = 0
    # connected_components + subgraph replaces connected_component_subgraphs,
    # which no longer exists in recent NetworkX releases.
    for nodes in nx.connected_components(h):
        if len(nodes) < 2:
            continue
        component_average = nx.average_shortest_path_length(h.subgraph(nodes))
        longest_average = max(longest_average, component_average)
    return longest_average

#Diameter
#The diameter is the maximum distance between any two adopters of h within the first n tweets.
def get_diameter(h):
    """Return the largest diameter over the connected components of ``h``.

    For a connected graph this is its diameter. Single-node components
    contribute 0.

    Parameters
    ----------
    h : networkx.Graph
        Undirected graph.

    Returns
    -------
    int
        0 for a graph with no nodes or only isolated nodes.
    """
    widest = 0
    # connected_components + subgraph replaces connected_component_subgraphs,
    # which no longer exists in recent NetworkX releases.
    for nodes in nx.connected_components(h):
        if len(nodes) > 1:
            widest = max(widest, nx.diameter(h.subgraph(nodes)))
    return widest

#Number of infected communities
def get_infected_communities(h):
    """Return the number of communities igraph's Infomap finds in ``h``.

    The NetworkX graph is converted to an igraph graph through its boolean
    adjacency matrix, then clustered with ``community_infomap``.

    Parameters
    ----------
    h : networkx.Graph

    Returns
    -------
    int
        Number of clusters; 0 when ``h`` has no nodes.
    """
    if h.number_of_nodes() == 0:
        return 0
    # to_numpy_matrix is gone from newer NetworkX; prefer to_numpy_array when present.
    if hasattr(nx, "to_numpy_array"):
        adjacency = nx.to_numpy_array(h)
    else:
        adjacency = nx.to_numpy_matrix(h)
    h_igraph = ig.Graph.Adjacency((adjacency > 0).tolist())
    communities = h_igraph.community_infomap()
    # The clustering knows its own size; no need to build every subgraph to count them.
    return len(communities)

#Average step time duration
def get_timestamps(df, n):
    """Return the timestamps found in the first ``n`` tweet columns of ``df``.

    Column 0 is skipped. Columns 1..n are parsed with ``split_clean`` and the
    first item of every parsed cell is collected, column by column; padding
    cells (not lists) are ignored.
    """
    stamps = []
    for col in df.columns[1:(n+1)]:
        for cell in split_clean(df[col]):
            if isinstance(cell, list):
                stamps.append(cell[0])  # keep only the timestamp
    return stamps

In [5]:
def feature_extraction(h, df, df1, n_tweets=None):
    """Compute the early-diffusion features of one hashtag.

    Parameters
    ----------
    h : networkx.Graph
        Follower graph restricted to the users who adopted the hashtag.
    df : pandas.DataFrame
        Timeline row of the hashtag as produced by ``extract_from_hash``.
    df1 : pandas.DataFrame
        Full follower edge list with ``user_1`` / ``user_2`` columns.
    n_tweets : int, optional
        Number of earliest tweets to consider. Defaults to the notebook-level
        variable ``n`` so existing three-argument calls keep working.

    Returns
    -------
    tuple
        ``(early_adopters, first_surface, second_surface, average_distance,
        diameter, infected_communities, average_step_time_duration,
        cv_step_time_duration)``.

    Raises
    ------
    ValueError
        If fewer than 3 tweets are requested (the step-time variance needs at
        least two steps).
    """
    count = n if n_tweets is None else n_tweets
    if count < 3:
        raise ValueError("feature_extraction needs at least 3 tweets, got %s" % count)

    early_adopters_list = get_early_adopters(count, df)
    infected = set(early_adopters_list)
    early_adopters = len(infected)

    first_surface_list = get_surface(df1, early_adopters_list)
    first_surface = len(first_surface_list)

    # Neighbours of the first surface include the early adopters themselves;
    # they are already infected, so they must not be counted as second surface.
    second_surface = len([user for user in get_surface(df1, first_surface_list)
                          if user not in infected])

    # Build the early-adopter subgraph once and reuse it for the three graph features.
    adopters_graph = h.subgraph(early_adopters_list)
    average_distance = get_average_distance(adopters_graph)
    diameter = get_diameter(adopters_graph)
    infected_communities = get_infected_communities(adopters_graph)

    timestamps = get_timestamps(df, count)

    # Mean gap between consecutive tweets: (last - first) / (count - 1).
    # The original `/ n-1` divided by n and then subtracted 1; float() also
    # prevents integer truncation under Python 2.
    average_step_time_duration = float(timestamps[count-1] - timestamps[0]) / (count - 1)

    # Sample variance of the count-1 gaps (hence the count-2 denominator).
    steps = np.diff(np.asarray(timestamps[:count], dtype=float))
    root = np.sum((steps - average_step_time_duration)**2) / (count - 2)
    if average_step_time_duration == 0:
        # All tweets share one timestamp: the coefficient of variation is undefined.
        cv_step_time_duration = np.nan
    else:
        cv_step_time_duration = np.sqrt(root) / average_step_time_duration

    return early_adopters, first_surface, second_surface, average_distance, diameter, infected_communities, average_step_time_duration, cv_step_time_duration

In [6]:
def get_graph(hashtag, followers=None, min_tweets=50):
    """Build the follower graph around the adopters of ``hashtag``.

    Parameters
    ----------
    hashtag : str
        Hashtag looked up (exact match on the first token) in the timeline file.
    followers : pandas.DataFrame, optional
        Already-loaded follower edge list with ``user_1`` / ``user_2`` columns.
        When omitted the edge list is read from disk, as before; passing it
        avoids re-reading the whole file for every hashtag.
    min_tweets : int, default 50
        The timeline row must have more than this many columns (hashtag column
        included) for a graph to be built.

    Returns
    -------
    tuple
        ``(graph, timeline_df)`` when the hashtag has enough tweets, otherwise
        ``(column_count, 0)`` where ``column_count`` is an int. Callers tell the
        two cases apart by the type of the first item.
    """
    df = extract_from_hash(hashtag,"../../../virality2013/timeline_tag.anony.dat",extended = False)

    column_count = df.shape[1]
    if column_count <= min_tweets:
        return column_count, 0

    if followers is None:
        followers = pd.read_csv('../../../virality2013/follower_gcc.anony.dat',sep=' ',header=None,names=["user_1","user_2"])
    edges = query_from_vertices(followers, find_infected_vertex(df))

    # from_pandas_dataframe was renamed from_pandas_edgelist in newer NetworkX.
    if hasattr(nx, "from_pandas_edgelist"):
        h = nx.from_pandas_edgelist(edges, source="user_1", target="user_2")
    else:
        h = nx.from_pandas_dataframe(edges, source="user_1", target="user_2")
    del edges

    # Both constructors build an undirected nx.Graph by default, so the extra
    # to_undirected() copy of the original is not needed.
    return h, df

In [7]:
#Parameters
n = 50  # number of earliest tweets used by feature_extraction (read as a global)


# One row per processed hashtag: [hashtag, <the eight extracted features>]
dataset = []
count = len(ht_df['hashtag'])
df1 = pd.read_csv('../../../virality2013/follower_gcc.anony.dat',sep=' ',header=None,names=["user_1","user_2"])
for hashtag in ht_df['hashtag']:
    count = count - 1
    h, df = get_graph(hashtag)
    if type(h) == int:
        # get_graph returned a column count instead of a graph
        print("Not enough tweets, only", h, "tweets")
        continue
    (early_adopters, first_surface, second_surface, average_distance,
     diameter, infected_communities, average_step_time_duration,
     cv_step_time_duration) = feature_extraction(h, df, df1)
    dataset.append([hashtag, early_adopters, first_surface, second_surface, average_distance, diameter, infected_communities, average_step_time_duration, cv_step_time_duration])
    print(hashtag, " is done", count, "more tweets to go")

('oomf', ' is done', 180094, 'more tweets to go')
('teamfollowback', ' is done', 180093, 'more tweets to go')
('bahrain', ' is done', 180092, 'more tweets to go')
('thoughtsduringschool', ' is done', 180091, 'more tweets to go')
('yolo', ' is done', 180090, 'more tweets to go')
('dearoomf', ' is done', 180089, 'more tweets to go')
('retweet', ' is done', 180088, 'more tweets to go')
('taurus', ' is done', 180087, 'more tweets to go')
('followback', ' is done', 180086, 'more tweets to go')
('winning', ' is done', 180085, 'more tweets to go')
('wheniwaslittle', ' is done', 180084, 'more tweets to go')
('capricorn', ' is done', 180083, 'more tweets to go')
('gemini', ' is done', 180082, 'more tweets to go')
('cancer', ' is done', 180081, 'more tweets to go')
('shoutout', ' is done', 180080, 'more tweets to go')
('nowplaying', ' is done', 180079, 'more tweets to go')
('aries', ' is done', 180078, 'more tweets to go')
('imagine', ' is done', 180077, 'more tweets to go')
('pisces', ' is done

KeyboardInterrupt: 

In [8]:
# Number of hashtags whose features were collected before the loop above was interrupted
len(dataset)

201

In [9]:
import pickle

# Persist the collected feature rows so the long extraction loop need not be re-run.
with open('features_dataset.pkl', 'wb') as f:
    f.write(pickle.dumps(dataset))

In [10]:
#to load the data:
# NOTE: unpickling executes code from the file; only load a pickle produced by this notebook.

import pickle
with open('features_dataset.pkl', 'rb') as f:
    new_dataset = pickle.loads(f.read())

In [11]:
# First 200 hashtags, in descending order of count_adopters
ht_df['hashtag'].iloc[:200]

0                                oomf
1                      teamfollowback
2                             bahrain
3                thoughtsduringschool
4                                yolo
5                            dearoomf
6                             retweet
7                              taurus
8                          followback
9                             winning
10                     wheniwaslittle
11                          capricorn
12                             gemini
13                             cancer
14                           shoutout
15                         nowplaying
16                              aries
17                            imagine
18                             pisces
19                      20thingsilove
20                        sagittarius
21                              virgo
22                         dailytweet
23                          boyfriend
24                            scorpio
25                             follow
26          